Completed
Push — master ( 1877f4...a8999e )
by Osma
17s queued 10s
created

DocumentToSubjectCorpusMixin.subjects()   A

Complexity

Conditions 2

Size

Total Lines 5
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 5
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""Mixin classes for converting between SubjectCorpus and DocumentCorpus"""
2
3
import collections
4
import os.path
5
import tempfile
6
from .types import Document, DocumentCorpus, SubjectCorpus
7
8
9
class DocumentToSubjectCorpusMixin(SubjectCorpus):
    """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus

    The subject view is built lazily on first access to ``subjects``: one
    text file per subject is written into a temporary directory, and that
    directory is then wrapped in a ``SubjectDirectory``.

    NOTE(review): relies on ``self.documents`` and ``self._subject_index``,
    neither of which is defined here -- presumably supplied by the class
    this mixin is combined with; confirm against the concrete corpora.
    """

    # SubjectDirectory over the generated files; None until first built.
    _subject_corpus = None
    # tempfile.TemporaryDirectory that holds the per-subject files.
    _temp_directory = None

    @property
    def subjects(self):
        """Return the subjects of the generated corpus, building it first
        if that has not been done yet."""
        corpus = self._subject_corpus
        if corpus is None:
            self._generate_corpus_from_documents()
            corpus = self._subject_corpus
        return corpus.subjects

    def _subject_filename(self, subject_id):
        """Return the path of the file for subject_id inside the temporary
        directory (zero-padded 8-digit id plus a ``.txt`` suffix)."""
        basename = format(subject_id, '08d') + '.txt'
        return os.path.join(self._temp_directory.name, basename)

    def _create_subject(self, subject_id, uri, label):
        """Create (or truncate) the file for subject_id and write its
        header line, consisting of the URI and label separated by a
        space."""
        path = self._subject_filename(subject_id)
        with open(path, 'w', encoding='utf-8') as out:
            out.write("{} {}".format(uri, label) + "\n")

    def _add_text_to_subject(self, subject_id, text):
        """Append text, followed by a newline, to the file of
        subject_id."""
        path = self._subject_filename(subject_id)
        with open(path, 'a', encoding='utf-8') as out:
            print(text, file=out)

    def _generate_corpus_from_documents(self):
        """Write all subject files into a fresh temporary directory and
        wrap that directory in a SubjectDirectory."""
        self._temp_directory = tempfile.TemporaryDirectory()

        # First pass: one file per subject, containing just its header.
        for subject_id, (uri, label) in enumerate(self._subject_index):
            self._create_subject(subject_id, uri, label)

        # Second pass: append each document's text to every subject it is
        # tagged with; URIs for which the index returns None are skipped.
        for document in self.documents:
            for subject_uri in document.uris:
                subject_id = self._subject_index.by_uri(subject_uri)
                if subject_id is not None:
                    self._add_text_to_subject(subject_id, document.text)

        # Imported here rather than at module level, as in the original.
        from .subject import SubjectDirectory
        self._subject_corpus = SubjectDirectory(self._temp_directory.name)
51
52
53
class SubjectToDocumentCorpusMixin(DocumentCorpus):
    """Mixin class for enabling a SubjectCorpus to act as a DocumentCorpus

    Every line of a subject's text is treated as the text of one document.
    Identical lines found under several subjects are merged into a single
    document carrying the URIs and labels of all those subjects.

    NOTE(review): relies on ``self.subjects``, which is not defined here --
    presumably supplied by the class this mixin is combined with.
    """

    # Maps document text -> set of subject URIs. Doubles as the "already
    # built" marker: it stays None until the index was built successfully.
    _document_uris = None
    # Maps document text -> set of subject labels; same keys as above.
    _document_labels = None

    @property
    def documents(self):
        """Yield one Document per distinct text line, building the index
        from the subjects on first use."""
        if self._document_uris is None:
            self._generate_corpus_from_subjects()
        for text, uris in self._document_uris.items():
            labels = self._document_labels[text]
            yield Document(text=text, uris=uris, labels=labels)

    def _generate_corpus_from_subjects(self):
        """Build the text -> URIs and text -> labels indexes from
        ``self.subjects``.

        The indexes are collected in local variables and stored on the
        instance only once every subject has been processed. If reading
        the subjects raises partway through, ``_document_uris`` remains
        None, so the next access to ``documents`` retries instead of
        silently serving a partially built corpus.
        """
        document_uris = collections.defaultdict(set)
        document_labels = collections.defaultdict(set)
        for subj in self.subjects:
            for line in subj.text.splitlines():
                document_uris[line].add(subj.uri)
                document_labels[line].add(subj.label)
        # Publish the labels first: _document_uris is the marker that
        # ``documents`` checks, so it must be assigned last.
        self._document_labels = document_labels
        self._document_uris = document_uris
74