Completed
Push — master ( 244db9...8e90e2 )
by Osma
26s queued 11s
created

annif.corpus.convert.SubjectWriter.__init__()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nop 4
1
"""Mixin classes for converting between SubjectCorpus and DocumentCorpus"""
2
3
import collections
4
import os.path
5
import tempfile
6
from .types import Document, DocumentCorpus, SubjectCorpus
7
8
9
class SubjectWriter:
    """Buffered writer for one subject file of a SubjectDirectory.

    Lines are collected in memory and written out in batches, so the file
    is opened once per batch rather than once per line.
    """

    # Lines waiting to be written; replaced by a per-instance list in
    # __init__.
    _buffer = None

    # Number of pending lines at which write() triggers a flush.
    BUFFER_SIZE = 100

    def __init__(self, path, uri, label):
        """Store the target path and queue the "<uri> <label>" header line.

        Nothing is written to disk here; the header line goes out with the
        first flush.
        """
        self._path = path
        self._created = False
        self._buffer = ["{} {}".format(uri, label)]

    def _flush(self):
        """Write every buffered line to the file, then empty the buffer.

        The first flush opens the file in 'w' mode; all later flushes
        open it in 'a' mode so earlier batches are kept.
        """
        mode = 'a' if self._created else 'w'
        with open(self._path, mode, encoding='utf-8') as subjfile:
            for line in self._buffer:
                print(line, file=subjfile)
        self._created = True
        self._buffer = []

    def write(self, text):
        """Queue one line; flush once BUFFER_SIZE lines are pending."""
        pending = self._buffer
        pending.append(text)
        if len(pending) < self.BUFFER_SIZE:
            return
        self._flush()

    def close(self):
        """Flush whatever is currently buffered to the file."""
        self._flush()
41
42
43
class DocumentToSubjectCorpusMixin(SubjectCorpus):
    """Mixin that lets a DocumentCorpus also be used as a SubjectCorpus.

    On first access to ``subjects`` the documents are regrouped into one
    text file per subject inside a temporary directory, which is then
    read back through a SubjectDirectory.
    """

    # Cached SubjectDirectory built over the temporary directory.
    _subject_corpus = None
    # tempfile.TemporaryDirectory holding the generated subject files.
    _temp_directory = None
    # Mapping of subject ID -> SubjectWriter for that subject's file.
    _subject_writer = None

    @property
    def subjects(self):
        """Subjects of the generated corpus, built on first access."""
        if self._subject_corpus is None:
            self._generate_corpus_from_documents()
        return self._subject_corpus.subjects

    def _subject_filename(self, subject_id):
        """Return the path of the file for the given numeric subject ID."""
        return os.path.join(self._temp_directory.name,
                            '{:08d}.txt'.format(subject_id))

    def _create_subject(self, subject_id, uri, label):
        """Register a SubjectWriter for the subject under its ID."""
        writer = SubjectWriter(
            self._subject_filename(subject_id), uri, label)
        self._subject_writer[subject_id] = writer

    def _add_text_to_subject(self, subject_id, text):
        """Hand one document text to the writer of the given subject."""
        writer = self._subject_writer[subject_id]
        writer.write(text)

    def _generate_corpus_from_documents(self):
        """Build the subject files from ``self.documents``.

        Relies on ``self._subject_index``, which is not set in this class
        and must be provided elsewhere: iterating it yields (uri, label)
        pairs and ``by_uri`` maps a URI to a subject ID or None.
        """
        self._temp_directory = tempfile.TemporaryDirectory()
        self._subject_writer = {}

        # One writer per subject in the index, keyed by enumeration order.
        for subject_id, (uri, label) in enumerate(self._subject_index):
            self._create_subject(subject_id, uri, label)

        # Add each document's text to every subject it refers to; URIs
        # that the index does not resolve are ignored.
        for doc in self.documents:
            for uri in doc.uris:
                subject_id = self._subject_index.by_uri(uri)
                if subject_id is not None:
                    self._add_text_to_subject(subject_id, doc.text)

        # Flush the remaining buffered lines of every subject.
        for subject_id, _ in enumerate(self._subject_index):
            self._subject_writer[subject_id].close()

        # Imported here rather than at module level (presumably to avoid
        # a circular import -- TODO confirm).
        from .subject import SubjectDirectory
        self._subject_corpus = SubjectDirectory(self._temp_directory.name)
87
88
89
class SubjectToDocumentCorpusMixin(DocumentCorpus):
    """Mixin that lets a SubjectCorpus also be used as a DocumentCorpus.

    Every distinct line of subject text becomes one document, tagged
    with the URIs and labels of all subjects whose text contains it.
    """

    # Mapping of text line -> set of subject URIs; None until generated.
    _document_uris = None
    # Mapping of text line -> set of subject labels; None until generated.
    _document_labels = None

    @property
    def documents(self):
        """Yield one Document per distinct text line of the subjects."""
        if self._document_uris is None:
            self._generate_corpus_from_subjects()
        labels_by_text = self._document_labels
        for text, uris in self._document_uris.items():
            yield Document(text=text, uris=uris,
                           labels=labels_by_text[text])

    def _generate_corpus_from_subjects(self):
        """Index the lines of every subject's text by URI and by label."""
        uris_by_text = self._document_uris = collections.defaultdict(set)
        labels_by_text = self._document_labels = \
            collections.defaultdict(set)
        for subj in self.subjects:
            for line in subj.text.splitlines():
                uris_by_text[line].add(subj.uri)
                labels_by_text[line].add(subj.label)
110