|
1
|
|
|
"""Mixin classes for converting between SubjectCorpus and DocumentCorpus""" |
|
2
|
|
|
|
|
3
|
|
|
import collections |
|
4
|
|
|
import os.path |
|
5
|
|
|
import tempfile |
|
6
|
|
|
from .types import Document, DocumentCorpus, SubjectCorpus |
|
7
|
|
|
|
|
8
|
|
|
|
|
9
|
|
|
class SubjectWriter:
    """Writes a single subject file into a SubjectDirectory, performing
    buffering to limit the number of I/O operations.

    The first line of the file is ``"<uri> <label>"``; every following line
    is a text passed to :meth:`write`. Lines are collected in memory and
    written out once ``BUFFER_SIZE`` of them have accumulated, or when
    :meth:`close` is called. The first flush opens the file in ``'w'`` mode;
    later flushes open it in ``'a'`` mode.

    Instances may also be used as context managers, in which case
    :meth:`close` is called when the ``with`` block exits.
    """

    _buffer = None

    # Number of buffered lines that triggers a write to disk.
    BUFFER_SIZE = 100

    def __init__(self, path, uri, label):
        """Prepare a writer for the file at *path*.

        Nothing is written to disk here; the header line built from *uri*
        and *label* is queued as the first buffered line.
        """
        self._path = path
        self._buffer = ["{} {}".format(uri, label)]
        self._created = False

    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Flush pending lines on exit; exceptions are not suppressed."""
        self.close()
        return False

    def _flush(self):
        """Write all buffered lines to the file and empty the buffer."""
        if self._created and not self._buffer:
            # Nothing new to append: skip opening the file altogether.
            # The very first flush always runs so that the file gets created.
            return

        mode = 'a' if self._created else 'w'
        with open(self._path, mode, encoding='utf-8') as subjfile:
            # One batched call instead of one print() per buffered line.
            subjfile.writelines(
                "{}\n".format(text) for text in self._buffer)
        self._buffer = []
        self._created = True

    def write(self, text):
        """Queue *text* as one line, flushing when the buffer is full."""
        self._buffer.append(text)
        if len(self._buffer) >= self.BUFFER_SIZE:
            self._flush()

    def close(self):
        """Write any lines still buffered. Safe to call more than once."""
        self._flush()
|
41
|
|
|
|
|
42
|
|
|
|
|
43
|
|
|
class DocumentToSubjectCorpusMixin(SubjectCorpus):
    """Mixin that lets a DocumentCorpus be consumed as a SubjectCorpus.

    The subject view is built lazily: the first access to ``subjects``
    writes one file per subject into a temporary directory and opens a
    SubjectDirectory on it. The class reads ``self.documents`` and
    ``self._subject_index``, which must be supplied by the host class.
    """

    _subject_corpus = None
    _temp_directory = None
    _subject_writer = None

    @property
    def subjects(self):
        """Subjects of the generated corpus, generating it if necessary."""
        not_generated_yet = self._subject_corpus is None
        if not_generated_yet:
            self._generate_corpus_from_documents()
        return self._subject_corpus.subjects

    def _subject_filename(self, subject_id):
        """Return the path of the file for *subject_id* in the temp dir."""
        basename = '{:08d}.txt'.format(subject_id)
        directory = self._temp_directory.name
        return os.path.join(directory, basename)

    def _create_subject(self, subject_id, uri, label):
        """Register a SubjectWriter for *subject_id* under its file name."""
        writer = SubjectWriter(
            self._subject_filename(subject_id), uri, label)
        self._subject_writer[subject_id] = writer

    def _add_text_to_subject(self, subject_id, text):
        """Hand *text* to the writer registered for *subject_id*."""
        writer = self._subject_writer[subject_id]
        writer.write(text)

    def _generate_corpus_from_documents(self):
        """Write every subject file and open the result as a corpus."""
        self._temp_directory = tempfile.TemporaryDirectory()
        self._subject_writer = {}

        # One writer per entry of the subject index; the position of the
        # entry in the index serves as the subject id.
        for subject_id, (uri, label) in enumerate(self._subject_index):
            self._create_subject(subject_id, uri, label)

        # Route each document's text to every subject it is tagged with.
        # URIs for which the index lookup returns None are skipped.
        for doc in self.documents:
            for uri in doc.uris:
                subject_id = self._subject_index.by_uri(uri)
                if subject_id is not None:
                    self._add_text_to_subject(subject_id, doc.text)

        # Flush whatever is still buffered in each writer.
        for subject_id, _entry in enumerate(self._subject_index):
            self._subject_writer[subject_id].close()

        # Imported here rather than at module level -- presumably to avoid
        # a circular import; confirm before moving it.
        from .subject import SubjectDirectory
        self._subject_corpus = SubjectDirectory(self._temp_directory.name)
|
87
|
|
|
|
|
88
|
|
|
|
|
89
|
|
|
class SubjectToDocumentCorpusMixin(DocumentCorpus):
    """Mixin class for enabling a SubjectCorpus to act as a DocumentCorpus.

    Every distinct line found in the text of any subject becomes one
    document, carrying the set of URIs and the set of labels of all
    subjects whose text contains that line. The class reads
    ``self.subjects``, which must be supplied by the host class.
    """

    _document_uris = None
    _document_labels = None

    @property
    def documents(self):
        """Yield one Document per distinct line of subject text.

        The line-to-subject mappings are generated on first use and
        reused afterwards.
        """
        if self._document_uris is None:
            self._generate_corpus_from_subjects()
        for text, uris in self._document_uris.items():
            labels = self._document_labels[text]
            yield Document(text=text, uris=uris, labels=labels)

    def _generate_corpus_from_subjects(self):
        """Build the line -> URIs and line -> labels mappings."""
        uris_by_line = collections.defaultdict(set)
        labels_by_line = collections.defaultdict(set)
        for subj in self.subjects:
            for line in subj.text.splitlines():
                uris_by_line[line].add(subj.uri)
                labels_by_line[line].add(subj.label)

        # Publish only after a complete pass over the subjects. If reading
        # them raises part-way, `_document_uris` stays None, so the next
        # access to `documents` retries instead of silently yielding a
        # truncated corpus. `_document_uris` is the "already generated"
        # sentinel and is therefore assigned last.
        self._document_labels = labels_by_line
        self._document_uris = uris_by_line
|
110
|
|
|
|