|
1
|
|
|
"""Mixin classes for converting between SubjectCorpus and DocumentCorpus""" |
|
2
|
|
|
|
|
3
|
|
|
import collections |
|
4
|
|
|
import os.path |
|
5
|
|
|
import tempfile |
|
6
|
|
|
from .types import Document, DocumentCorpus, SubjectCorpus |
|
7
|
|
|
|
|
8
|
|
|
|
|
9
|
|
|
class DocumentToSubjectCorpusMixin(SubjectCorpus):
    """Mixin that lets a DocumentCorpus be consumed as a SubjectCorpus.

    The host class is expected to provide ``documents`` (an iterable of
    objects with ``uris`` and ``text``) and ``_subject_index`` (iterable of
    ``(uri, label)`` pairs that also offers ``by_uri(uri)``, returning a
    subject id or None). On first access to ``subjects`` the documents are
    regrouped into one text file per subject inside a temporary directory,
    and that directory is then wrapped in a ``SubjectDirectory``.
    """

    # Lazily created SubjectDirectory over the temporary directory
    _subject_corpus = None
    # tempfile.TemporaryDirectory holding the per-subject files
    _temp_directory = None

    @property
    def subjects(self):
        """Return the subjects of the generated corpus, building it once."""
        if self._subject_corpus is None:
            self._generate_corpus_from_documents()
        return self._subject_corpus.subjects

    def _subject_filename(self, subject_id):
        """Return the path of the file for the given integer subject id."""
        basename = '{:08d}.txt'.format(subject_id)
        directory = self._temp_directory.name
        return os.path.join(directory, basename)

    def _create_subject(self, subject_id, uri, label):
        """Create (or truncate) the subject file and write its header line.

        The header consists of the URI and the label separated by a space.
        """
        path = self._subject_filename(subject_id)
        header = "{} {}".format(uri, label)
        with open(path, 'w', encoding='utf-8') as subjfile:
            print(header, file=subjfile)

    def _add_text_to_subject(self, subject_id, text):
        """Append the given text, followed by a newline, to a subject file."""
        path = self._subject_filename(subject_id)
        with open(path, 'a', encoding='utf-8') as subjfile:
            print(text, file=subjfile)

    def _generate_corpus_from_documents(self):
        """Write all per-subject files and set up the subject corpus."""
        self._temp_directory = tempfile.TemporaryDirectory()

        # First pass: one file per indexed subject, containing only the
        # "uri label" header line.
        for subject_id, (uri, label) in enumerate(self._subject_index):
            self._create_subject(subject_id, uri, label)

        # Second pass: append each document's text to the file of every
        # subject it is tagged with; URIs unknown to the index are skipped.
        for doc in self.documents:
            for uri in doc.uris:
                subject_id = self._subject_index.by_uri(uri)
                if subject_id is not None:
                    self._add_text_to_subject(subject_id, doc.text)

        # NOTE(review): imported here rather than at module level, presumably
        # to avoid a circular import with .subject -- confirm.
        from .subject import SubjectDirectory
        self._subject_corpus = SubjectDirectory(self._temp_directory.name)
|
51
|
|
|
|
|
52
|
|
|
|
|
53
|
|
|
class SubjectToDocumentCorpusMixin(DocumentCorpus):
    """Mixin class for enabling a SubjectCorpus to act as a DocumentCorpus.

    The host class is expected to provide ``subjects``, an iterable of
    objects exposing ``text``, ``uri`` and ``label``. Every line of a
    subject's text is treated as one document; identical lines occurring
    under several subjects are merged into a single document carrying all
    of those subjects' URIs and labels.
    """

    # Lazily built mappings from document text (one line) to the set of
    # subject URIs / labels under which that line occurs. None means the
    # mappings have not been built yet.
    _document_uris = None
    _document_labels = None

    @property
    def documents(self):
        """Yield one Document per distinct text line found in the subjects.

        The mappings are built on first access. Each yielded Document gets
        the text plus the sets of URIs and labels collected for it.
        """
        if self._document_uris is None:
            self._generate_corpus_from_subjects()
        for text, uris in self._document_uris.items():
            labels = self._document_labels[text]
            yield Document(text=text, uris=uris, labels=labels)

    def _generate_corpus_from_subjects(self):
        """Build the text -> URIs and text -> labels mappings.

        The mappings are collected in local variables and stored on the
        instance only after every subject has been processed. If iterating
        the subjects raises, ``_document_uris`` therefore stays None and a
        later access to ``documents`` rebuilds from scratch instead of
        silently serving a partially populated corpus.
        """
        uris_by_text = collections.defaultdict(set)
        labels_by_text = collections.defaultdict(set)
        for subj in self.subjects:
            for line in subj.text.splitlines():
                uris_by_text[line].add(subj.uri)
                labels_by_text[line].add(subj.label)
        # Publish labels first: _document_uris is the "already built"
        # sentinel checked by documents, so it must be set last.
        self._document_labels = labels_by_text
        self._document_uris = uris_by_text
|
74
|
|
|
|