Passed
Pull Request — master (#496)
by
unknown
02:17
created

TransformingDocumentCorpus.documents()   A

Complexity

Conditions 2

Size

Total Lines 7
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 7
nop 1
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
"""Classes for supporting document corpora"""
2
3
import glob
4
import os.path
5
import re
6
import gzip
7
import annif.util
8
from itertools import islice
9
from .types import DocumentCorpus
10
from .subject import SubjectSet
11
12
# Reuse the logger object exposed by the annif package for this module.
logger = annif.logger
13
14
15
class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file directly inside the directory is a document. Its
    subjects are looked up in a file with the same name but a ``.tsv``
    extension or, if that does not exist, a ``.key`` extension.
    """

    def __init__(self, path, require_subjects=False):
        """Store the directory path and the subject file policy.

        path -- directory that is searched for ``*.txt`` files
        require_subjects -- if True, documents without a subject file are
            left out when iterating
        """
        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If there is no key file and
        require_subjects is False, the subjectfile will be returned as None."""

        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            # A .tsv subject file takes precedence over a .key file.
            tsvfilename = re.sub(r'\.txt$', '.tsv', filename)
            if os.path.exists(tsvfilename):
                yield (filename, tsvfilename)
                continue
            keyfilename = re.sub(r'\.txt$', '.key', filename)
            if os.path.exists(keyfilename):
                yield (filename, keyfilename)
                continue
            if not self.require_subjects:
                yield (filename, None)

    @property
    def documents(self):
        """Yield one document per (docfile, subjectfile) pair of the
        directory, built with self._create_document().

        The document text is read as UTF-8 (a leading BOM is skipped and
        undecodable bytes are replaced). Subjects are parsed from the
        subject file with SubjectSet.from_string(). A document that has no
        subject file is yielded with empty uris and labels.
        """
        for docfilename, keyfilename in self:
            with open(docfilename, errors='replace',
                      encoding='utf-8-sig') as docfile:
                text = docfile.read()
            if keyfilename is None:
                # __iter__ yields None as the subject file when none exists
                # and require_subjects is False; open(None) would raise
                # TypeError, so emit the document without subjects instead.
                yield self._create_document(text=text, uris=[], labels=[])
                continue
            with open(keyfilename, encoding='utf-8-sig') as keyfile:
                subjects = SubjectSet.from_string(keyfile.read())
            yield self._create_document(text=text,
                                        uris=subjects.subject_uris,
                                        labels=subjects.subject_labels)
50
51
52
class DocumentFile(DocumentCorpus):
    """A corpus of documents with subjects, read from a TSV file.

    Each line holds the document text, a tab character and the subject
    URIs separated by whitespace. A path ending in ``.gz`` is opened with
    gzip.
    """

    def __init__(self, path):
        """Remember the path of the TSV file; nothing is read yet."""
        self.path = path

    @property
    def documents(self):
        """Yield a document for every line of the file that contains a
        tab, reading the file as UTF-8 text (a leading BOM is skipped)."""
        open_fn = gzip.open if self.path.endswith('.gz') else open
        with open_fn(self.path, mode='rt', encoding='utf-8-sig') as infile:
            for row in infile:
                for document in self._parse_tsv_line(row):
                    yield document

    def _parse_tsv_line(self, line):
        """Yield the document described by one TSV line.

        A line without a tab yields nothing and is reported with a
        warning. Otherwise the part before the first tab is the text and
        the rest is split on whitespace into URIs, each of which is passed
        through annif.util.cleanup_uri().
        """
        if '\t' not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"',
                           line.rstrip())
            return

        text, uris = line.split('\t', maxsplit=1)
        subjects = []
        for uri in uris.split():
            subjects.append(annif.util.cleanup_uri(uri))
        yield self._create_document(text=text, uris=subjects, labels=[])
79
80
81
class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        """Keep a reference to the given iterable of documents."""
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped documents one by one, in iteration order."""
        for document in self._documents:
            yield document
91
92
93
class TransformingDocumentCorpus(DocumentCorpus):
    """A wrapper around another document corpus that passes the text of
    every document through a transform function"""

    def __init__(self, corpus, transform_fn):
        """Store the wrapped corpus and the function applied to each text.

        corpus -- object whose ``documents`` attribute is iterated
        transform_fn -- callable that receives a document text and returns
            the text to use instead
        """
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Yield a new document for each document of the wrapped corpus,
        with transformed text and the original uris and labels."""
        for original in self._orig_corpus.documents:
            new_text = self._transform_fn(original.text)
            yield self._create_document(text=new_text,
                                        uris=original.uris,
                                        labels=original.labels)
108
109
110
class LimitingDocumentCorpus(DocumentCorpus):
    """A wrapper around another document corpus that yields at most a
    given number of its documents"""

    def __init__(self, corpus, docs_limit):
        """Store the wrapped corpus and the maximum number of documents.

        corpus -- object whose ``documents`` attribute is iterated
        docs_limit -- stop value handed to itertools.islice()
        """
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Yield new documents copied from the first ``docs_limit``
        documents of the wrapped corpus."""
        limited = islice(self._orig_corpus.documents, self.docs_limit)
        for original in limited:
            yield self._create_document(text=original.text,
                                        uris=original.uris,
                                        labels=original.labels)
124