Passed
Pull Request — master (#452)
by
unknown
02:08
created

TruncatingDocumentCorpus.documents()   A

Complexity

Conditions 2

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""Classes for supporting document corpora"""
2
3
import glob
4
import os.path
5
import re
6
import gzip
7
import annif.util
8
from .types import DocumentCorpus
9
from .subject import SubjectSet
10
11
# Reuse the logger object exposed by the annif package for messages emitted
# from this module (e.g. warnings about malformed corpus lines).
logger = annif.logger
12
13
14
class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus

    Every ``*.txt`` file directly inside the directory is a document. Its
    subjects are looked up in a file with the same name but a ``.tsv`` or
    ``.key`` extension; ``.tsv`` wins when both exist."""

    def __init__(self, path, require_subjects=False):
        """Remember the directory and the policy for missing subject files.

        path -- directory that is searched for ``*.txt`` files
        require_subjects -- when true, documents that have neither a
            ``.tsv`` nor a ``.key`` file are left out of the iteration"""

        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If there is no key file and
        require_subjects is False, the subjectfile will be returned as None."""

        # sorted() makes the iteration order independent of the order in
        # which the file system happens to list the directory.
        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            tsvfilename = re.sub(r'\.txt$', '.tsv', filename)
            if os.path.exists(tsvfilename):
                yield (filename, tsvfilename)
                continue
            keyfilename = re.sub(r'\.txt$', '.key', filename)
            if os.path.exists(keyfilename):
                yield (filename, keyfilename)
                continue
            if not self.require_subjects:
                yield (filename, None)

    @property
    def documents(self):
        """Yield one document per text file, built with _create_document.

        The text is decoded as UTF-8 (a leading BOM is dropped, undecodable
        bytes are replaced). Subjects are parsed from the subject file with
        SubjectSet.from_string. A document that has no subject file - only
        possible when require_subjects is False - is yielded with empty
        subject lists instead of failing on open(None)."""

        for docfilename, keyfilename in self:
            with open(docfilename, errors='replace',
                      encoding='utf-8-sig') as docfile:
                text = docfile.read()
            if keyfilename is None:
                # __iter__ reports a missing subject file as None; there is
                # nothing to open, so the document carries no subjects.
                # NOTE(review): assumes _create_document accepts empty
                # lists here, as it does for labels=[] in DocumentFile.
                uris, labels = [], []
            else:
                with open(keyfilename, encoding='utf-8-sig') as keyfile:
                    subjects = SubjectSet.from_string(keyfile.read())
                uris = subjects.subject_uris
                labels = subjects.subject_labels
            yield self._create_document(text=text,
                                        uris=uris,
                                        labels=labels)
49
50
51
class DocumentFile(DocumentCorpus):
    """A corpus of documents with subjects, read from a TSV file.

    Each line holds the document text, a tab, and a whitespace-separated
    list of subject URIs. Paths ending in ``.gz`` are read through gzip."""

    def __init__(self, path):
        """Remember the path of the TSV file; nothing is read yet."""

        self.path = path

    @property
    def documents(self):
        """Yield a document for every well-formed line of the file."""

        # Pick the opener by file extension; both accept the same arguments.
        open_fn = gzip.open if self.path.endswith('.gz') else open
        with open_fn(self.path, mode='rt', encoding='utf-8-sig') as handle:
            for row in handle:
                yield from self._parse_tsv_line(row)

    def _parse_tsv_line(self, line):
        """Yield the document described by one TSV line.

        A line without a tab yields nothing and is reported with a
        warning."""

        if '\t' not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"',
                           line.rstrip())
            return

        # Only the first tab separates text from subjects.
        text, uri_field = line.split('\t', maxsplit=1)
        cleaned_uris = [annif.util.cleanup_uri(raw_uri)
                        for raw_uri in uri_field.split()]
        yield self._create_document(text=text,
                                    uris=cleaned_uris,
                                    labels=[])
78
79
80
class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        """Keep a reference to the given iterable of documents."""

        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped documents one by one, in their stored order."""

        for document in self._documents:
            yield document
90
91
92
class TruncatingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another document corpus but truncates the
    documents to a given length"""

    def __init__(self, corpus, limit):
        """Wrap a corpus.

        corpus -- object whose ``documents`` attribute provides the
            documents to truncate
        limit -- upper bound used as the slice end for each document text"""

        # Keep the corpus itself instead of corpus.documents: that attribute
        # may be a one-shot generator, which would leave this wrapper empty
        # on every pass after the first.
        self._corpus = corpus
        self._limit = limit

    @property
    def documents(self):
        """Yield a copy of each wrapped document with its text cut to the
        limit; uris and labels are passed through unchanged."""

        # Ask the wrapped corpus for its documents on every access so the
        # wrapper can be iterated as often as the wrapped corpus can.
        for doc in self._corpus.documents:
            yield self._create_document(text=doc.text[:self._limit],
                                        uris=doc.uris,
                                        labels=doc.labels)
106