Passed
Pull Request — master (#257)
by Osma
02:44
created

DocumentDirectory.documents()   A

Complexity

Conditions 4

Size

Total Lines 9
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 9
dl 0
loc 9
rs 9.95
c 0
b 0
f 0
cc 4
nop 1
1
"""Clases for supporting document corpora"""
2
3
import glob
import gzip
import logging
import os.path
import re

import annif.util

from .types import Document, DocumentCorpus
from .convert import DocumentToSubjectCorpusMixin
from .subject import SubjectSet
11
12
13
class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file directly inside the directory is treated as a
    document. Its subjects are looked up in a file with the same name but
    a ``.tsv`` or ``.key`` extension; ``.tsv`` wins when both exist.
    """

    def __init__(self, path, require_subjects=False):
        """Create a corpus over the directory at `path`.

        When `require_subjects` is true, iterating over the corpus skips
        text files that have no accompanying subject file.
        """
        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths, in sorted order of docfile.
        If there is no subject file and require_subjects is False, the
        subjectfile will be returned as None; if require_subjects is True
        the document is left out entirely."""

        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            # Probe the candidate subject files in order of precedence.
            for extension in ('.tsv', '.key'):
                subjectfilename = re.sub(r'\.txt$', extension, filename)
                if os.path.exists(subjectfilename):
                    yield (filename, subjectfilename)
                    break
            else:
                # Neither a .tsv nor a .key file exists for this document.
                if not self.require_subjects:
                    yield (filename, None)

    @property
    def documents(self):
        """Yield a Document for each text file that has a subject file.

        The text is read with undecodable bytes replaced rather than
        raising. The subject file contents are handed to
        SubjectSet.from_string, and the resulting subject URIs and labels
        are attached to the Document.

        Text files without a subject file (possible only when
        require_subjects is False) are skipped, since there is no subject
        file to open for them.
        """
        for docfilename, keyfilename in self:
            if keyfilename is None:
                # __iter__ signals "no subject file" with None; passing
                # that to open() would raise TypeError.
                continue
            with open(docfilename, errors='replace') as docfile:
                text = docfile.read()
            with open(keyfilename) as keyfile:
                subjects = SubjectSet.from_string(keyfile.read())
            yield Document(text=text, uris=subjects.subject_uris,
                           labels=subjects.subject_labels)
46
47
48
class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
    """A TSV file as a corpus of documents with subjects.

    Each line holds the document text, a tab character, and then a
    whitespace-separated list of subject URIs. Files whose name ends in
    ``.gz`` are read as gzip-compressed text.
    """

    def __init__(self, path):
        """Create a corpus backed by the TSV file at `path`."""
        self.path = path

    @property
    def documents(self):
        """Yield one Document per well-formed line of the TSV file.

        Each URI is passed through annif.util.cleanup_uri; labels are
        always an empty list. Blank lines are ignored, and other lines
        lacking a tab separator are skipped with a logged warning instead
        of aborting the whole iteration.
        """
        if self.path.endswith('.gz'):
            def opener(path):
                """open a gzip compressed file in text mode"""
                return gzip.open(path, mode='rt')
        else:
            opener = open

        with opener(self.path) as tsvfile:
            for line in tsvfile:
                if '\t' not in line:
                    # Unpacking the split result below would raise
                    # ValueError here; only warn when the line actually
                    # has content.
                    if line.strip():
                        logging.getLogger(__name__).warning(
                            'Skipping invalid line (missing tab): "%s"',
                            line.rstrip())
                    continue
                text, uris = line.split('\t', maxsplit=1)
                subjects = [annif.util.cleanup_uri(uri)
                            for uri in uris.split()]
                yield Document(text=text, uris=subjects, labels=[])
69
70
71
class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
    """A document corpus based on a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        """Wrap `documents`, the iterable to serve as this corpus."""
        self._documents = documents

    @property
    def documents(self):
        """Yield every item of the wrapped iterable, in its own order."""
        for document in self._documents:
            yield document
81