Passed
Pull Request — master (#663)
by Juho
03:06
created

annif.corpus.document.DocumentList.documents()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 3
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""Clases for supporting document corpora"""
2
3
import glob
4
import gzip
5
import os.path
6
import re
7
from itertools import islice
8
9
import annif.util
10
11
from .subject import SubjectSet
12
from .types import Document, DocumentCorpus
13
14
# Reuse the logger object exposed by the annif package for this module's
# log messages.
logger = annif.logger
15
16
17
class DocumentDirectory(DocumentCorpus):
    """Full text document corpus backed by a directory of ``*.txt`` files.

    Every text file may be accompanied by a subject file that has the same
    name but a ``.tsv`` or ``.key`` extension; ``.tsv`` takes precedence
    when both are present."""

    def __init__(self, path, subject_index, language, require_subjects=False):
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self):
        """Walk the directory in sorted filename order and yield
        (docfile, subjectfile) tuples of file paths. When a text file has
        no subject file, it is yielded with None as the subjectfile if
        require_subjects is False, and skipped altogether otherwise."""

        pattern = os.path.join(self.path, "*.txt")
        for docpath in sorted(glob.glob(pattern)):
            subjectpath = None
            # Probe the candidate subject files in priority order and stop
            # at the first one that exists.
            for extension in (".tsv", ".key"):
                candidate = re.sub(r"\.txt$", extension, docpath)
                if os.path.exists(candidate):
                    subjectpath = candidate
                    break
            if subjectpath is not None or not self.require_subjects:
                yield (docpath, subjectpath)

    @property
    def documents(self):
        """Generate a Document for each (docfile, subjectfile) pair produced
        by iterating over this corpus. The subject_set is None for documents
        that have no subject file."""

        for docpath, subjectpath in self:
            # Undecodable bytes in the document text are replaced rather
            # than raising; the subject file is decoded strictly.
            with open(docpath, errors="replace", encoding="utf-8-sig") as docfile:
                text = docfile.read()
            if subjectpath is not None:
                with open(subjectpath, encoding="utf-8-sig") as subjectfile:
                    subject_set = SubjectSet.from_string(
                        subjectfile.read(), self.subject_index, self.language
                    )
            else:
                subject_set = None
            yield Document(text=text, subject_set=subject_set)
56
57
58
class DocumentFile(DocumentCorpus):
    """Corpus of documents with subjects, read from a TSV file.

    Each line holds the document text, a tab character, and then a
    whitespace-separated list of subject URIs. Paths ending in ``.gz`` are
    read through gzip."""

    def __init__(self, path, subject_index):
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self):
        """Generate the Document objects parsed from the lines of the file."""

        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
            for line in tsvfile:
                yield from self._parse_tsv_line(line)

    def _parse_tsv_line(self, line):
        """Generate at most one Document from a single line of the file.
        A line without a tab character is logged as a warning and produces
        nothing."""

        if "\t" not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return

        # Only the first tab separates text from URIs; later tabs simply
        # act as whitespace between URIs.
        text, _, uris = line.partition("\t")
        subject_ids = set()
        for uri in uris.split():
            subject_ids.add(self.subject_index.by_uri(annif.util.cleanup_uri(uri)))
        yield Document(text=text, subject_set=SubjectSet(subject_ids))
85
86
87
class DocumentList(DocumentCorpus):
    """A document corpus based on a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        self._documents = documents

    @property
    def documents(self):
        """Generate the documents of the wrapped iterable, in its order."""

        for doc in self._documents:
            yield doc
97
98
99
class TransformingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that passes the text of every
    document through a given transform function"""

    def __init__(self, corpus, transform_fn):
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Generate a new Document for each document of the wrapped corpus,
        with the text transformed and the subject set carried over as is."""

        for original in self._orig_corpus.documents:
            new_text = self._transform_fn(original.text)
            yield Document(text=new_text, subject_set=original.subject_set)
113
114
115
class LimitingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that yields no more than a
    given number of its documents"""

    def __init__(self, corpus, docs_limit):
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Generate at most docs_limit documents from the start of the
        wrapped corpus."""

        yield from islice(self._orig_corpus.documents, self.docs_limit)
127
128
129
class BatchingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that makes it possible to
    iterate over the documents in batches of a given size; each batch is a
    list of documents."""

    def __init__(self, corpus):
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Generate the documents of the wrapped corpus unchanged."""

        for doc in self._orig_corpus.documents:
            yield doc

    def doc_batches(self, batch_size):
        """Generate lists of up to batch_size consecutive documents. The
        last batch may be shorter; no empty batch is ever produced."""

        remaining = iter(self.documents)
        # An empty slice means the underlying iterator is exhausted.
        while chunk := list(islice(remaining, batch_size)):
            yield chunk
147