annif.corpus.document — rating B
last analyzed

Complexity

Total Complexity 51

Size/Duplication

Total Lines 281
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 190
dl 0
loc 281
rs 7.92
c 0
b 0
f 0
wmc 51

22 Methods

Rating   Name   Duplication   Size   Complexity  
A DocumentDirectory._read_txt_file() 0 16 5
A DocumentDirectory._get_subject_filename() 0 11 3
A DocumentDirectory.__init__() 0 11 1
A DocumentDirectory.__iter__() 0 10 3
A DocumentDirectory.documents() 0 15 4
A DocumentFileJSONL.documents() 0 17 5
A LimitingDocumentCorpus.documents() 0 4 2
A TransformingDocumentCorpus.documents() 0 4 2
A DocumentFileTSV.__init__() 0 6 1
A DocumentFileJSONL.__init__() 0 11 1
B DocumentFileCSV.documents() 0 23 6
A DocumentFileTSV._parse_tsv_line() 0 15 3
A DocumentList.__init__() 0 2 1
A DocumentFileCSV.__init__() 0 6 1
A DocumentFileCSV._check_fields() 0 6 2
A LimitingDocumentCorpus.__init__() 0 3 1
A DocumentFileTSV.documents() 0 9 4
A DocumentFileCSV._parse_row() 0 15 2
A DocumentFileJSONL.is_jsonl_file() 0 6 1
A TransformingDocumentCorpus.__init__() 0 3 1
A DocumentList.documents() 0 3 1
A DocumentFileCSV.is_csv_file() 0 6 1

How to fix   Complexity   

Complexity

Complex classes like annif.corpus.document often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Classes for supporting document corpora"""
2
3
from __future__ import annotations
4
5
import csv
6
import glob
7
import gzip
8
import os.path
9
import re
10
from itertools import islice
11
from typing import TYPE_CHECKING
12
13
import annif.util
14
from annif.exception import OperationFailedException
15
16
from .json import json_file_to_document, json_to_document
17
from .types import Document, DocumentCorpus, SubjectSet
18
19
if TYPE_CHECKING:
20
    from collections.abc import Iterator
21
22
    from annif.vocab import SubjectIndex
23
24
logger = annif.logger
25
26
27
class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus"""

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[str]:
        """Iterate through the directory, yielding file paths with corpus
        documents: all *.txt files in sorted order, then all *.json files
        in sorted order."""

        for pattern in ("*.txt", "*.json"):
            yield from sorted(glob.glob(os.path.join(self.path, pattern)))

    @staticmethod
    def _get_subject_filename(filename: str) -> str | None:
        """Return the subject file path for the given .txt document,
        preferring a .tsv file over a .key file; None if neither exists."""

        for extension in (".tsv", ".key"):
            candidate = re.sub(r"\.txt$", extension, filename)
            if os.path.exists(candidate):
                return candidate
        return None

    def _read_txt_file(self, filename: str) -> Document | None:
        """Read one text file into a Document, attaching subjects from the
        companion subject file when subjects are required.

        Returns None when subjects are required but no subject file exists."""

        with open(filename, errors="replace", encoding="utf-8-sig") as docfile:
            text = docfile.read()

        if not self.require_subjects:
            return Document(text=text, subject_set=None, file_path=filename)

        subjfilename = self._get_subject_filename(filename)
        if subjfilename is None:
            # subjects required but not found, skipping this docfile
            return None

        with open(subjfilename, encoding="utf-8-sig") as subjfile:
            subjects = SubjectSet.from_string(
                subjfile.read(), self.subject_index, self.language
            )
        return Document(text=text, subject_set=subjects, file_path=filename)

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for each corpus file in the directory,
        dispatching on file extension (.txt vs JSON)."""

        for docfilename in self:
            if docfilename.endswith(".txt"):
                doc = self._read_txt_file(docfilename)
            else:
                doc = json_file_to_document(
                    docfilename,
                    self.subject_index,
                    self.language,
                    self.require_subjects,
                )
            if doc is not None:
                yield doc
97
98
99
class DocumentFileTSV(DocumentCorpus):
    """A TSV file as a corpus of documents with subjects"""

    def __init__(
        self, path: str, subject_index: SubjectIndex, require_subjects=True
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.require_subjects = require_subjects

    @property
    def documents(self) -> Iterator[Document]:
        """Yield Documents parsed line by line from the (optionally
        gzip-compressed) TSV file."""

        # transparently support .gz-compressed files
        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
            for line in tsvfile:
                yield from self._parse_tsv_line(line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Parse one "text<TAB>uri uri ..." line into a Document.

        A line without a tab is skipped with a warning when subjects are
        required, otherwise yielded as a subject-less document."""

        if "\t" not in line:
            if self.require_subjects:
                logger.warning(
                    'Skipping invalid line (missing tab): "%s"', line.rstrip()
                )
            else:
                yield Document(text=line.strip())
            return

        text, uris = line.split("\t", maxsplit=1)
        subject_ids = {
            self.subject_index.by_uri(annif.util.cleanup_uri(uri))
            for uri in uris.split()
        }
        yield Document(text=text, subject_set=SubjectSet(subject_ids))
134
135
136
class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects"""

    def __init__(
        self, path: str, subject_index: SubjectIndex, require_subjects=True
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.require_subjects = require_subjects

    @property
    def documents(self) -> Iterator[Document]:
        """Yield Documents parsed from the (optionally gzip-compressed)
        CSV file.

        Raises OperationFailedException when the header row lacks the
        required columns."""

        # transparently support .gz-compressed files
        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)
            if not self._check_fields(reader):
                # the required-columns part of the message depends on mode
                required = (
                    "the columns 'text' and 'subject_uris'."
                    if self.require_subjects
                    else "the column 'text'."
                )
                raise OperationFailedException(
                    f"Cannot parse CSV file {self.path}. "
                    "The file must have a header row that defines at least "
                    + required
                )
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        """Convert one CSV row into a Document; columns other than 'text'
        and 'subject_uris' become metadata."""

        if self.require_subjects:
            uris = (row["subject_uris"] or "").strip().split()
            subject_ids = {
                self.subject_index.by_uri(annif.util.cleanup_uri(uri))
                for uri in uris
            }
        else:
            subject_ids = set()
        metadata = {
            key: val
            for key, val in row.items()
            if key not in ("text", "subject_uris")
        }
        yield Document(
            text=(row["text"] or ""),
            subject_set=SubjectSet(subject_ids),
            metadata=metadata,
        )

    def _check_fields(self, reader: csv.DictReader) -> bool:
        """Return True if the CSV header defines the columns required in
        the current mode ('text', plus 'subject_uris' when subjects are
        required)."""

        fieldnames = reader.fieldnames
        if fieldnames is None:
            return False
        required = {"text", "subject_uris"} if self.require_subjects else {"text"}
        return required.issubset(fieldnames)

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return path.lower().endswith((".csv", ".csv.gz"))
200
201
202
class DocumentFileJSONL(DocumentCorpus):
    """A JSON Lines file as a corpus of documents with subjects"""

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex,
        language: str,
        require_subjects=True,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    @property
    def documents(self) -> Iterator[Document]:
        """Yield Documents parsed line by line from the (optionally
        gzip-compressed) JSON Lines file; lines that do not parse into a
        document are skipped."""

        # transparently support .gz-compressed files
        # NOTE: plain utf-8 here (not utf-8-sig as in TSV/CSV) — presumably
        # because JSON disallows a BOM
        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8") as jsonlfile:
            for line in jsonlfile:
                doc = json_to_document(
                    self.path,
                    line,
                    self.subject_index,
                    self.language,
                    self.require_subjects,
                )
                if doc is not None:
                    yield doc

    @staticmethod
    def is_jsonl_file(path: str) -> bool:
        """return True if the path looks like a JSONL file"""

        return path.lower().endswith((".jsonl", ".jsonl.gz"))
241
242
243
class DocumentList(DocumentCorpus):
    """A document corpus based on a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        # any iterable of Document objects
        self._documents = documents

    @property
    def documents(self):
        for document in self._documents:
            yield document
253
254
255
class TransformingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another document corpus but transforms the
    documents using a given transform function"""

    def __init__(self, corpus, transform_fn):
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        # lazily apply the transform to each document of the wrapped corpus
        yield from map(self._transform_fn, self._orig_corpus.documents)
267
268
269
class LimitingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another document corpus but limits the
    number of documents to a given limit"""

    def __init__(self, corpus, docs_limit):
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        # stop after docs_limit documents from the wrapped corpus
        yield from islice(self._orig_corpus.documents, self.docs_limit)
281