annif.corpus.document (rating: A)

Complexity

Total Complexity 35

Size/Duplication

Total Lines 191
Duplicated Lines 0 %

Importance

Changes 0

Metric   Value
eloc     129
dl       0
loc      191
rs       9.6
c        0
b        0
f        0
wmc      35

17 Methods

Rating   Name   Duplication   Size   Complexity  
A DocumentFileTSV.__init__() 0 3 1
A DocumentFileCSV.documents() 0 16 5
A DocumentFileTSV._parse_tsv_line() 0 10 2
A DocumentFileCSV.__init__() 0 3 1
A DocumentDirectory.documents() 0 13 5
A DocumentFileTSV.documents() 0 9 4
A DocumentDirectory.__init__() 0 11 1
A DocumentDirectory.__iter__() 0 17 5
A LimitingDocumentCorpus.documents() 0 4 2
A TransformingDocumentCorpus.documents() 0 4 2
A DocumentList.__init__() 0 2 1
A DocumentFileCSV._check_fields() 0 3 1
A LimitingDocumentCorpus.__init__() 0 3 1
A DocumentFileCSV._parse_row() 0 12 1
A TransformingDocumentCorpus.__init__() 0 3 1
A DocumentList.documents() 0 3 1
A DocumentFileCSV.is_csv_file() 0 6 1
"""Classes for supporting document corpora"""

from __future__ import annotations

import csv
import glob
import gzip
import os.path
import re
from itertools import islice
from typing import TYPE_CHECKING

import annif.util
from annif.exception import OperationFailedException

from .types import Document, DocumentCorpus, SubjectSet

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.corpus.subject import SubjectIndex

logger = annif.logger

class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus"""

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]:
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If require_subjects is False, the
        subjectfile will be returned as None."""

        for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
            if self.require_subjects:
                tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
                if os.path.exists(tsvfilename):
                    yield (filename, tsvfilename)
                    continue
                keyfilename = re.sub(r"\.txt$", ".key", filename)
                if os.path.exists(keyfilename):
                    yield (filename, keyfilename)
                    continue
            else:
                yield (filename, None)

    @property
    def documents(self) -> Iterator[Document]:
        for docfilename, subjfilename in self:
            with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
                text = docfile.read()
            if subjfilename is None:
                yield Document(text=text, subject_set=None)
                continue
            with open(subjfilename, encoding="utf-8-sig") as subjfile:
                subjects = SubjectSet.from_string(
                    subjfile.read(), self.subject_index, self.language
                )
            yield Document(text=text, subject_set=subjects)
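For orientation, a minimal usage sketch of DocumentDirectory follows; the ./corpus path, the subject_index object and the printed attributes are illustrative assumptions rather than part of the module.

# Sketch only: ./corpus is assumed to hold doc1.txt + doc1.tsv, doc2.txt + doc2.key, ...
# and subject_index is assumed to be an already loaded SubjectIndex.
corpus = DocumentDirectory(
    "./corpus",
    subject_index=subject_index,
    language="en",
    require_subjects=True,
)
for docfile, subjfile in corpus:  # (text file, subject file) path pairs
    print(docfile, subjfile)
for doc in corpus.documents:  # parsed Document objects
    print(doc.text[:40], doc.subject_set)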
class DocumentFileTSV(DocumentCorpus):
    """A TSV file as a corpus of documents with subjects"""

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        if self.path.endswith(".gz"):
            opener = gzip.open
        else:
            opener = open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
            for line in tsvfile:
                yield from self._parse_tsv_line(line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        if "\t" in line:
            text, uris = line.split("\t", maxsplit=1)
            subject_ids = {
                self.subject_index.by_uri(annif.util.cleanup_uri(uri))
                for uri in uris.split()
            }
            yield Document(text=text, subject_set=SubjectSet(subject_ids))
        else:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
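In the TSV format parsed above, each line holds the document text, a tab, and then whitespace-separated subject URIs (each URI is passed through annif.util.cleanup_uri before lookup); a path ending in .gz is read through gzip.open. A sketch with made-up file contents, again assuming a subject_index instance:

# corpus.tsv (hypothetical contents, \t marks the tab character):
#   A short document about cats\thttp://example.org/subj/cats
#   Another document\thttp://example.org/subj/dogs http://example.org/subj/pets
corpus = DocumentFileTSV("corpus.tsv", subject_index)
for doc in corpus.documents:
    print(doc.text, doc.subject_set)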
class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects"""

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        if self.path.endswith(".gz"):
            opener = gzip.open
        else:
            opener = open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)
            if not self._check_fields(reader):
                raise OperationFailedException(
                    f"Cannot parse CSV file {self.path}. "
                    + "The file must have a header row that defines at least "
                    + "the columns 'text' and 'subject_uris'."
                )
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        subject_ids = {
            self.subject_index.by_uri(annif.util.cleanup_uri(uri))
            for uri in (row["subject_uris"] or "").strip().split()
        }
        metadata = {
            key: val for key, val in row.items() if key not in ("text", "subject_uris")
        }
        yield Document(
            text=(row["text"] or ""),
            subject_set=SubjectSet(subject_ids),
            metadata=metadata,
        )

    def _check_fields(self, reader: csv.DictReader) -> bool:
        fns = reader.fieldnames
        return fns is not None and "text" in fns and "subject_uris" in fns

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        path_lc = path.lower()
        return path_lc.endswith(".csv") or path_lc.endswith(".csv.gz")
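The CSV variant requires a header row with at least the columns 'text' and 'subject_uris'; any further columns are carried along in the Document's metadata dict. Another sketch with made-up contents and an assumed subject_index instance:

# corpus.csv (hypothetical contents):
#   text,subject_uris,title
#   "A short document about cats",http://example.org/subj/cats,Cats
#   "Another document",http://example.org/subj/dogs http://example.org/subj/pets,Pets
if DocumentFileCSV.is_csv_file("corpus.csv"):
    corpus = DocumentFileCSV("corpus.csv", subject_index)
    for doc in corpus.documents:
        print(doc.metadata.get("title"), doc.subject_set)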
class DocumentList(DocumentCorpus):
    """A document corpus based on a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        self._documents = documents

    @property
    def documents(self):
        yield from self._documents
class TransformingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another document corpus but transforms the
    documents using a given transform function"""

    def __init__(self, corpus, transform_fn):
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        for doc in self._orig_corpus.documents:
            yield self._transform_fn(doc)
class LimitingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another document corpus but limits the
    number of documents to a given limit"""

    def __init__(self, corpus, docs_limit):
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        for doc in islice(self._orig_corpus.documents, self.docs_limit):
            yield doc
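The three wrapper classes above compose naturally around any document corpus; the sketch below is illustrative, with the input documents, the transform function and the limit all made up.

# Sketch only: wrap an in-memory list of Document objects, lowercase each text,
# and stop after the first 100 documents.
docs = DocumentList([Document(text="Raw Text", subject_set=None)])
transformed = TransformingDocumentCorpus(
    docs, lambda doc: Document(text=doc.text.lower(), subject_set=doc.subject_set)
)
limited = LimitingDocumentCorpus(transformed, docs_limit=100)
for doc in limited.documents:
    print(doc.text)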
191