Passed
Push — issue868-json-corpus-format ( c8699d )
by Osma
04:45
created

DocumentDirectory.documents()   B

Complexity

Conditions 6

Size

Total Lines 19
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 15
nop 1
dl 0
loc 19
rs 8.6666
c 0
b 0
f 0
1
"""Clases for supporting document corpora"""
2
3
from __future__ import annotations
4
5
import csv
6
import glob
7
import gzip
8
import os.path
9
import re
10
from itertools import islice
11
from typing import TYPE_CHECKING
12
13
import annif.util
14
from annif.exception import OperationFailedException
15
16
from .types import Document, DocumentCorpus, SubjectSet
17
18
if TYPE_CHECKING:
19
    from collections.abc import Iterator
20
21
    from annif.corpus.subject import SubjectIndex
22
23
# Module-level alias for the logger object exposed by the annif package.
logger = annif.logger
24
25
26
class DocumentDirectory(DocumentCorpus):
    """Full text document corpus backed by the ``*.txt`` files of a directory.

    Every ``.txt`` file is one document. When subjects are required, they are
    read from a sibling file that has the same name with the ``.txt`` suffix
    replaced by ``.tsv`` or ``.key``.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        """Store the corpus settings; no files are touched here.

        Args:
            path: directory that is searched for ``*.txt`` files.
            subject_index: passed on to ``SubjectSet.from_string`` when
                subject files are parsed.
            language: passed on to ``SubjectSet.from_string`` when subject
                files are parsed.
            require_subjects: when true, only documents that have a subject
                file are yielded, together with their subjects.
        """
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[str]:
        """Yield the paths of the ``*.txt`` files in the directory, sorted."""
        pattern = os.path.join(self.path, "*.txt")
        yield from sorted(glob.glob(pattern))

    @staticmethod
    def _get_subject_filename(filename: str) -> str | None:
        """Return the subject file that belongs to a document file.

        The ``.tsv`` variant is tried before the ``.key`` variant; the first
        one that exists on disk is returned. Returns None if neither exists.
        """
        for extension in (".tsv", ".key"):
            candidate = re.sub(r"\.txt$", extension, filename)
            if os.path.exists(candidate):
                return candidate
        return None

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for each text file of the directory.

        Without ``require_subjects`` every document is yielded with
        ``subject_set=None`` and subject files are not consulted. With
        ``require_subjects`` a document is yielded only if its subject file
        exists; documents without one are silently skipped.
        """
        for doc_path in self:
            # undecodable bytes in the document text are replaced, not fatal
            with open(doc_path, errors="replace", encoding="utf-8-sig") as handle:
                text = handle.read()

            if self.require_subjects:
                subject_path = self._get_subject_filename(doc_path)
                if subject_path is not None:
                    with open(subject_path, encoding="utf-8-sig") as handle:
                        raw_subjects = handle.read()
                    subjects = SubjectSet.from_string(
                        raw_subjects, self.subject_index, self.language
                    )
                    yield Document(text=text, subject_set=subjects)
                # else: subjects required but not found, skipping this docfile
            else:
                yield Document(text=text, subject_set=None)
78
79
80
class DocumentFileTSV(DocumentCorpus):
    """Corpus of documents with subjects, read from a TSV file.

    Each line holds the document text, a tab, and then whitespace-separated
    subject URIs. A path ending in ``.gz`` is read through gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        """Store the file path and the index used to resolve subject URIs."""
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for every valid line of the file."""
        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as stream:
            for raw_line in stream:
                yield from self._parse_tsv_line(raw_line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Yield the Document described by one line, or nothing if invalid.

        A line without a tab is reported with a warning and skipped.
        """
        if "\t" not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return

        # only the first tab separates the text from the URI list
        text, _, uri_field = line.partition("\t")
        subject_ids = set()
        for raw_uri in uri_field.split():
            subject_ids.add(self.subject_index.by_uri(annif.util.cleanup_uri(raw_uri)))
        yield Document(text=text, subject_set=SubjectSet(subject_ids))
107
108
109
class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects.

    The file must have a header row that names at least the columns ``text``
    and ``subject_uris``; all other columns are passed along as document
    metadata. A path ending in ``.gz`` (in any letter case) is read through
    gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        """Store the file path and the index used to resolve subject URIs."""
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document per data row of the CSV file.

        Raises:
            OperationFailedException: when iteration starts, if the header
                row does not define both ``text`` and ``subject_uris``.
        """
        # Compare case-insensitively so that every path accepted by
        # is_csv_file(), which lowercases the path, gets the matching opener.
        opener = gzip.open if self.path.lower().endswith(".gz") else open
        # newline="" leaves newline handling to the csv module, so that line
        # breaks embedded in quoted fields are interpreted correctly.
        with opener(
            self.path, mode="rt", encoding="utf-8-sig", newline=""
        ) as csvfile:
            reader = csv.DictReader(csvfile)
            if not self._check_fields(reader):
                raise OperationFailedException(
                    f"Cannot parse CSV file {self.path}. "
                    + "The file must have a header row that defines at least "
                    + "the columns 'text' and 'subject_uris'."
                )
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        """Yield the Document described by one parsed CSV row.

        The ``subject_uris`` value is split on whitespace and each URI is
        looked up in the subject index. Columns other than ``text`` and
        ``subject_uris`` become the document metadata.
        """
        # "or ''" guards against None values, which csv.DictReader uses for
        # fields that are missing from a short row.
        subject_ids = {
            self.subject_index.by_uri(annif.util.cleanup_uri(uri))
            for uri in (row["subject_uris"] or "").strip().split()
        }
        metadata = {
            key: val for key, val in row.items() if key not in ("text", "subject_uris")
        }
        yield Document(
            text=(row["text"] or ""),
            subject_set=SubjectSet(subject_ids),
            metadata=metadata,
        )

    def _check_fields(self, reader: csv.DictReader) -> bool:
        """Return True if the header defines both required columns."""
        fns = reader.fieldnames
        return fns is not None and "text" in fns and "subject_uris" in fns

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return path.lower().endswith((".csv", ".csv.gz"))
157
158
159
class DocumentList(DocumentCorpus):
    """A document corpus based on a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        """Keep a reference to the given iterable of documents."""
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped documents one by one, in their original order."""
        for document in self._documents:
            yield document
169
170
171
class TransformingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another corpus and passes each of its
    documents through a transform function"""

    def __init__(self, corpus, transform_fn):
        """Remember the wrapped corpus and the function applied to its documents."""
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Yield the result of the transform function for every wrapped document."""
        transformed = (self._transform_fn(doc) for doc in self._orig_corpus.documents)
        yield from transformed
183
184
185
class LimitingDocumentCorpus(DocumentCorpus):
    """A document corpus that wraps another corpus and yields at most a
    given number of its documents"""

    def __init__(self, corpus, docs_limit):
        """Remember the wrapped corpus and the maximum number of documents."""
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Yield the wrapped documents, stopping after ``docs_limit`` of them."""
        yield from islice(self._orig_corpus.documents, self.docs_limit)
197