Passed
Push — issue813-flexible-fusion-csv-s... ( ca1bd6 )
by Osma
03:35
created

annif.corpus.document.DocumentFileTSV.documents()   A

Complexity

Conditions 4

Size

Total Lines 9
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 8
nop 1
dl 0
loc 9
rs 10
c 0
b 0
f 0
1
"""Clases for supporting document corpora"""
2
3
from __future__ import annotations
4
5
import csv
6
import glob
7
import gzip
8
import os.path
9
import re
10
from itertools import islice
11
from typing import TYPE_CHECKING
12
13
import annif.util
14
15
from .types import Document, DocumentCorpus, SubjectSet
16
17
if TYPE_CHECKING:
18
    from collections.abc import Iterator
19
20
    from annif.corpus.subject import SubjectIndex
21
22
logger = annif.logger
23
24
25
class DocumentDirectory(DocumentCorpus):
    """Full text document corpus backed by a directory of ``*.txt`` files.

    A document file may be accompanied by a subject file whose name is the
    document file name with ``.txt`` replaced by ``.tsv`` or ``.key``.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        """Store the directory path and the settings used when reading it."""
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]:
        """Yield (docfile, subjectfile) path tuples in sorted file name order.

        When require_subjects is False, subjectfile is always None. When it
        is True, only documents that have an existing .tsv or .key subject
        file are yielded (.tsv is tried first); other documents are skipped.
        """
        pattern = os.path.join(self.path, "*.txt")
        for docpath in sorted(glob.glob(pattern)):
            if not self.require_subjects:
                yield (docpath, None)
                continue
            # Try the supported subject file extensions in priority order and
            # stop at the first one that exists on disk.
            for extension in (".tsv", ".key"):
                subjpath = re.sub(r"\.txt$", extension, docpath)
                if os.path.exists(subjpath):
                    yield (docpath, subjpath)
                    break

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for each (docfile, subjectfile) pair of the corpus.

        The subject set of the Document is None when there is no subject file.
        """
        for docpath, subjpath in self:
            # Undecodable bytes in the document text are replaced, not fatal.
            with open(docpath, errors="replace", encoding="utf-8-sig") as infile:
                content = infile.read()
            subject_set = None
            if subjpath is not None:
                with open(subjpath, encoding="utf-8-sig") as infile:
                    subject_set = SubjectSet.from_string(
                        infile.read(), self.subject_index, self.language
                    )
            yield Document(text=content, subject_set=subject_set)
71
72
73
class DocumentFileTSV(DocumentCorpus):
    """Document corpus read from a TSV file, one document per line.

    Each line holds the document text, a tab character and then the
    whitespace-separated subject URIs. A path ending in ".gz" is read
    through gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        """Store the file path and the subject index used to resolve URIs."""
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document for every valid line of the TSV file."""
        open_fn = gzip.open if self.path.endswith(".gz") else open
        with open_fn(self.path, mode="rt", encoding="utf-8-sig") as infile:
            for raw_line in infile:
                yield from self._parse_tsv_line(raw_line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Yield the Document found on a line.

        A line without a tab character is skipped with a logged warning.
        """
        if "\t" not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return
        # Only the first tab separates the text from the subject URIs.
        text, _, uris = line.partition("\t")
        cleaned_uris = (annif.util.cleanup_uri(uri) for uri in uris.split())
        subject_ids = {self.subject_index.by_uri(uri) for uri in cleaned_uris}
        yield Document(text=text, subject_set=SubjectSet(subject_ids))
100
101
102
class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects.

    The first row of the file is used as the header; the columns "text" and
    "subject_uris" are read from every following row, the latter holding
    whitespace-separated subject URIs. A path ending in ".gz" (in any letter
    case) is read through gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        """Store the file path and the subject index used to resolve URIs."""
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document for every data row of the CSV file."""
        # Compare the suffix case-insensitively so that every path accepted
        # by is_csv_file() (e.g. "DOCS.CSV.GZ") gets the matching opener.
        opener = gzip.open if self.path.lower().endswith(".gz") else open
        # newline="" leaves newline handling to the csv module, as its
        # documentation requires; otherwise line breaks embedded in quoted
        # fields are not interpreted correctly.
        with opener(
            self.path, mode="rt", encoding="utf-8-sig", newline=""
        ) as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        """Yield the Document described by one CSV row.

        Raises KeyError if the row lacks a "text" or "subject_uris" column.
        """
        subject_ids = {
            self.subject_index.by_uri(annif.util.cleanup_uri(uri))
            for uri in row["subject_uris"].strip().split()
        }
        yield Document(text=row["text"], subject_set=SubjectSet(subject_ids))

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        # The check is case-insensitive; gzip-compressed CSV files count too.
        return path.lower().endswith((".csv", ".csv.gz"))
133
134
135
class DocumentList(DocumentCorpus):
    """A document corpus that wraps a list or any other iterable of Document
    objects"""

    def __init__(self, documents):
        """Keep a reference to the given iterable of documents."""
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped documents one by one."""
        for document in self._documents:
            yield document
145
146
147
class TransformingDocumentCorpus(DocumentCorpus):
    """A document corpus wrapper that applies a transform function to the
    text of every document of the wrapped corpus; the subject sets are passed
    through unchanged"""

    def __init__(self, corpus, transform_fn):
        """Store the wrapped corpus and the function applied to each text."""
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Yield a new Document with transformed text for each wrapped one."""
        for original in self._orig_corpus.documents:
            new_text = self._transform_fn(original.text)
            yield Document(text=new_text, subject_set=original.subject_set)
161
162
163
class LimitingDocumentCorpus(DocumentCorpus):
    """A document corpus wrapper that passes through at most a given number
    of documents from the wrapped corpus"""

    def __init__(self, corpus, docs_limit):
        """Store the wrapped corpus and the maximum number of documents."""
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Yield documents of the wrapped corpus, stopping at docs_limit."""
        source = self._orig_corpus.documents
        yield from islice(source, self.docs_limit)
175