annif.corpus.document - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

annif.corpus.document B
last analyzed 2025-08-19 09:05 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	281
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	190
dl	0
loc	281
rs	7.92
c	0
b	0
f	0
wmc	51

22 Methods

Rating	Name	Size	Complexity
A	DocumentDirectory._read_txt_file()	16	5
A	DocumentDirectory._get_subject_filename()	11	3
A	DocumentDirectory.__init__()	11	1
A	DocumentDirectory.__iter__()	10	3
A	DocumentDirectory.documents()	15	4
A	DocumentFileJSONL.documents()	17	5
A	LimitingDocumentCorpus.documents()	4	2
A	TransformingDocumentCorpus.documents()	4	2
A	DocumentFileTSV.__init__()	6	1
A	DocumentFileJSONL.__init__()	11	1
B	DocumentFileCSV.documents()	23	6
A	DocumentFileTSV._parse_tsv_line()	15	3
A	DocumentList.__init__()	2	1
A	DocumentFileCSV.__init__()	6	1
A	DocumentFileCSV._check_fields()	6	2
A	LimitingDocumentCorpus.__init__()	3	1
A	DocumentFileTSV.documents()	9	4
A	DocumentFileCSV._parse_row()	15	2
A	DocumentFileJSONL.is_jsonl_file()	6	1
A	TransformingDocumentCorpus.__init__()	3	1
A	DocumentList.documents()	3	1
A	DocumentFileCSV.is_csv_file()	6	1

How to fix Complexity

"""Classes for supporting document corpora"""

from __future__ import annotations

import csv
import glob
import gzip
import os.path
import re
from itertools import islice
from typing import TYPE_CHECKING

import annif.util
from annif.exception import OperationFailedException

from .json import json_file_to_document, json_to_document
from .types import Document, DocumentCorpus, SubjectSet

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.vocab import SubjectIndex

logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """Document corpus backed by the ``*.txt`` and ``*.json`` files found
    directly inside one directory.

    For text files, subjects are read from a sibling ``.tsv`` or ``.key``
    file when ``require_subjects`` is set; JSON files are delegated to
    ``json_file_to_document``.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[str]:
        """Yield the paths of the corpus files: every ``*.txt`` file in
        sorted order first, then every ``*.json`` file in sorted order."""

        for pattern in ("*.txt", "*.json"):
            yield from sorted(glob.glob(os.path.join(self.path, pattern)))

    @staticmethod
    def _get_subject_filename(filename: str) -> str | None:
        """Return the path of the subject file that exists next to the given
        text file, trying ``.tsv`` before ``.key``; None if neither exists."""

        for extension in (".tsv", ".key"):
            candidate = re.sub(r"\.txt$", extension, filename)
            if os.path.exists(candidate):
                return candidate
        return None

    def _read_txt_file(self, filename: str) -> Document | None:
        """Read one text file into a Document.

        Without ``require_subjects`` the document carries no subject set.
        With it, the subjects are parsed from the accompanying subject file,
        and None is returned when no such file exists.
        """

        # undecodable bytes are replaced instead of aborting the whole corpus
        with open(filename, errors="replace", encoding="utf-8-sig") as docfile:
            text = docfile.read()

        subjects = None
        if self.require_subjects:
            subjfilename = self._get_subject_filename(filename)
            if subjfilename is None:
                # subjects are mandatory but missing: skip this document
                return None
            with open(subjfilename, encoding="utf-8-sig") as subjfile:
                subject_data = subjfile.read()
            subjects = SubjectSet.from_string(
                subject_data, self.subject_index, self.language
            )

        return Document(text=text, subject_set=subjects, file_path=filename)

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for each corpus file, leaving out files for which
        the reader returned None."""

        for path in self:
            if not path.endswith(".txt"):
                doc = json_file_to_document(
                    path,
                    self.subject_index,
                    self.language,
                    self.require_subjects,
                )
            else:
                doc = self._read_txt_file(path)

            if doc is None:
                continue
            yield doc


class DocumentFileTSV(DocumentCorpus):
    """Document corpus read from a TSV file, one document per line: the text,
    a tab, and then whitespace-separated subject URIs. Paths ending in
    ``.gz`` are read through gzip."""

    def __init__(
        self, path: str, subject_index: SubjectIndex, require_subjects=True
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.require_subjects = require_subjects

    @property
    def documents(self) -> Iterator[Document]:
        """Yield the documents parsed from the lines of the file."""

        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
            for line in tsvfile:
                yield from self._parse_tsv_line(line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Yield at most one Document for a single line.

        A line without a tab is logged and skipped when subjects are
        required; otherwise the stripped line becomes the document text.
        """

        text, tab, uris = line.partition("\t")
        if tab:
            subject_ids = set()
            for uri in uris.split():
                cleaned_uri = annif.util.cleanup_uri(uri)
                subject_ids.add(self.subject_index.by_uri(cleaned_uri))
            yield Document(text=text, subject_set=SubjectSet(subject_ids))
        elif self.require_subjects:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
        else:
            yield Document(text=line.strip())


class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects.

    The file must start with a header row. The ``text`` column holds the
    document text and the ``subject_uris`` column holds whitespace-separated
    subject URIs; all other columns are passed along as document metadata.
    Gzip-compressed files (``.gz`` suffix, in any letter case) are supported.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex,
        require_subjects: bool = True,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.require_subjects = require_subjects

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document per data row of the CSV file.

        Raises:
            OperationFailedException: if the header row does not define the
                required columns.
        """

        # Detect compression case-insensitively so that it agrees with
        # is_csv_file(), which also accepts e.g. "CORPUS.CSV.GZ".
        opener = gzip.open if self.path.lower().endswith(".gz") else open
        # newline="" hands newline handling over to the csv module, which is
        # what the csv documentation requires for quoted fields that contain
        # line breaks to be parsed correctly.
        with opener(
            self.path, mode="rt", encoding="utf-8-sig", newline=""
        ) as csvfile:
            reader = csv.DictReader(csvfile)
            if not self._check_fields(reader):
                if self.require_subjects:
                    required = "the columns 'text' and 'subject_uris'."
                else:
                    required = "the column 'text'."
                raise OperationFailedException(
                    f"Cannot parse CSV file {self.path}. "
                    + "The file must have a header row that defines at least "
                    + required
                )
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        """Yield the Document built from one CSV row.

        Subjects are only looked up when ``require_subjects`` is set;
        otherwise the document gets an empty subject set. Missing cell
        values (None) are treated as empty strings.
        """

        if self.require_subjects:
            subject_ids = {
                self.subject_index.by_uri(annif.util.cleanup_uri(uri))
                for uri in (row["subject_uris"] or "").strip().split()
            }
        else:
            subject_ids = set()
        # every column other than the two reserved ones is metadata
        metadata = {
            key: val for key, val in row.items() if key not in ("text", "subject_uris")
        }
        yield Document(
            text=(row["text"] or ""),
            subject_set=SubjectSet(subject_ids),
            metadata=metadata,
        )

    def _check_fields(self, reader: csv.DictReader) -> bool:
        """Return True if the header row defines all required columns:
        ``text``, plus ``subject_uris`` when subjects are required."""

        fns = reader.fieldnames
        if fns is None:
            # no header row could be read at all
            return False
        if self.require_subjects:
            required = ("text", "subject_uris")
        else:
            required = ("text",)
        return all(name in fns for name in required)

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return path.lower().endswith((".csv", ".csv.gz"))


class DocumentFileJSONL(DocumentCorpus):
    """A JSON Lines file as a corpus of documents with subjects.

    Each line of the file is converted with ``json_to_document``.
    Gzip-compressed files (``.gz`` suffix, in any letter case) are supported.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex,
        language: str,
        require_subjects: bool = True,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    @property
    def documents(self) -> Iterator[Document]:
        """Yield the Document parsed from each line of the file, leaving out
        lines for which ``json_to_document`` returns None."""

        # Detect compression case-insensitively so that it agrees with
        # is_jsonl_file(), which also accepts e.g. "CORPUS.JSONL.GZ".
        opener = gzip.open if self.path.lower().endswith(".gz") else open
        # utf-8-sig (as used by the other corpus readers in this module)
        # drops a leading byte order mark so it does not end up in the
        # first JSON line; files without a BOM are decoded as plain UTF-8.
        with opener(self.path, mode="rt", encoding="utf-8-sig") as jsonlfile:
            for line in jsonlfile:
                doc = json_to_document(
                    self.path,
                    line,
                    self.subject_index,
                    self.language,
                    self.require_subjects,
                )
                if doc is not None:
                    yield doc

    @staticmethod
    def is_jsonl_file(path: str) -> bool:
        """return True if the path looks like a JSONL file"""

        return path.lower().endswith((".jsonl", ".jsonl.gz"))


class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or any other iterable of Document
    objects"""

    def __init__(self, documents):
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped documents one by one, in their original order."""
        for document in self._documents:
            yield document


class TransformingDocumentCorpus(DocumentCorpus):
    """A wrapper around another document corpus that passes every document
    through a given transform function"""

    def __init__(self, corpus, transform_fn):
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Lazily yield the transformed version of each wrapped document."""
        yield from map(self._transform_fn, self._orig_corpus.documents)


class LimitingDocumentCorpus(DocumentCorpus):
    """A wrapper around another document corpus that yields no more than a
    given number of its documents"""

    def __init__(self, corpus, docs_limit):
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Yield at most ``docs_limit`` documents from the wrapped corpus."""
        yield from islice(self._orig_corpus.documents, self.docs_limit)


1			"""Classes for supporting document corpora"""
2
3			from __future__ import annotations
4
5			import csv
6			import glob
7			import gzip
8			import os.path
9			import re
10			from itertools import islice
11			from typing import TYPE_CHECKING
12
13			import annif.util
14			from annif.exception import OperationFailedException
15
16			from .json import json_file_to_document, json_to_document
17			from .types import Document, DocumentCorpus, SubjectSet
18
19			if TYPE_CHECKING:
20			from collections.abc import Iterator
21
22			from annif.vocab import SubjectIndex
23
24			logger = annif.logger
25
26
27			class DocumentDirectory(DocumentCorpus):
28			"""A directory of files as a full text document corpus"""
29
30			def __init__(
31			self,
32			path: str,
33			subject_index: SubjectIndex \| None = None,
34			language: str \| None = None,
35			require_subjects: bool = False,
36			) -> None:
37			self.path = path
38			self.subject_index = subject_index
39			self.language = language
40			self.require_subjects = require_subjects
41
42			def __iter__(self) -> Iterator[str]:
43			"""Iterate through the directory, yielding file paths with corpus documents."""
44
45			# txt files
46			for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
47			yield filename
48
49			# json files
50			for filename in sorted(glob.glob(os.path.join(self.path, "*.json"))):
51			yield filename
52
53			@staticmethod
54			def _get_subject_filename(filename: str) -> str \| None:
55			tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
56			if os.path.exists(tsvfilename):
57			return tsvfilename
58
59			keyfilename = re.sub(r"\.txt$", ".key", filename)
60			if os.path.exists(keyfilename):
61			return keyfilename
62
63			return None
64
65			def _read_txt_file(self, filename: str) -> Document \| None:
66			with open(filename, errors="replace", encoding="utf-8-sig") as docfile:
67			text = docfile.read()
68			if not self.require_subjects:
69			return Document(text=text, subject_set=None, file_path=filename)
70
71			subjfilename = self._get_subject_filename(filename)
72			if subjfilename is None:
73			# subjects required but not found, skipping this docfile
74			return None
75
76			with open(subjfilename, encoding="utf-8-sig") as subjfile:
77			subjects = SubjectSet.from_string(
78			subjfile.read(), self.subject_index, self.language
79			)
80			return Document(text=text, subject_set=subjects, file_path=filename)
81
82			@property
83			def documents(self) -> Iterator[Document]:
84			for docfilename in self:
85			if docfilename.endswith(".txt"):
86			doc = self._read_txt_file(docfilename)
87			else:
88			doc = json_file_to_document(
89			docfilename,
90			self.subject_index,
91			self.language,
92			self.require_subjects,
93			)
94
95			if doc is not None:
96			yield doc
97
98
99			class DocumentFileTSV(DocumentCorpus):
100			"""A TSV file as a corpus of documents with subjects"""
101
102			def __init__(
103			self, path: str, subject_index: SubjectIndex, require_subjects=True
104			) -> None:
105			self.path = path
106			self.subject_index = subject_index
107			self.require_subjects = require_subjects
108
109			@property
110			def documents(self) -> Iterator[Document]:
111			if self.path.endswith(".gz"):
112			opener = gzip.open
113			else:
114			opener = open
115			with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
116			for line in tsvfile:
117			yield from self._parse_tsv_line(line)
118
119			def _parse_tsv_line(self, line: str) -> Iterator[Document]:
120			if "\t" in line:
121			text, uris = line.split("\t", maxsplit=1)
122			subject_ids = {
123			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
124			for uri in uris.split()
125			}
126			yield Document(text=text, subject_set=SubjectSet(subject_ids))
127			else:
128			if self.require_subjects:
129			logger.warning(
130			'Skipping invalid line (missing tab): "%s"', line.rstrip()
131			)
132			else:
133			yield Document(text=line.strip())
134
135
136			class DocumentFileCSV(DocumentCorpus):
137			"""A CSV file as a corpus of documents with subjects"""
138
139			def __init__(
140			self, path: str, subject_index: SubjectIndex, require_subjects=True
141			) -> None:
142			self.path = path
143			self.subject_index = subject_index
144			self.require_subjects = require_subjects
145
146			@property
147			def documents(self) -> Iterator[Document]:
148			if self.path.endswith(".gz"):
149			opener = gzip.open
150			else:
151			opener = open
152			with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
153			reader = csv.DictReader(csvfile)
154			if not self._check_fields(reader):
155			if self.require_subjects:
156			raise OperationFailedException(
157			f"Cannot parse CSV file {self.path}. "
158			+ "The file must have a header row that defines at least "
159			+ "the columns 'text' and 'subject_uris'."
160			)
161			else:
162			raise OperationFailedException(
163			f"Cannot parse CSV file {self.path}. "
164			+ "The file must have a header row that defines at least "
165			+ "the column 'text'."
166			)
167			for row in reader:
168			yield from self._parse_row(row)
169
170			def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
171			if self.require_subjects:
172			subject_ids = {
173			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
174			for uri in (row["subject_uris"] or "").strip().split()
175			}
176			else:
177			subject_ids = set()
178			metadata = {
179			key: val for key, val in row.items() if key not in ("text", "subject_uris")
180			}
181			yield Document(
182			text=(row["text"] or ""),
183			subject_set=SubjectSet(subject_ids),
184			metadata=metadata,
185			)
186
187			def _check_fields(self, reader: csv.DictReader) -> bool:
188			fns = reader.fieldnames
189			if self.require_subjects:
190			return fns is not None and "text" in fns and "subject_uris" in fns
191			else:
192			return fns is not None and "text" in fns
193
194			@staticmethod
195			def is_csv_file(path: str) -> bool:
196			"""return True if the path looks like a CSV file"""
197
198			path_lc = path.lower()
199			return path_lc.endswith(".csv") or path_lc.endswith(".csv.gz")
200
201
202			class DocumentFileJSONL(DocumentCorpus):
203			"""A JSON Lines file as a corpus of documents with subjects"""
204
205			def __init__(
206			self,
207			path: str,
208			subject_index: SubjectIndex,
209			language: str,
210			require_subjects=True,
211			) -> None:
212			self.path = path
213			self.subject_index = subject_index
214			self.language = language
215			self.require_subjects = require_subjects
216
217			@property
218			def documents(self) -> Iterator[Document]:
219			if self.path.endswith(".gz"):
220			opener = gzip.open
221			else:
222			opener = open
223			with opener(self.path, mode="rt", encoding="utf-8") as jsonlfile:
224			for line in jsonlfile:
225			doc = json_to_document(
226			self.path,
227			line,
228			self.subject_index,
229			self.language,
230			self.require_subjects,
231			)
232			if doc is not None:
233			yield doc
234
235			@staticmethod
236			def is_jsonl_file(path: str) -> bool:
237			"""return True if the path looks like a JSONL file"""
238
239			path_lc = path.lower()
240			return path_lc.endswith(".jsonl") or path_lc.endswith(".jsonl.gz")
241
242
243			class DocumentList(DocumentCorpus):
244			"""A document corpus based on a list of other iterable of Document
245			objects"""
246
247			def __init__(self, documents):
248			self._documents = documents
249
250			@property
251			def documents(self):
252			yield from self._documents
253
254
255			class TransformingDocumentCorpus(DocumentCorpus):
256			"""A document corpus that wraps another document corpus but transforms the
257			documents using a given transform function"""
258
259			def __init__(self, corpus, transform_fn):
260			self._orig_corpus = corpus
261			self._transform_fn = transform_fn
262
263			@property
264			def documents(self):
265			for doc in self._orig_corpus.documents:
266			yield self._transform_fn(doc)
267
268
269			class LimitingDocumentCorpus(DocumentCorpus):
270			"""A document corpus that wraps another document corpus but limits the
271			number of documents to a given limit"""
272
273			def __init__(self, corpus, docs_limit):
274			self._orig_corpus = corpus
275			self.docs_limit = docs_limit
276
277			@property
278			def documents(self):
279			for doc in islice(self._orig_corpus.documents, self.docs_limit):
280			yield doc
281

NatLibFi / Annif

annif.corpus.document B last analyzed 2025-08-19 09:05 UTC

Complexity

Size/Duplication

Importance

22 Methods

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like

annif.corpus.document B
last analyzed 2025-08-19 09:05 UTC