annif.corpus.document - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

annif.corpus.document A
last analyzed 2025-08-08 10:59 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	191
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	129
dl	0
loc	191
rs	9.6
c	0
b	0
f	0
wmc	35

17 Methods

Rating	Name	Size	Complexity
A	DocumentFileTSV.__init__()	3	1
A	DocumentFileCSV.documents()	16	5
A	DocumentFileTSV._parse_tsv_line()	10	2
A	DocumentFileCSV.__init__()	3	1
A	DocumentDirectory.documents()	13	5
A	DocumentFileTSV.documents()	9	4
A	DocumentDirectory.__init__()	11	1
A	DocumentDirectory.__iter__()	17	5
A	LimitingDocumentCorpus.documents()	4	2
A	TransformingDocumentCorpus.documents()	4	2
A	DocumentList.__init__()	2	1
A	DocumentFileCSV._check_fields()	3	1
A	LimitingDocumentCorpus.__init__()	3	1
A	DocumentFileCSV._parse_row()	12	1
A	TransformingDocumentCorpus.__init__()	3	1
A	DocumentList.documents()	3	1
A	DocumentFileCSV.is_csv_file()	6	1

"""Classes for supporting document corpora"""

from __future__ import annotations

import csv
import glob
import gzip
import os.path
import re
from itertools import islice
from typing import TYPE_CHECKING

import annif.util
from annif.exception import OperationFailedException

from .types import Document, DocumentCorpus, SubjectSet

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.corpus.subject import SubjectIndex

# Module-level logger, taken from the top-level annif package.
logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file directly inside ``path`` is one document. Subjects
    for a document are read from a sibling file that has the same name but a
    ``.tsv`` or ``.key`` extension instead of ``.txt``.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        """Store the corpus settings; no file system access happens here.

        path -- directory that contains the ``*.txt`` document files
        subject_index -- passed to SubjectSet.from_string when parsing
            subject files
        language -- passed to SubjectSet.from_string when parsing subject
            files
        require_subjects -- if True, only documents that have a subject file
            are included and their subjects are loaded; if False, all
            documents are included without subjects
        """
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]:
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If require_subjects is False, the
        subjectfile will be returned as None. If require_subjects is True,
        documents that have neither a .tsv nor a .key subject file are
        skipped."""

        # Escape the directory part so that glob metacharacters in the
        # directory name itself (e.g. "corpus[2020]") are matched literally;
        # only the "*.txt" suffix is meant to act as a wildcard.
        pattern = os.path.join(glob.escape(self.path), "*.txt")
        for filename in sorted(glob.glob(pattern)):
            if not self.require_subjects:
                yield (filename, None)
                continue
            # Prefer the .tsv subject file; fall back to .key.
            for extension in (".tsv", ".key"):
                subjfilename = re.sub(r"\.txt$", extension, filename)
                if os.path.exists(subjfilename):
                    yield (filename, subjfilename)
                    break

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for each (docfile, subjectfile) pair produced by
        iterating over this corpus. The subject_set is None when there is no
        subject file for the pair."""

        for docfilename, subjfilename in self:
            # errors="replace": undecodable bytes in the document text are
            # substituted instead of aborting the whole corpus read.
            with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
                text = docfile.read()
            if subjfilename is None:
                yield Document(text=text, subject_set=None)
                continue
            with open(subjfilename, encoding="utf-8-sig") as subjfile:
                subjects = SubjectSet.from_string(
                    subjfile.read(), self.subject_index, self.language
                )
            yield Document(text=text, subject_set=subjects)


class DocumentFileTSV(DocumentCorpus):
    """Document corpus read from a TSV file, one document per line: the text,
    a tab, then whitespace-separated subject URIs"""

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.subject_index = subject_index
        self.path = path

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for every well-formed line of the file; files
        whose name ends in .gz are read through gzip."""

        open_fn = gzip.open if self.path.endswith(".gz") else open
        with open_fn(self.path, mode="rt", encoding="utf-8-sig") as handle:
            for raw_line in handle:
                yield from self._parse_tsv_line(raw_line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Yield one Document parsed from the line, or log a warning and
        yield nothing when the line contains no tab."""

        if "\t" not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return

        # Everything before the first tab is the text, the rest is URIs.
        text, _, uris = line.partition("\t")
        subject_ids = set()
        for uri in uris.split():
            cleaned = annif.util.cleanup_uri(uri)
            subject_ids.add(self.subject_index.by_uri(cleaned))
        yield Document(text=text, subject_set=SubjectSet(subject_ids))


class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects.

    The file must have a header row with at least the columns ``text`` and
    ``subject_uris``; any other columns are passed on as document metadata.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        """Store the file path and the subject index used to resolve URIs."""
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document per data row of the CSV file.

        Raises OperationFailedException if the header row does not define
        both the 'text' and 'subject_uris' columns.
        """
        # Compare case-insensitively so that the gzip detection agrees with
        # is_csv_file(), which also accepts e.g. "DOCS.CSV.GZ".
        if self.path.lower().endswith(".gz"):
            opener = gzip.open
        else:
            opener = open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)
            if not self._check_fields(reader):
                raise OperationFailedException(
                    f"Cannot parse CSV file {self.path}. "
                    + "The file must have a header row that defines at least "
                    + "the columns 'text' and 'subject_uris'."
                )
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        """Yield a single Document built from one CSV row.

        A missing (None) 'text' or 'subject_uris' value is treated as an
        empty string. Columns other than those two end up in the metadata.
        """
        subject_ids = {
            self.subject_index.by_uri(annif.util.cleanup_uri(uri))
            for uri in (row["subject_uris"] or "").strip().split()
        }
        metadata = {
            key: val for key, val in row.items() if key not in ("text", "subject_uris")
        }
        yield Document(
            text=(row["text"] or ""),
            subject_set=SubjectSet(subject_ids),
            metadata=metadata,
        )

    def _check_fields(self, reader: csv.DictReader) -> bool:
        """Return True if the header defines both required columns."""
        # fieldnames is None when the file has no header row at all.
        fns = reader.fieldnames
        return fns is not None and "text" in fns and "subject_uris" in fns

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return path.lower().endswith((".csv", ".csv.gz"))


class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or any other iterable of Document
    objects"""

    def __init__(self, documents):
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped Document objects one by one, in their original
        order."""
        for document in self._documents:
            yield document


class TransformingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that passes every document
    through a transform function before yielding it"""

    def __init__(self, corpus, transform_fn):
        self._transform_fn = transform_fn
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Lazily yield the transformed version of each document of the
        wrapped corpus."""
        yield from (
            self._transform_fn(original) for original in self._orig_corpus.documents
        )


class LimitingDocumentCorpus(DocumentCorpus):
    """Wrapper around another document corpus that yields at most a given
    number of its documents"""

    def __init__(self, corpus, docs_limit):
        self.docs_limit = docs_limit
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Yield documents of the wrapped corpus, stopping after docs_limit
        documents."""
        limited = islice(self._orig_corpus.documents, self.docs_limit)
        yield from limited


1			"""Classes for supporting document corpora"""
2
3			from __future__ import annotations
4
5			import csv
6			import glob
7			import gzip
8			import os.path
9			import re
10			from itertools import islice
11			from typing import TYPE_CHECKING
12
13			import annif.util
14			from annif.exception import OperationFailedException
15
16			from .types import Document, DocumentCorpus, SubjectSet
17
18			if TYPE_CHECKING:
19			from collections.abc import Iterator
20
21			from annif.corpus.subject import SubjectIndex
22
23			logger = annif.logger
24
25
26			class DocumentDirectory(DocumentCorpus):
27			"""A directory of files as a full text document corpus"""
28
29			def __init__(
30			self,
31			path: str,
32			subject_index: SubjectIndex | None = None,
33			language: str | None = None,
34			require_subjects: bool = False,
35			) -> None:
36			self.path = path
37			self.subject_index = subject_index
38			self.language = language
39			self.require_subjects = require_subjects
40
41			def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]:
42			"""Iterate through the directory, yielding tuples of (docfile,
43			subjectfile) containing file paths. If require_subjects is False, the
44			subjectfile will be returned as None."""
45
46			for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
47			if self.require_subjects:
48			tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
49			if os.path.exists(tsvfilename):
50			yield (filename, tsvfilename)
51			continue
52			keyfilename = re.sub(r"\.txt$", ".key", filename)
53			if os.path.exists(keyfilename):
54			yield (filename, keyfilename)
55			continue
56			else:
57			yield (filename, None)
58
59			@property
60			def documents(self) -> Iterator[Document]:
61			for docfilename, subjfilename in self:
62			with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
63			text = docfile.read()
64			if subjfilename is None:
65			yield Document(text=text, subject_set=None)
66			continue
67			with open(subjfilename, encoding="utf-8-sig") as subjfile:
68			subjects = SubjectSet.from_string(
69			subjfile.read(), self.subject_index, self.language
70			)
71			yield Document(text=text, subject_set=subjects)
72
73
74			class DocumentFileTSV(DocumentCorpus):
75			"""A TSV file as a corpus of documents with subjects"""
76
77			def __init__(self, path: str, subject_index: SubjectIndex) -> None:
78			self.path = path
79			self.subject_index = subject_index
80
81			@property
82			def documents(self) -> Iterator[Document]:
83			if self.path.endswith(".gz"):
84			opener = gzip.open
85			else:
86			opener = open
87			with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
88			for line in tsvfile:
89			yield from self._parse_tsv_line(line)
90
91			def _parse_tsv_line(self, line: str) -> Iterator[Document]:
92			if "\t" in line:
93			text, uris = line.split("\t", maxsplit=1)
94			subject_ids = {
95			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
96			for uri in uris.split()
97			}
98			yield Document(text=text, subject_set=SubjectSet(subject_ids))
99			else:
100			logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
101
102
103			class DocumentFileCSV(DocumentCorpus):
104			"""A CSV file as a corpus of documents with subjects"""
105
106			def __init__(self, path: str, subject_index: SubjectIndex) -> None:
107			self.path = path
108			self.subject_index = subject_index
109
110			@property
111			def documents(self) -> Iterator[Document]:
112			if self.path.endswith(".gz"):
113			opener = gzip.open
114			else:
115			opener = open
116			with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
117			reader = csv.DictReader(csvfile)
118			if not self._check_fields(reader):
119			raise OperationFailedException(
120			f"Cannot parse CSV file {self.path}. "
121			+ "The file must have a header row that defines at least "
122			+ "the columns 'text' and 'subject_uris'."
123			)
124			for row in reader:
125			yield from self._parse_row(row)
126
127			def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
128			subject_ids = {
129			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
130			for uri in (row["subject_uris"] or "").strip().split()
131			}
132			metadata = {
133			key: val for key, val in row.items() if key not in ("text", "subject_uris")
134			}
135			yield Document(
136			text=(row["text"] or ""),
137			subject_set=SubjectSet(subject_ids),
138			metadata=metadata,
139			)
140
141			def _check_fields(self, reader: csv.DictReader) -> bool:
142			fns = reader.fieldnames
143			return fns is not None and "text" in fns and "subject_uris" in fns
144
145			@staticmethod
146			def is_csv_file(path: str) -> bool:
147			"""return True if the path looks like a CSV file"""
148
149			path_lc = path.lower()
150			return path_lc.endswith(".csv") or path_lc.endswith(".csv.gz")
151
152
153			class DocumentList(DocumentCorpus):
154			"""A document corpus based on a list or other iterable of Document
155			objects"""
156
157			def __init__(self, documents):
158			self._documents = documents
159
160			@property
161			def documents(self):
162			yield from self._documents
163
164
165			class TransformingDocumentCorpus(DocumentCorpus):
166			"""A document corpus that wraps another document corpus but transforms the
167			documents using a given transform function"""
168
169			def __init__(self, corpus, transform_fn):
170			self._orig_corpus = corpus
171			self._transform_fn = transform_fn
172
173			@property
174			def documents(self):
175			for doc in self._orig_corpus.documents:
176			yield self._transform_fn(doc)
177
178
179			class LimitingDocumentCorpus(DocumentCorpus):
180			"""A document corpus that wraps another document corpus but limits the
181			number of documents to a given limit"""
182
183			def __init__(self, corpus, docs_limit):
184			self._orig_corpus = corpus
185			self.docs_limit = docs_limit
186
187			@property
188			def documents(self):
189			for doc in islice(self._orig_corpus.documents, self.docs_limit):
190			yield doc
191

NatLibFi / Annif

annif.corpus.document A last analyzed 2025-08-08 10:59 UTC

Complexity

Size/Duplication

Importance

17 Methods

Duplication Side-by-Side

Filter issues like

annif.corpus.document A
last analyzed 2025-08-08 10:59 UTC