annif.corpus.document.DocumentDirectory.documents() - Code Metrics - Inspection of "refactor DocumentDirectory in preparation for addi..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — issue868-json-corpus-format ( c8699d )

by Osma

created 2025-08-13 12:42 UTC

DocumentDirectory.documents() B

↳ Parent: annif.corpus.document

Complexity

Conditions

Size

Total Lines	19
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	6
eloc	15
nop	1
dl	0
loc	19
rs	8.6666
c	0
b	0
f	0

"""Clases for supporting document corpora"""

from __future__ import annotations

import csv
import glob
import gzip
import os.path
import re
from itertools import islice
from typing import TYPE_CHECKING

import annif.util
from annif.exception import OperationFailedException

from .types import Document, DocumentCorpus, SubjectSet

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.corpus.subject import SubjectIndex

logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus.

    Every ``*.txt`` file directly inside ``path`` is treated as one document.
    When ``require_subjects`` is true, each document must be accompanied by a
    subject file with the same base name and the extension ``.tsv`` or
    ``.key``; documents without one are skipped.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        """Store the corpus location and the subject parsing settings.

        ``subject_index`` and ``language`` are only used when subject files
        are parsed, i.e. when ``require_subjects`` is true.
        """
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[str]:
        """Iterate through the directory, yielding file paths with corpus documents."""

        # Escape the directory part so that glob metacharacters occurring in
        # the path itself ("[", "]", "*", "?") are matched literally instead of
        # being interpreted as a pattern; only "*.txt" is a real wildcard.
        directory = glob.escape(os.fspath(self.path))
        yield from sorted(glob.glob(os.path.join(directory, "*.txt")))

    @staticmethod
    def _get_subject_filename(filename: str) -> str | None:
        """Return the path of the subject file belonging to a document file.

        The ``.txt`` extension of ``filename`` is replaced by ``.tsv`` and then
        by ``.key``; the first candidate that exists is returned, or None if
        neither exists.
        """
        # Strip the extension case-insensitively: on platforms where glob
        # matching ignores case, "*.txt" also matches e.g. "DOC.TXT", and a
        # case-sensitive substitution would leave the name unchanged and make
        # the document itself look like its own subject file.
        stem = re.sub(r"\.txt$", "", filename, flags=re.IGNORECASE)
        for extension in (".tsv", ".key"):
            candidate = stem + extension
            if os.path.exists(candidate):
                return candidate

        return None

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document per text file found in the directory.

        Without ``require_subjects`` the documents carry no subject set. With
        it, the subject file is parsed using ``SubjectSet.from_string`` and
        documents lacking a subject file are skipped.
        """
        for docfilename in self:
            subjfilename = None
            if self.require_subjects:
                # Look for the subject file first so that the text of a
                # document that is going to be skipped is never read.
                subjfilename = self._get_subject_filename(docfilename)
                if subjfilename is None:
                    # subjects required but not found, skipping this docfile
                    continue

            with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
                text = docfile.read()

            if subjfilename is None:
                yield Document(text=text, subject_set=None)
                continue

            with open(subjfilename, encoding="utf-8-sig") as subjfile:
                subjects = SubjectSet.from_string(
                    subjfile.read(), self.subject_index, self.language
                )
            yield Document(text=text, subject_set=subjects)


class DocumentFileTSV(DocumentCorpus):
    """Corpus of documents with subjects, stored one per line in a TSV file.

    Each line consists of the document text, a tab character and a
    whitespace-separated list of subject URIs. Paths ending in ``.gz`` are
    read through gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.subject_index = subject_index
        self.path = path

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for every valid line of the file."""
        open_fn = gzip.open if self.path.endswith(".gz") else open
        with open_fn(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
            for raw_line in tsvfile:
                yield from self._parse_tsv_line(raw_line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Yield the Document described by one line, or nothing if it has no tab.

        Lines without a tab are reported with a warning and skipped.
        """
        if "\t" not in line:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return

        # Only the first tab separates text from URIs.
        text, _, uri_field = line.partition("\t")
        subject_ids = set()
        for uri in uri_field.split():
            subject_ids.add(self.subject_index.by_uri(annif.util.cleanup_uri(uri)))
        yield Document(text=text, subject_set=SubjectSet(subject_ids))


class DocumentFileCSV(DocumentCorpus):
    """Corpus of documents with subjects, read from a CSV file.

    The header row must define at least the columns ``text`` and
    ``subject_uris``; all remaining columns are passed along as document
    metadata. Paths ending in ``.gz`` are read through gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.subject_index = subject_index
        self.path = path

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for every data row of the file.

        Raises OperationFailedException if the required columns are missing
        from the header.
        """
        open_fn = gzip.open if self.path.endswith(".gz") else open
        with open_fn(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)
            if not self._check_fields(reader):
                message = (
                    f"Cannot parse CSV file {self.path}. "
                    "The file must have a header row that defines at least "
                    "the columns 'text' and 'subject_uris'."
                )
                raise OperationFailedException(message)
            for record in reader:
                yield from self._parse_row(record)

    def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
        """Yield the single Document described by one CSV row.

        Missing (None) values in the ``text`` and ``subject_uris`` columns are
        treated as empty strings.
        """
        reserved = ("text", "subject_uris")
        uri_field = row["subject_uris"] or ""
        subject_ids = set()
        for uri in uri_field.split():
            subject_ids.add(self.subject_index.by_uri(annif.util.cleanup_uri(uri)))

        extra_columns = {
            name: value for name, value in row.items() if name not in reserved
        }
        yield Document(
            text=(row["text"] or ""),
            subject_set=SubjectSet(subject_ids),
            metadata=extra_columns,
        )

    def _check_fields(self, reader: csv.DictReader) -> bool:
        """Return True if the header defines both required columns."""
        header = reader.fieldnames
        if header is None:
            return False
        return all(column in header for column in ("text", "subject_uris"))

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return path.lower().endswith((".csv", ".csv.gz"))


class DocumentList(DocumentCorpus):
    """A document corpus backed by a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        # Kept as given; it is iterated anew on every access to .documents.
        self._documents = documents

    @property
    def documents(self):
        """Yield the wrapped documents in their original order."""
        for document in self._documents:
            yield document


class TransformingDocumentCorpus(DocumentCorpus):
    """A wrapper around another document corpus that passes each of its
    documents through a transform function before yielding it"""

    def __init__(self, corpus, transform_fn):
        self._transform_fn = transform_fn
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Yield the transformed version of each document of the wrapped corpus."""
        yield from (
            self._transform_fn(document) for document in self._orig_corpus.documents
        )


class LimitingDocumentCorpus(DocumentCorpus):
    """A wrapper around another document corpus that yields at most a given
    number of its documents"""

    def __init__(self, corpus, docs_limit):
        self.docs_limit = docs_limit
        self._orig_corpus = corpus

    @property
    def documents(self):
        """Yield documents of the wrapped corpus, stopping at docs_limit."""
        # docs_limit is handed to islice as its stop argument.
        yield from islice(self._orig_corpus.documents, self.docs_limit)


1			"""Clases for supporting document corpora"""
2
3			from __future__ import annotations
4
5			import csv
6			import glob
7			import gzip
8			import os.path
9			import re
10			from itertools import islice
11			from typing import TYPE_CHECKING
12
13			import annif.util
14			from annif.exception import OperationFailedException
15
16			from .types import Document, DocumentCorpus, SubjectSet
17
18			if TYPE_CHECKING:
19			from collections.abc import Iterator
20
21			from annif.corpus.subject import SubjectIndex
22
23			logger = annif.logger
24
25
26			class DocumentDirectory(DocumentCorpus):
27			"""A directory of files as a full text document corpus"""
28
29			def __init__(
30			self,
31			path: str,
32			subject_index: SubjectIndex | None = None,
33			language: str | None = None,
34			require_subjects: bool = False,
35			) -> None:
36			self.path = path
37			self.subject_index = subject_index
38			self.language = language
39			self.require_subjects = require_subjects
40
41			def __iter__(self) -> Iterator[str]:
42			"""Iterate through the directory, yielding file paths with corpus documents."""
43
44			for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
45			yield filename
46
47			@staticmethod
48			def _get_subject_filename(filename: str) -> str | None:
49			tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
50			if os.path.exists(tsvfilename):
51			return tsvfilename
52
53			keyfilename = re.sub(r"\.txt$", ".key", filename)
54			if os.path.exists(keyfilename):
55			return keyfilename
56
57			return None
58
59			@property
60			def documents(self) -> Iterator[Document]:
61			for docfilename in self:
62			with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
63			text = docfile.read()
64			if not self.require_subjects:
65			yield Document(text=text, subject_set=None)
66			continue
67
68			subjfilename = self._get_subject_filename(docfilename)
69			if subjfilename is None:
70			# subjects required but not found, skipping this docfile
71			continue
72
73			with open(subjfilename, encoding="utf-8-sig") as subjfile:
74			subjects = SubjectSet.from_string(
75			subjfile.read(), self.subject_index, self.language
76			)
77			yield Document(text=text, subject_set=subjects)
78
79
80			class DocumentFileTSV(DocumentCorpus):
81			"""A TSV file as a corpus of documents with subjects"""
82
83			def __init__(self, path: str, subject_index: SubjectIndex) -> None:
84			self.path = path
85			self.subject_index = subject_index
86
87			@property
88			def documents(self) -> Iterator[Document]:
89			if self.path.endswith(".gz"):
90			opener = gzip.open
91			else:
92			opener = open
93			with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
94			for line in tsvfile:
95			yield from self._parse_tsv_line(line)
96
97			def _parse_tsv_line(self, line: str) -> Iterator[Document]:
98			if "\t" in line:
99			text, uris = line.split("\t", maxsplit=1)
100			subject_ids = {
101			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
102			for uri in uris.split()
103			}
104			yield Document(text=text, subject_set=SubjectSet(subject_ids))
105			else:
106			logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
107
108
109			class DocumentFileCSV(DocumentCorpus):
110			"""A CSV file as a corpus of documents with subjects"""
111
112			def __init__(self, path: str, subject_index: SubjectIndex) -> None:
113			self.path = path
114			self.subject_index = subject_index
115
116			@property
117			def documents(self) -> Iterator[Document]:
118			if self.path.endswith(".gz"):
119			opener = gzip.open
120			else:
121			opener = open
122			with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
123			reader = csv.DictReader(csvfile)
124			if not self._check_fields(reader):
125			raise OperationFailedException(
126			f"Cannot parse CSV file {self.path}. "
127			+ "The file must have a header row that defines at least "
128			+ "the columns 'text' and 'subject_uris'."
129			)
130			for row in reader:
131			yield from self._parse_row(row)
132
133			def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
134			subject_ids = {
135			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
136			for uri in (row["subject_uris"] or "").strip().split()
137			}
138			metadata = {
139			key: val for key, val in row.items() if key not in ("text", "subject_uris")
140			}
141			yield Document(
142			text=(row["text"] or ""),
143			subject_set=SubjectSet(subject_ids),
144			metadata=metadata,
145			)
146
147			def _check_fields(self, reader: csv.DictReader) -> bool:
148			fns = reader.fieldnames
149			return fns is not None and "text" in fns and "subject_uris" in fns
150
151			@staticmethod
152			def is_csv_file(path: str) -> bool:
153			"""return True if the path looks like a CSV file"""
154
155			path_lc = path.lower()
156			return path_lc.endswith(".csv") or path_lc.endswith(".csv.gz")
157
158
159			class DocumentList(DocumentCorpus):
160			"""A document corpus based on a list of other iterable of Document
161			objects"""
162
163			def __init__(self, documents):
164			self._documents = documents
165
166			@property
167			def documents(self):
168			yield from self._documents
169
170
171			class TransformingDocumentCorpus(DocumentCorpus):
172			"""A document corpus that wraps another document corpus but transforms the
173			documents using a given transform function"""
174
175			def __init__(self, corpus, transform_fn):
176			self._orig_corpus = corpus
177			self._transform_fn = transform_fn
178
179			@property
180			def documents(self):
181			for doc in self._orig_corpus.documents:
182			yield self._transform_fn(doc)
183
184
185			class LimitingDocumentCorpus(DocumentCorpus):
186			"""A document corpus that wraps another document corpus but limits the
187			number of documents to a given limit"""
188
189			def __init__(self, corpus, docs_limit):
190			self._orig_corpus = corpus
191			self.docs_limit = docs_limit
192
193			@property
194			def documents(self):
195			for doc in islice(self._orig_corpus.documents, self.docs_limit):
196			yield doc
197

NatLibFi / Annif

Push — issue868-json-corpus-format ( c8699d )

DocumentDirectory.documents() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like