annif.corpus.document.DocumentFileTSV._parse_tsv_line() - Code Metrics - Inspection of "Implement new CSV document corpus format with colu..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — issue813-flexible-fusion-csv-s... ( ca1bd6 )

by Osma

created 2025-07-24 07:52 UTC

DocumentFileTSV._parse_tsv_line() A

↳ Parent: annif.corpus.document

Complexity

Conditions

Size

Total Lines	10
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	8
nop	2
dl	0
loc	10
rs	10
c	0
b	0
f	0

"""Classes for supporting document corpora"""

from __future__ import annotations

import csv
import glob
import gzip
import os.path
import re
from itertools import islice
from typing import TYPE_CHECKING

import annif.util

from .types import Document, DocumentCorpus, SubjectSet

if TYPE_CHECKING:
    from collections.abc import Iterator

    from annif.corpus.subject import SubjectIndex

logger = annif.logger


class DocumentDirectory(DocumentCorpus):
    """Full text document corpus backed by the ``*.txt`` files of a directory.

    Every ``*.txt`` file directly inside ``path`` is one document. Its
    subjects, if any, come from a sibling file with the same name but a
    ``.tsv`` or ``.key`` extension instead of ``.txt``.
    """

    def __init__(
        self,
        path: str,
        subject_index: SubjectIndex | None = None,
        language: str | None = None,
        require_subjects: bool = False,
    ) -> None:
        self.path = path
        self.subject_index = subject_index
        self.language = language
        self.require_subjects = require_subjects

    def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]:
        """Yield (docfile, subjectfile) path tuples in sorted file name order.

        With require_subjects set to False every document file is yielded and
        the subjectfile is always None. With require_subjects set to True only
        documents that have an existing subject file are yielded; the others
        are silently skipped."""

        pattern = os.path.join(self.path, "*.txt")
        for docpath in sorted(glob.glob(pattern)):
            if not self.require_subjects:
                yield (docpath, None)
                continue
            # Try the extensions in priority order: .tsv wins over .key.
            for extension in (".tsv", ".key"):
                candidate = re.sub(r"\.txt$", extension, docpath)
                if os.path.exists(candidate):
                    yield (docpath, candidate)
                    break

    @property
    def documents(self) -> Iterator[Document]:
        """Yield one Document per (docfile, subjectfile) pair of this corpus.

        The document text is decoded as UTF-8 (a leading BOM is dropped and
        undecodable bytes are replaced). When there is no subject file the
        document gets subject_set=None; otherwise the content of the subject
        file is parsed with SubjectSet.from_string()."""

        for text_path, subjects_path in self:
            with open(text_path, errors="replace", encoding="utf-8-sig") as handle:
                body = handle.read()
            subject_set = None
            if subjects_path is not None:
                with open(subjects_path, encoding="utf-8-sig") as handle:
                    raw_subjects = handle.read()
                subject_set = SubjectSet.from_string(
                    raw_subjects, self.subject_index, self.language
                )
            yield Document(text=body, subject_set=subject_set)


class DocumentFileTSV(DocumentCorpus):
    """Corpus of documents with subjects, read from a TSV file.

    Each line holds the document text, a tab character and then the subject
    URIs separated by whitespace. A path ending in ".gz" is read through
    gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.subject_index = subject_index
        self.path = path

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for every valid line of the file, in file order."""
        opener = gzip.open if self.path.endswith(".gz") else open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as lines:
            for raw_line in lines:
                yield from self._parse_tsv_line(raw_line)

    def _parse_tsv_line(self, line: str) -> Iterator[Document]:
        """Yield the Document described by one TSV line.

        A line without a tab yields nothing; it is reported with a warning
        instead. Everything before the first tab is the text; the rest is
        split on whitespace into URIs, each of which is cleaned with
        annif.util.cleanup_uri() and looked up with subject_index.by_uri().
        """
        text, tab, uris = line.partition("\t")
        if not tab:
            logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
            return
        subject_ids = set()
        for uri in uris.split():
            subject_ids.add(self.subject_index.by_uri(annif.util.cleanup_uri(uri)))
        yield Document(text=text, subject_set=SubjectSet(subject_ids))


class DocumentFileCSV(DocumentCorpus):
    """A CSV file as a corpus of documents with subjects.

    The first row of the file is used as the header (csv.DictReader
    default). The columns "text" and "subject_uris" are read from every
    following row; "subject_uris" holds whitespace-separated URIs. A path
    ending in ".gz" (in any letter case) is read through gzip.
    """

    def __init__(self, path: str, subject_index: SubjectIndex) -> None:
        self.path = path
        self.subject_index = subject_index

    @property
    def documents(self) -> Iterator[Document]:
        """Yield a Document for every data row of the CSV file, in file order.

        Raises KeyError if the header lacks the "text" or "subject_uris"
        column.
        """
        # Compare case-insensitively so that every path accepted by
        # is_csv_file() (e.g. "DOCS.CSV.GZ") is actually decompressed.
        if self.path.lower().endswith(".gz"):
            opener = gzip.open
        else:
            opener = open
        with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield from self._parse_row(row)

    def _parse_row(self, row: dict[str, str | None]) -> Iterator[Document]:
        """Yield the Document described by one row of the CSV file.

        Each URI of the "subject_uris" column is cleaned with
        annif.util.cleanup_uri() and looked up with subject_index.by_uri().
        """
        # csv.DictReader fills the missing fields of a short row with None;
        # treat those as empty instead of failing on None.strip().
        uris = (row["subject_uris"] or "").split()
        subject_ids = {
            self.subject_index.by_uri(annif.util.cleanup_uri(uri)) for uri in uris
        }
        yield Document(text=row["text"] or "", subject_set=SubjectSet(subject_ids))

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """Return True if the path looks like a CSV file.

        The check is by extension only (".csv" or ".csv.gz") and ignores
        letter case.
        """
        return path.lower().endswith((".csv", ".csv.gz"))


class DocumentList(DocumentCorpus):
    """A document corpus based on a list or other iterable of Document
    objects"""

    def __init__(self, documents):
        # Stored as given, without copying; iterated again on every access
        # of the documents property.
        self._documents = documents

    @property
    def documents(self) -> Iterator[Document]:
        """Yield the items of the wrapped iterable, in its own order."""
        yield from self._documents


class TransformingDocumentCorpus(DocumentCorpus):
    """Wrapper corpus that passes the text of every document of another
    corpus through a transform function; subject sets are kept as they are"""

    def __init__(self, corpus, transform_fn):
        self._orig_corpus = corpus
        self._transform_fn = transform_fn

    @property
    def documents(self):
        """Yield a new Document with transformed text for each document of
        the wrapped corpus."""
        for original in self._orig_corpus.documents:
            new_text = self._transform_fn(original.text)
            yield Document(text=new_text, subject_set=original.subject_set)


class LimitingDocumentCorpus(DocumentCorpus):
    """Wrapper corpus that exposes at most docs_limit documents of another
    document corpus"""

    def __init__(self, corpus, docs_limit):
        self._orig_corpus = corpus
        self.docs_limit = docs_limit

    @property
    def documents(self):
        """Yield the first docs_limit documents of the wrapped corpus."""
        yield from islice(self._orig_corpus.documents, self.docs_limit)


1			"""Classes for supporting document corpora"""
2
3			from __future__ import annotations
4
5			import csv
6			import glob
7			import gzip
8			import os.path
9			import re
10			from itertools import islice
11			from typing import TYPE_CHECKING
12
13			import annif.util
14
15			from .types import Document, DocumentCorpus, SubjectSet
16
17			if TYPE_CHECKING:
18			from collections.abc import Iterator
19
20			from annif.corpus.subject import SubjectIndex
21
22			logger = annif.logger
23
24
25			class DocumentDirectory(DocumentCorpus):
26			"""A directory of files as a full text document corpus"""
27
28			def __init__(
29			self,
30			path: str,
31			subject_index: SubjectIndex | None = None,
32			language: str | None = None,
33			require_subjects: bool = False,
34			) -> None:
35			self.path = path
36			self.subject_index = subject_index
37			self.language = language
38			self.require_subjects = require_subjects
39
40			def __iter__(self) -> Iterator[tuple[str, str] | tuple[str, None]]:
41			"""Iterate through the directory, yielding tuples of (docfile,
42			subjectfile) containing file paths. If require_subjects is False, the
43			subjectfile will be returned as None."""
44
45			for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
46			if self.require_subjects:
47			tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
48			if os.path.exists(tsvfilename):
49			yield (filename, tsvfilename)
50			continue
51			keyfilename = re.sub(r"\.txt$", ".key", filename)
52			if os.path.exists(keyfilename):
53			yield (filename, keyfilename)
54			continue
55			else:
56			yield (filename, None)
57
58			@property
59			def documents(self) -> Iterator[Document]:
60			for docfilename, subjfilename in self:
61			with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
62			text = docfile.read()
63			if subjfilename is None:
64			yield Document(text=text, subject_set=None)
65			continue
66			with open(subjfilename, encoding="utf-8-sig") as subjfile:
67			subjects = SubjectSet.from_string(
68			subjfile.read(), self.subject_index, self.language
69			)
70			yield Document(text=text, subject_set=subjects)
71
72
73			class DocumentFileTSV(DocumentCorpus):
74			"""A TSV file as a corpus of documents with subjects"""
75
76			def __init__(self, path: str, subject_index: SubjectIndex) -> None:
77			self.path = path
78			self.subject_index = subject_index
79
80			@property
81			def documents(self) -> Iterator[Document]:
82			if self.path.endswith(".gz"):
83			opener = gzip.open
84			else:
85			opener = open
86			with opener(self.path, mode="rt", encoding="utf-8-sig") as tsvfile:
87			for line in tsvfile:
88			yield from self._parse_tsv_line(line)
89
90			def _parse_tsv_line(self, line: str) -> Iterator[Document]:
91			if "\t" in line:
92			text, uris = line.split("\t", maxsplit=1)
93			subject_ids = {
94			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
95			for uri in uris.split()
96			}
97			yield Document(text=text, subject_set=SubjectSet(subject_ids))
98			else:
99			logger.warning('Skipping invalid line (missing tab): "%s"', line.rstrip())
100
101
102			class DocumentFileCSV(DocumentCorpus):
103			"""A CSV file as a corpus of documents with subjects"""
104
105			def __init__(self, path: str, subject_index: SubjectIndex) -> None:
106			self.path = path
107			self.subject_index = subject_index
108
109			@property
110			def documents(self) -> Iterator[Document]:
111			if self.path.endswith(".gz"):
112			opener = gzip.open
113			else:
114			opener = open
115			with opener(self.path, mode="rt", encoding="utf-8-sig") as csvfile:
116			reader = csv.DictReader(csvfile)
117			for row in reader:
118			yield from self._parse_row(row)
119
120			def _parse_row(self, row: dict[str, str]) -> Iterator[Document]:
121			subject_ids = {
122			self.subject_index.by_uri(annif.util.cleanup_uri(uri))
123			for uri in row["subject_uris"].strip().split()
124			}
125			yield Document(text=row["text"], subject_set=SubjectSet(subject_ids))
126
127			@staticmethod
128			def is_csv_file(path: str) -> bool:
129			"""return True if the path looks like a CSV file"""
130
131			path_lc = path.lower()
132			return path_lc.endswith(".csv") or path_lc.endswith(".csv.gz")
133
134
135			class DocumentList(DocumentCorpus):
136			"""A document corpus based on a list of other iterable of Document
137			objects"""
138
139			def __init__(self, documents):
140			self._documents = documents
141
142			@property
143			def documents(self):
144			yield from self._documents
145
146
147			class TransformingDocumentCorpus(DocumentCorpus):
148			"""A document corpus that wraps another document corpus but transforms the
149			documents using a given transform function"""
150
151			def __init__(self, corpus, transform_fn):
152			self._orig_corpus = corpus
153			self._transform_fn = transform_fn
154
155			@property
156			def documents(self):
157			for doc in self._orig_corpus.documents:
158			yield Document(
159			text=self._transform_fn(doc.text), subject_set=doc.subject_set
160			)
161
162
163			class LimitingDocumentCorpus(DocumentCorpus):
164			"""A document corpus that wraps another document corpus but limits the
165			number of documents to a given limit"""
166
167			def __init__(self, corpus, docs_limit):
168			self._orig_corpus = corpus
169			self.docs_limit = docs_limit
170
171			@property
172			def documents(self):
173			for doc in islice(self._orig_corpus.documents, self.docs_limit):
174			yield doc
175

NatLibFi / Annif

Push — issue813-flexible-fusion-csv-s... ( ca1bd6 )

DocumentFileTSV._parse_tsv_line() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like