annif.corpus.subject.SubjectSet.from_string() - Code Metrics - Inspection of "WIP: Subject filtering" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — main (#840)

by Osma

created 2025-03-26 08:20 UTC

annif.corpus.subject.SubjectSet.from_string() A

↳ Parent: annif.corpus.subject

Complexity

Conditions

Size

Total Lines	12
Code Lines	10

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	10
nop	4
dl	0
loc	12
rs	9.9
c	0
b	0
f	0

"""Classes for supporting subject corpora expressed as directories or files"""

from __future__ import annotations

import csv
import os.path
from typing import TYPE_CHECKING, Any

import annif
import annif.util

from .skos import serialize_subjects_to_skos
from .types import Subject, SubjectCorpus

if TYPE_CHECKING:
    from collections.abc import Generator, Iterator

    import numpy as np

    from annif.vocab import SubjectIndex


# Module-level logger: a child of the package-wide annif logger, named "subject".
logger = annif.logger.getChild("subject")
# NOTE(review): DuplicateFilter presumably suppresses repeated log messages --
# confirm against its definition in annif.util.
logger.addFilter(annif.util.DuplicateFilter())


class SubjectFileTSV(SubjectCorpus):
    """Subject vocabulary in a single language, read from a TSV file.

    Each line holds up to three tab-separated columns: the subject URI, an
    optional label (in the configured language) and an optional notation.
    """

    def __init__(self, path: str, language: str) -> None:
        """Remember the location of the TSV file and the language its
        labels are written in. The file is not opened here."""

        self.language = language
        self.path = path

    def _parse_line(self, line: str) -> Iterator[Subject]:
        """Turn one line of the TSV file into a single Subject."""
        fields = line.strip().split("\t", 2)
        # pad missing trailing columns with None so that unpacking always
        # gets exactly three values
        raw_uri, label, notation = (fields + [None, None])[:3]
        yield Subject(
            uri=annif.util.cleanup_uri(raw_uri),
            # an absent or empty label means the subject has no labels at all
            labels={self.language: label} if label else None,
            notation=notation,
        )

    @property
    def languages(self) -> list[str]:
        """The one language of this vocabulary, wrapped in a list."""
        return [self.language]

    @property
    def subjects(self) -> Generator:
        """Lazily yield a Subject for every line of the TSV file."""
        # "utf-8-sig" transparently drops a leading byte order mark, if any
        with open(self.path, encoding="utf-8-sig") as tsv_file:
            for raw_line in tsv_file:
                yield from self._parse_line(raw_line)

    def save_skos(self, path: str) -> None:
        """Write the subjects of this vocabulary as a SKOS/Turtle file
        at the given path."""
        all_subjects = self.subjects
        serialize_subjects_to_skos(all_subjects, path)


class SubjectFileCSV(SubjectCorpus):
    """A multilingual subject vocabulary stored in a CSV file.

    The first row of the file is a header. The code reads a "uri" column,
    any number of "label_<language>" columns and an optional "notation"
    column.
    """

    def __init__(self, path: str) -> None:
        """initialize the SubjectFileCSV given a path to a CSV file"""
        self.path = path

    def _parse_row(self, row: dict[str, str]) -> Iterator[Subject]:
        """Convert one CSV row (as produced by csv.DictReader) into a
        single Subject."""
        # map language code -> label; empty cells become None
        labels = {
            fname.replace("label_", ""): value or None
            for fname, value in row.items()
            if fname.startswith("label_")
        }

        # if there are no labels in any language, set labels to None
        # indicating a deprecated subject
        if set(labels.values()) == {None}:
            labels = None

        yield Subject(
            uri=annif.util.cleanup_uri(row["uri"]),
            labels=labels,
            # a missing column and an empty cell are both treated as None
            notation=row.get("notation", None) or None,
        )

    @property
    def languages(self) -> list[str]:
        """Return the language codes found in the "label_<language>" column
        names of the header row. An empty file yields an empty list."""
        # infer the supported languages from the CSV column names
        # (newline="" is what the csv module expects of its input files)
        with open(self.path, encoding="utf-8-sig", newline="") as csvfile:
            reader = csv.reader(csvfile)
            # an empty file has no header row: fall back to no columns
            # instead of failing when iterating over None
            fieldnames = next(reader, [])

        return [
            fname.replace("label_", "")
            for fname in fieldnames
            if fname.startswith("label_")
        ]

    @property
    def subjects(self) -> Generator:
        """Lazily yield a Subject for every data row of the CSV file."""
        # newline="" lets the csv module handle line endings itself, so that
        # quoted fields containing line breaks are parsed correctly
        with open(self.path, encoding="utf-8-sig", newline="") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield from self._parse_row(row)

    def save_skos(self, path: str) -> None:
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name."""
        serialize_subjects_to_skos(self.subjects, path)

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file, i.e. its
        extension is ".csv" (compared case-insensitively)"""

        return os.path.splitext(path)[1].lower() == ".csv"


class SubjectSet:
    """Represents a set of subjects for a document.

    The subject IDs are kept de-duplicated in a list so that they can be
    indexed, but the collection is conceptually unordered: two SubjectSets
    holding the same IDs are equal regardless of the order they are stored
    in.
    """

    def __init__(self, subject_ids: Any | None = None) -> None:
        """Create a SubjectSet and optionally initialize it from an iterable
        of subject IDs. None values in the iterable are ignored."""

        if subject_ids:
            # use set comprehension to eliminate possible duplicates
            self._subject_ids = list(
                {subject_id for subject_id in subject_ids if subject_id is not None}
            )
        else:
            self._subject_ids = []

    def __len__(self) -> int:
        """Return the number of distinct subject IDs in the set."""
        return len(self._subject_ids)

    def __getitem__(self, idx: int) -> int:
        """Return the subject ID stored at position idx. The order of the
        stored IDs is arbitrary."""
        return self._subject_ids[idx]

    def __bool__(self) -> bool:
        """Return True if the set contains at least one subject ID."""
        return bool(self._subject_ids)

    def __eq__(self, other: Any) -> bool:
        """Return True if other is a SubjectSet with exactly the same
        subject IDs; any other kind of object is never equal."""
        if isinstance(other, SubjectSet):
            # Compare as sets: the order of the internal lists depends on
            # the order in which the IDs were originally given, which must
            # not affect equality.
            return set(self._subject_ids) == set(other._subject_ids)

        return False

    @classmethod
    def from_string(
        cls, subj_data: str, subject_index: SubjectIndex, language: str
    ) -> SubjectSet:
        """Create a SubjectSet from text with one subject per line.

        A line that contains a URI in angle brackets is resolved with
        subject_index.by_uri(); any other line (including one with no
        usable value, for which the label is None) is resolved with
        subject_index.by_label() using the given language. Lookups that
        return None are dropped when the SubjectSet is constructed."""
        subject_ids = set()
        for line in subj_data.splitlines():
            uri, label = cls._parse_line(line)
            if uri is not None:
                subject_ids.add(subject_index.by_uri(uri))
            else:
                subject_ids.add(subject_index.by_label(label, language))
        return cls(subject_ids)

    @staticmethod
    def _parse_line(
        line: str,
    ) -> tuple[str | None, str | None]:
        """Extract a (uri, label) pair from one tab-separated line.

        Values are stripped of surrounding whitespace and empty values are
        skipped. A value of the form "<...>" is taken as the URI (without
        the brackets); the first other value is taken as the label, and
        nothing after it is examined. Either element may be None."""
        uri = label = None
        vals = line.split("\t")
        for val in vals:
            val = val.strip()
            if val == "":
                continue
            if val.startswith("<") and val.endswith(">"):  # URI
                uri = val[1:-1]
                continue
            label = val
            break
        return uri, label

    def as_vector(
        self, size: int | None = None, destination: np.ndarray | None = None
    ) -> np.ndarray:
        """Return the hits as a one-dimensional NumPy array in sklearn
        multilabel indicator format. Use destination array if given (not
        None), otherwise create and return a new one of the given size.

        When destination is given it is modified in place and returned;
        size is then not used."""

        if destination is None:
            # imported here because the module-level numpy import is only
            # performed for type checking
            import numpy as np

            assert size is not None and size > 0
            destination = np.zeros(size, dtype=bool)

        # set the positions of all subject IDs in one indexing operation
        destination[self._subject_ids] = True

        return destination


1			"""Classes for supporting subject corpora expressed as directories or files"""
2
3			from __future__ import annotations
4
5			import csv
6			import os.path
7			from typing import TYPE_CHECKING, Any
8
9			import annif
10			import annif.util
11
12			from .skos import serialize_subjects_to_skos
13			from .types import Subject, SubjectCorpus
14
15			if TYPE_CHECKING:
16			from collections.abc import Generator, Iterator
17
18			import numpy as np
19
20			from annif.vocab import SubjectIndex
21
22
23			logger = annif.logger.getChild("subject")
24			logger.addFilter(annif.util.DuplicateFilter())
25
26
27			class SubjectFileTSV(SubjectCorpus):
28			"""A monolingual subject vocabulary stored in a TSV file."""
29
30			def __init__(self, path: str, language: str) -> None:
31			"""initialize the SubjectFileTSV given a path to a TSV file and the
32			language of the vocabulary"""
33
34			self.path = path
35			self.language = language
36
37			def _parse_line(self, line: str) -> Iterator[Subject]:
38			vals = line.strip().split("\t", 2)
39			clean_uri = annif.util.cleanup_uri(vals[0])
40			label = vals[1] if len(vals) >= 2 else None
41			labels = {self.language: label} if label else None
42			notation = vals[2] if len(vals) >= 3 else None
43			yield Subject(uri=clean_uri, labels=labels, notation=notation)
44
45			@property
46			def languages(self) -> list[str]:
47			return [self.language]
48
49			@property
50			def subjects(self) -> Generator:
51			with open(self.path, encoding="utf-8-sig") as subjfile:
52			for line in subjfile:
53			yield from self._parse_line(line)
54
55			def save_skos(self, path: str) -> None:
56			"""Save the contents of the subject vocabulary into a SKOS/Turtle
57			file with the given path name."""
58			serialize_subjects_to_skos(self.subjects, path)
59
60
61			class SubjectFileCSV(SubjectCorpus):
62			"""A multilingual subject vocabulary stored in a CSV file."""
63
64			def __init__(self, path: str) -> None:
65			"""initialize the SubjectFileCSV given a path to a CSV file"""
66			self.path = path
67
68			def _parse_row(self, row: dict[str, str]) -> Iterator[Subject]:
69			labels = {
70			fname.replace("label_", ""): value or None
71			for fname, value in row.items()
72			if fname.startswith("label_")
73			}
74
75			# if there are no labels in any language, set labels to None
76			# indicating a deprecated subject
77			if set(labels.values()) == {None}:
78			labels = None
79
80			yield Subject(
81			uri=annif.util.cleanup_uri(row["uri"]),
82			labels=labels,
83			notation=row.get("notation", None) or None,
84			)
85
86			@property
87			def languages(self) -> list[str]:
88			# infer the supported languages from the CSV column names
89			with open(self.path, encoding="utf-8-sig") as csvfile:
90			reader = csv.reader(csvfile)
91			fieldnames = next(reader, None)
92
93			return [
94			fname.replace("label_", "")
95			for fname in fieldnames
96			if fname.startswith("label_")
97			]
98
99			@property
100			def subjects(self) -> Generator:
101			with open(self.path, encoding="utf-8-sig") as csvfile:
102			reader = csv.DictReader(csvfile)
103			for row in reader:
104			yield from self._parse_row(row)
105
106			def save_skos(self, path: str) -> None:
107			"""Save the contents of the subject vocabulary into a SKOS/Turtle
108			file with the given path name."""
109			serialize_subjects_to_skos(self.subjects, path)
110
111			@staticmethod
112			def is_csv_file(path: str) -> bool:
113			"""return True if the path looks like a CSV file"""
114
115			return os.path.splitext(path)[1].lower() == ".csv"
116
117
118			class SubjectSet:
119			"""Represents a set of subjects for a document."""
120
121			def __init__(self, subject_ids: Any \| None = None) -> None:
122			"""Create a SubjectSet and optionally initialize it from an iterable
123			of subject IDs"""
124
125			if subject_ids:
126			# use set comprehension to eliminate possible duplicates
127			self._subject_ids = list(
128			{subject_id for subject_id in subject_ids if subject_id is not None}
129			)
130			else:
131			self._subject_ids = []
132
133			def __len__(self) -> int:
134			return len(self._subject_ids)
135
136			def __getitem__(self, idx: int) -> int:
137			return self._subject_ids[idx]
138
139			def __bool__(self) -> bool:
140			return bool(self._subject_ids)
141
142			def __eq__(self, other: Any) -> bool:
143			if isinstance(other, SubjectSet):
144			return self._subject_ids == other._subject_ids
145
146			return False
147
148			@classmethod
149			def from_string(
150			cls, subj_data: str, subject_index: SubjectIndex, language: str
151			) -> SubjectSet:
152			subject_ids = set()
153			for line in subj_data.splitlines():
154			uri, label = cls._parse_line(line)
155			if uri is not None:
156			subject_ids.add(subject_index.by_uri(uri))
157			else:
158			subject_ids.add(subject_index.by_label(label, language))
159			return cls(subject_ids)
160
161			@staticmethod
162			def _parse_line(
163			line: str,
164			) -> tuple[str \| None, str \| None]:
165			uri = label = None
166			vals = line.split("\t")
167			for val in vals:
168			val = val.strip()
169			if val == "":
170			continue
171			if val.startswith("<") and val.endswith(">"): # URI
172			uri = val[1:-1]
173			continue
174			label = val
175			break
176			return uri, label
177
178			def as_vector(
179			self, size: int \| None = None, destination: np.ndarray \| None = None
			0 ignored issues. Issue introduced 2023-05-23 12:04 UTC: The variable `np` does not seem to be defined in case `TYPE_CHECKING` on line `15` is `False`. Are you sure this can never be the case?
180			) -> np.ndarray:
181			"""Return the hits as a one-dimensional NumPy array in sklearn
182			multilabel indicator format. Use destination array if given (not
183			None), otherwise create and return a new one of the given size."""
184
185			if destination is None:
186			import numpy as np
187
188			assert size is not None and size > 0
189			destination = np.zeros(size, dtype=bool)
190
191			destination[list(self._subject_ids)] = True
192
193			return destination
194

NatLibFi / Annif

Pull Request — main (#840)

annif.corpus.subject.SubjectSet.from_string() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like