annif.corpus.subject - Code Metrics - Inspection of "WIP: Subject filtering" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — main (#840)

by Osma

created 2025-03-27 09:09 UTC

annif.corpus.subject A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	190
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	37
eloc	118
dl	0
loc	190
rs	9.44
c	0
b	0
f	0

19 Methods

Rating	Name	Size	Complexity
A	SubjectFileTSV.__init__()	6	1
A	SubjectFileTSV.languages()	3	1
A	SubjectFileCSV._parse_row()	16	2
A	SubjectFileCSV.__init__()	3	1
A	SubjectFileTSV.subjects()	5	3
A	SubjectFileCSV.is_csv_file()	5	1
A	SubjectFileTSV.save_skos()	4	1
A	SubjectFileCSV.languages()	11	2
A	SubjectFileCSV.subjects()	6	3
A	SubjectFileCSV.save_skos()	4	1
A	SubjectFileTSV._parse_line()	7	4
A	SubjectSet.__bool__()	2	1
A	SubjectSet.__len__()	2	1
A	SubjectSet.__eq__()	5	2
A	SubjectSet.as_vector()	16	2
A	SubjectSet.__getitem__()	2	1
A	SubjectSet.from_string()	12	3
A	SubjectSet.__init__()	11	2
A	SubjectSet._parse_line()	16	5

"""Classes for supporting subject corpora expressed as directories or files"""

from __future__ import annotations

import csv
import os.path
from typing import TYPE_CHECKING, Any

import annif
import annif.util

from .skos import serialize_subjects_to_skos
from .types import Subject, SubjectCorpus

if TYPE_CHECKING:
    from collections.abc import Generator, Iterator

    import numpy as np

    from annif.vocab import SubjectIndex


class SubjectFileTSV(SubjectCorpus):
    """Monolingual subject vocabulary read from a tab-separated file.

    Every line of the file describes one subject: a URI, optionally followed
    by a label and a notation, with tab characters between the fields.
    """

    def __init__(self, path: str, language: str) -> None:
        """Remember where the TSV file is and which language its labels
        are written in."""

        self.language = language
        self.path = path

    def _parse_line(self, line: str) -> Iterator[Subject]:
        """Yield the single Subject described by one line of the file."""
        # split into at most three fields; any further tabs stay in the last one
        fields = line.strip().split("\t", 2)
        # the label and the notation are optional: pad what is missing with None
        fields.extend([None] * (3 - len(fields)))
        raw_uri, label, notation = fields
        yield Subject(
            uri=annif.util.cleanup_uri(raw_uri),
            # a missing or empty label leaves the subject without any labels
            labels={self.language: label} if label else None,
            notation=notation,
        )

    @property
    def languages(self) -> list[str]:
        """The one language of this vocabulary, wrapped in a list."""
        return [self.language]

    @property
    def subjects(self) -> Generator:
        """Lazily yield one Subject for every line of the TSV file."""
        # "utf-8-sig" transparently drops a leading byte order mark, if any
        with open(self.path, encoding="utf-8-sig") as tsvfile:
            for raw_line in tsvfile:
                yield from self._parse_line(raw_line)

    def save_skos(self, path: str) -> None:
        """Write the subjects of this vocabulary to the file at the given
        path using serialize_subjects_to_skos."""
        serialize_subjects_to_skos(self.subjects, path)


class SubjectFileCSV(SubjectCorpus):
    """A multilingual subject vocabulary stored in a CSV file.

    The first row of the file names the columns. The "uri" column is
    required, "notation" is optional, and every column whose name starts
    with "label_" holds the labels for the language named by the rest of the
    column name (for example "label_en").
    """

    # prefix that marks a column as a label column
    _LABEL_PREFIX = "label_"

    def __init__(self, path: str) -> None:
        """initialize the SubjectFileCSV given a path to a CSV file"""
        self.path = path

    @classmethod
    def _label_language(cls, fname: Any) -> str | None:
        """Return the language of a label column, or None if the given column
        name does not denote a label column.

        csv.DictReader collects surplus fields of over-long rows under the
        key None, so the column name is not guaranteed to be a string.
        """
        if isinstance(fname, str) and fname.startswith(cls._LABEL_PREFIX):
            # strip only the leading prefix, not later occurrences of it
            return fname[len(cls._LABEL_PREFIX) :]
        return None

    def _parse_row(self, row: dict[str, str]) -> Iterator[Subject]:
        """Yield the single Subject described by one row of the CSV file.

        Raises KeyError if the row has no "uri" field.
        """
        labels = {}
        for fname, value in row.items():
            language = self._label_language(fname)
            if language is not None:
                # empty and missing cells both mean "no label in this language"
                labels[language] = value or None

        # if there are no labels in any language, set labels to None
        # indicating a deprecated subject
        if set(labels.values()) == {None}:
            labels = None

        yield Subject(
            uri=annif.util.cleanup_uri(row["uri"]),
            labels=labels,
            notation=row.get("notation", None) or None,
        )

    @property
    def languages(self) -> list[str]:
        """The languages of the vocabulary, inferred from the names of the
        label columns in the header row. An empty file has no languages."""
        with open(self.path, encoding="utf-8-sig") as csvfile:
            # an empty file has no header row; treat it as having no columns
            fieldnames = next(csv.reader(csvfile), [])

        languages = []
        for fname in fieldnames:
            language = self._label_language(fname)
            if language is not None:
                languages.append(language)
        return languages

    @property
    def subjects(self) -> Generator:
        """Lazily yield one Subject for every data row of the CSV file."""
        with open(self.path, encoding="utf-8-sig") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                yield from self._parse_row(row)

    def save_skos(self, path: str) -> None:
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name."""
        serialize_subjects_to_skos(self.subjects, path)

    @staticmethod
    def is_csv_file(path: str) -> bool:
        """return True if the path looks like a CSV file"""

        return os.path.splitext(path)[1].lower() == ".csv"


class SubjectSet:
    """Represents a set of subjects for a document.

    The subjects are stored as a duplicate-free list of subject IDs. The
    order of that list is an artifact of how the set was built and carries no
    meaning; in particular it is ignored when comparing two SubjectSets.
    """

    def __init__(self, subject_ids: Any | None = None) -> None:
        """Create a SubjectSet and optionally initialize it from an iterable
        of subject IDs. IDs that are None are dropped."""

        if subject_ids:
            # use set comprehension to eliminate possible duplicates
            self._subject_ids = list(
                {subject_id for subject_id in subject_ids if subject_id is not None}
            )
        else:
            self._subject_ids = []

    def __len__(self) -> int:
        """Return the number of distinct subject IDs in the set."""
        return len(self._subject_ids)

    def __getitem__(self, idx: int) -> int:
        """Return the subject ID stored at the given position."""
        return self._subject_ids[idx]

    def __bool__(self) -> bool:
        """Return True if the set contains at least one subject ID."""
        return bool(self._subject_ids)

    def __eq__(self, other: Any) -> bool:
        """Return True if other is a SubjectSet with the same subject IDs."""
        if isinstance(other, SubjectSet):
            # compare as sets: the order of the internal lists depends on how
            # the IDs happened to be inserted, so comparing the lists directly
            # could report equal sets as different
            return set(self._subject_ids) == set(other._subject_ids)

        return False

    @classmethod
    def from_string(
        cls, subj_data: str, subject_index: SubjectIndex, language: str
    ) -> SubjectSet:
        """Create a SubjectSet from text with one subject per line.

        A line that contains a URI in angle brackets is resolved with
        subject_index.by_uri(); otherwise its label is resolved with
        subject_index.by_label() in the given language. Lines with neither
        a URI nor a label (e.g. blank lines) are skipped.
        """
        subject_ids = set()
        for line in subj_data.splitlines():
            uri, label = cls._parse_line(line)
            if uri is not None:
                subject_ids.add(subject_index.by_uri(uri))
            elif label is not None:
                subject_ids.add(subject_index.by_label(label, language))
            # else: nothing to look up on this line
        return cls(subject_ids)

    @staticmethod
    def _parse_line(
        line: str,
    ) -> tuple[str | None, str | None]:
        """Extract the URI and/or the label from one tab-separated line.

        Returns a (uri, label) tuple in which either element may be None.
        Scanning stops at the first label, so a URI is only picked up when it
        comes before the label.
        """
        uri = label = None
        vals = line.split("\t")
        for val in vals:
            val = val.strip()
            if val == "":
                continue
            if val.startswith("<") and val.endswith(">"):  # URI
                uri = val[1:-1]
                continue
            label = val
            break
        return uri, label

    def as_vector(
        self, size: int | None = None, destination: np.ndarray | None = None
    ) -> np.ndarray:
        """Return the subjects as a one-dimensional NumPy array in sklearn
        multilabel indicator format. Use destination array if given (not
        None), otherwise create and return a new one of the given size.

        The positions given by the subject IDs are set to True; a given
        destination array is modified in place and returned."""

        if destination is None:
            # imported here so that NumPy is loaded only when it is needed
            import numpy as np

            assert size is not None and size > 0
            destination = np.zeros(size, dtype=bool)

        destination[self._subject_ids] = True

        return destination


1			"""Classes for supporting subject corpora expressed as directories or files"""
2
3			from __future__ import annotations
4
5			import csv
6			import os.path
7			from typing import TYPE_CHECKING, Any
8
9			import annif
10			import annif.util
11
12			from .skos import serialize_subjects_to_skos
13			from .types import Subject, SubjectCorpus
14
15			if TYPE_CHECKING:
16			from collections.abc import Generator, Iterator
17
18			import numpy as np
19
20			from annif.vocab import SubjectIndex
21
22
23			class SubjectFileTSV(SubjectCorpus):
24			"""A monolingual subject vocabulary stored in a TSV file."""
25
26			def __init__(self, path: str, language: str) -> None:
27			"""initialize the SubjectFileTSV given a path to a TSV file and the
28			language of the vocabulary"""
29
30			self.path = path
31			self.language = language
32
33			def _parse_line(self, line: str) -> Iterator[Subject]:
34			vals = line.strip().split("\t", 2)
35			clean_uri = annif.util.cleanup_uri(vals[0])
36			label = vals[1] if len(vals) >= 2 else None
37			labels = {self.language: label} if label else None
38			notation = vals[2] if len(vals) >= 3 else None
39			yield Subject(uri=clean_uri, labels=labels, notation=notation)
40
41			@property
42			def languages(self) -> list[str]:
43			return [self.language]
44
45			@property
46			def subjects(self) -> Generator:
47			with open(self.path, encoding="utf-8-sig") as subjfile:
48			for line in subjfile:
49			yield from self._parse_line(line)
50
51			def save_skos(self, path: str) -> None:
52			"""Save the contents of the subject vocabulary into a SKOS/Turtle
53			file with the given path name."""
54			serialize_subjects_to_skos(self.subjects, path)
55
56
57			class SubjectFileCSV(SubjectCorpus):
58			"""A multilingual subject vocabulary stored in a CSV file."""
59
60			def __init__(self, path: str) -> None:
61			"""initialize the SubjectFileCSV given a path to a CSV file"""
62			self.path = path
63
64			def _parse_row(self, row: dict[str, str]) -> Iterator[Subject]:
65			labels = {
66			fname.replace("label_", ""): value or None
67			for fname, value in row.items()
68			if fname.startswith("label_")
69			}
70
71			# if there are no labels in any language, set labels to None
72			# indicating a deprecated subject
73			if set(labels.values()) == {None}:
74			labels = None
75
76			yield Subject(
77			uri=annif.util.cleanup_uri(row["uri"]),
78			labels=labels,
79			notation=row.get("notation", None) or None,
80			)
81
82			@property
83			def languages(self) -> list[str]:
84			# infer the supported languages from the CSV column names
85			with open(self.path, encoding="utf-8-sig") as csvfile:
86			reader = csv.reader(csvfile)
87			fieldnames = next(reader, None)
88
89			return [
90			fname.replace("label_", "")
91			for fname in fieldnames
92			if fname.startswith("label_")
93			]
94
95			@property
96			def subjects(self) -> Generator:
97			with open(self.path, encoding="utf-8-sig") as csvfile:
98			reader = csv.DictReader(csvfile)
99			for row in reader:
100			yield from self._parse_row(row)
101
102			def save_skos(self, path: str) -> None:
103			"""Save the contents of the subject vocabulary into a SKOS/Turtle
104			file with the given path name."""
105			serialize_subjects_to_skos(self.subjects, path)
106
107			@staticmethod
108			def is_csv_file(path: str) -> bool:
109			"""return True if the path looks like a CSV file"""
110
111			return os.path.splitext(path)[1].lower() == ".csv"
112
113
114			class SubjectSet:
115			"""Represents a set of subjects for a document."""
116
117			def __init__(self, subject_ids: Any \| None = None) -> None:
118			"""Create a SubjectSet and optionally initialize it from an iterable
119			of subject IDs"""
120
121			if subject_ids:
122			# use set comprehension to eliminate possible duplicates
123			self._subject_ids = list(
124			{subject_id for subject_id in subject_ids if subject_id is not None}
125			)
126			else:
127			self._subject_ids = []
128
129			def __len__(self) -> int:
130			return len(self._subject_ids)
131
132			def __getitem__(self, idx: int) -> int:
133			return self._subject_ids[idx]
134
135			def __bool__(self) -> bool:
136			return bool(self._subject_ids)
137
138			def __eq__(self, other: Any) -> bool:
139			if isinstance(other, SubjectSet):
140			return self._subject_ids == other._subject_ids
141
142			return False
143
144			@classmethod
145			def from_string(
146			cls, subj_data: str, subject_index: SubjectIndex, language: str
147			) -> SubjectSet:
148			subject_ids = set()
149			for line in subj_data.splitlines():
150			uri, label = cls._parse_line(line)
151			if uri is not None:
152			subject_ids.add(subject_index.by_uri(uri))
153			else:
154			subject_ids.add(subject_index.by_label(label, language))
155			return cls(subject_ids)
156
157			@staticmethod
158			def _parse_line(
159			line: str,
160			) -> tuple[str \| None, str \| None]:
161			uri = label = None
162			vals = line.split("\t")
163			for val in vals:
164			val = val.strip()
165			if val == "":
166			continue
167			if val.startswith("<") and val.endswith(">"): # URI
168			uri = val[1:-1]
169			continue
170			label = val
171			break
172			return uri, label
173
174			def as_vector(
175			self, size: int \| None = None, destination: np.ndarray \| None = None
Issue (introduced 2023-05-23 12:04 UTC): The variable `np` does not seem to be defined in case `TYPE_CHECKING` on line `15` is `False`. Are you sure this can never be the case?
176			) -> np.ndarray:
177			"""Return the hits as a one-dimensional NumPy array in sklearn
178			multilabel indicator format. Use destination array if given (not
179			None), otherwise create and return a new one of the given size."""
180
181			if destination is None:
182			import numpy as np
183
184			assert size is not None and size > 0
185			destination = np.zeros(size, dtype=bool)
186
187			destination[list(self._subject_ids)] = True
188
189			return destination
190

NatLibFi / Annif

Pull Request — main (#840)

annif.corpus.subject A

Complexity

Size/Duplication

Importance

19 Methods

Duplication Side-by-Side

Filter issues like