annif.vocab.subject_index   A
last analyzed

Complexity

Total Complexity 39

Size/Duplication

Total Lines 164
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 114
dl 0
loc 164
rs 9.28
c 0
b 0
f 0
wmc 39

20 Methods

Rating   Name   Duplication   Size   Complexity  
A SubjectIndexFilter.by_uri() 0 7 2
A SubjectIndexFilter.contains_uri() 0 4 2
A SubjectIndexFile.save() 0 14 5
A SubjectIndexFilter.by_label() 0 9 3
A SubjectIndexFilter.languages() 0 3 1
A SubjectIndexFile.by_label() 0 6 2
A SubjectIndexFilter.active() 0 9 1
A SubjectIndexFile.load() 0 8 1
A SubjectIndexFile.active() 0 6 1
A SubjectIndexFilter.__init__() 0 3 1
A SubjectIndexFilter.__len__() 0 2 1
A SubjectIndexFile.load_subjects() 0 6 2
A SubjectIndexFile.__getitem__() 0 5 2
A SubjectIndexFile.contains_uri() 0 2 1
A SubjectIndexFilter.__getitem__() 0 5 2
A SubjectIndexFile.languages() 0 3 1
A SubjectIndexFile.append() 0 10 5
A SubjectIndexFile.by_uri() 0 10 4
A SubjectIndexFile.__init__() 0 5 1
A SubjectIndexFile.__len__() 0 2 1
1
"""Subject index functionality for Annif"""
2
3
from __future__ import annotations
4
5
import csv
6
7
import annif
8
import annif.util
9
10
from .subject_file import VocabFileCSV
11
from .types import Subject, SubjectIndex, VocabSource
12
13
# Install the duplicate filter on the package-level logger first, then expose
# that same logger object under the module-level name used in this file.
annif.logger.addFilter(annif.util.DuplicateFilter())
logger = annif.logger
15
16
17
class SubjectIndexFile(SubjectIndex):
    """SubjectIndex implementation backed by a file.

    Subjects are stored in insertion order and a subject's position in that
    order is its integer subject ID. A subject whose ``labels`` attribute is
    None is treated as deprecated: it keeps its ID, but it is hidden from
    ``__getitem__``, ``by_uri`` and ``active``.
    """

    def __init__(self) -> None:
        """Create an empty subject index."""
        self._subjects: list[Subject] = []
        # URI -> subject ID
        self._uri_idx: dict[str, int] = {}
        # (label, language) -> subject ID
        self._label_idx: dict[tuple[str, str], int] = {}
        self._languages: list[str] | None = None

    def load_subjects(self, vocab_source: VocabSource) -> None:
        """Initialize the subject index from a subject corpus"""

        self._languages = vocab_source.languages
        for subject in vocab_source.subjects:
            self.append(subject)

    def __len__(self) -> int:
        """Return the number of stored subjects, deprecated ones included."""
        return len(self._subjects)

    @property
    def languages(self) -> list[str] | None:
        """Return the label languages of this index, or None if not yet known."""
        return self._languages

    def __getitem__(self, subject_id: int) -> Subject:
        """Return the subject with the given ID.

        Raises IndexError if the ID is out of range or if the subject is
        deprecated (has no labels)."""
        subject = self._subjects[subject_id]
        if subject.labels is None:
            raise IndexError(f"Subject is deprecated: {subject_id}")
        return subject

    def append(self, subject: Subject) -> None:
        """Add a subject to the end of the index and register its URI and labels.

        If the languages of the index are not yet known, they are taken from
        the labels of the first subject that has any. An already registered
        URI or (label, language) pair is re-pointed to the new subject."""
        if self._languages is None and subject.labels is not None:
            self._languages = list(subject.labels.keys())

        subject_id = len(self._subjects)
        self._uri_idx[subject.uri] = subject_id
        if subject.labels:
            for lang, label in subject.labels.items():
                self._label_idx[(label, lang)] = subject_id
        self._subjects.append(subject)

    def contains_uri(self, uri: str) -> bool:
        """Return True if a subject with the given URI has been added,
        regardless of whether it is deprecated."""
        return uri in self._uri_idx

    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
        """Return the subject ID of a subject by its URI, or None if the URI
        is unknown or the subject is deprecated.

        If warnings=True, log a warning message if the URI cannot be found."""
        # Keep the try body minimal: only the dict lookup can raise KeyError.
        try:
            subject_id = self._uri_idx[uri]
        except KeyError:
            if warnings:
                logger.warning("Unknown subject URI <%s>", uri)
            return None
        if self._subjects[subject_id].labels is None:  # deprecated
            return None
        return subject_id

    def by_label(self, label: str | None, language: str) -> int | None:
        """Return the subject ID of a subject by its label in a given
        language, or None (after logging a warning) if there is no match."""
        try:
            return self._label_idx[(label, language)]
        except KeyError:
            logger.warning('Unknown subject label "%s"@%s', label, language)
            return None

    @property
    def active(self) -> list[tuple[int, Subject]]:
        """Return a list of (subject_id, Subject) tuples of all subjects that
        are not deprecated."""
        return [
            (subj_id, subject)
            for subj_id, subject in enumerate(self._subjects)
            if subject.labels is not None
        ]

    def save(self, path: str) -> None:
        """Save this subject index into a file with the given path name.

        Every stored subject is written, one CSV row each, in subject ID
        order. Deprecated subjects are written with empty label columns so
        that row positions keep matching subject IDs."""

        # An index without any labelled subject has no known languages; write
        # only the uri/notation columns in that case instead of failing.
        languages = self._languages or []
        fieldnames = ["uri", "notation"] + [f"label_{lang}" for lang in languages]

        with open(path, "w", encoding="utf-8", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            # Iterate the underlying list, not `self`: __getitem__ raises
            # IndexError for deprecated subjects, and IndexError is what ends
            # iteration through the sequence protocol, so `for ... in self`
            # would silently stop at the first deprecated subject.
            for subject in self._subjects:
                row = {"uri": subject.uri, "notation": subject.notation or ""}
                if subject.labels:
                    for lang, label in subject.labels.items():
                        row[f"label_{lang}"] = label
                writer.writerow(row)

    @classmethod
    def load(cls, path: str) -> SubjectIndex:
        """Load a subject index from a CSV file and return it."""

        vocab_file = VocabFileCSV(path)
        subject_index = cls()
        subject_index.load_subjects(vocab_file)
        return subject_index
109
110
111
class SubjectIndexFilter(SubjectIndex):
    """SubjectIndex implementation that filters another SubjectIndex based
    on a set of subject URIs to exclude.

    Subject IDs are those of the wrapped index; excluded subjects are only
    hidden from lookups."""

    def __init__(self, subject_index: SubjectIndex, exclude: set[str]):
        """Wrap subject_index, hiding the subjects whose URI is in exclude."""
        self._subject_index = subject_index
        self._exclude = exclude

    def __len__(self) -> int:
        """Return the length of the wrapped index; excluded subjects are
        still counted."""
        return len(self._subject_index)

    @property
    def languages(self) -> list[str] | None:
        """Return the languages reported by the wrapped index."""
        return self._subject_index.languages

    def __getitem__(self, subject_id: int) -> Subject:
        """Return the subject with the given ID from the wrapped index.

        Raises IndexError if the subject's URI is excluded; any error raised
        by the wrapped index propagates unchanged."""
        subject = self._subject_index[subject_id]
        if subject.uri in self._exclude:
            raise IndexError(f"Subject is excluded: {subject.uri}")
        return subject

    def contains_uri(self, uri: str) -> bool:
        """Return True if the URI is not excluded and the wrapped index
        contains it."""
        if uri in self._exclude:
            return False
        return self._subject_index.contains_uri(uri)

    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
        """return the subject ID of a subject by its URI, or None if not found.
        If warnings=True, log a warning message if the URI cannot be found."""

        if uri in self._exclude:
            return None
        return self._subject_index.by_uri(uri, warnings)

    def by_label(self, label: str | None, language: str) -> int | None:
        """return the subject ID of a subject by its label in a given
        language, or None if the label is unknown or the subject is excluded"""

        subject_id = self._subject_index.by_label(label, language)
        # The wrapped index signals "no such label" with None; indexing with
        # None would raise instead of reporting the label as not found.
        if subject_id is None:
            return None
        subject = self._subject_index[subject_id]
        if subject.uri in self._exclude:
            return None
        return subject_id

    @property
    def active(self) -> list[tuple[int, Subject]]:
        """return a list of (subject_id, Subject) tuples of all subjects that
        are available for use"""

        return [
            (subject_id, subject)
            for subject_id, subject in self._subject_index.active
            if subject.uri not in self._exclude
        ]
165