Passed
Push — issue735-subject-filtering ( 5dfc00...88cdf5 )
by Osma
06:53 queued 03:34
created

annif.vocab.SubjectIndexFilter.languages()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
from __future__ import annotations
4
5
import abc
6
import csv
7
import os.path
8
from typing import TYPE_CHECKING
9
10
import annif
11
import annif.corpus
12
import annif.util
13
from annif.datadir import DatadirMixin
14
from annif.exception import NotInitializedException
15
16
if TYPE_CHECKING:
17
    from rdflib.graph import Graph
18
19
    from annif.corpus.skos import SubjectFileSKOS
20
    from annif.corpus.subject import Subject, SubjectCorpus
21
22
23
# module-level logger for this file: reuse the logger object exposed by the
# top-level annif package
logger = annif.logger
24
# install an annif.util.DuplicateFilter instance on the module logger
# NOTE(review): presumably this suppresses repeated identical log messages -
# confirm against annif.util.DuplicateFilter
logger.addFilter(annif.util.DuplicateFilter())
25
26
27
class AnnifVocabulary(DatadirMixin):
    """A subject vocabulary that can be shared by several Annif projects.

    The subject index (CSV) and the SKOS representation (Turtle file plus a
    graph dump) are read lazily from the vocabulary's data directory and
    cached on the instance."""

    # class-level fallback: no subject index has been loaded or built yet
    _subjects = None

    # file names used inside the vocabulary data directory
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    def __init__(self, vocab_id: str, datadir: str) -> None:
        """Set up the vocabulary identified by vocab_id below datadir."""
        DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
        self.vocab_id = vocab_id
        self._skos_vocab = None

    def _create_subject_index(self, subject_corpus: SubjectCorpus) -> SubjectIndex:
        """Build a fresh subject index from the corpus, save it as the CSV
        index file and return it."""
        index = SubjectIndexFile()
        index.load_subjects(subject_corpus)
        annif.util.atomic_save(index, self.datadir, self.INDEX_FILENAME_CSV)
        return index

    def _update_subject_index(self, subject_corpus: SubjectCorpus) -> SubjectIndex:
        """Merge the corpus into the existing subject index, save the result
        as the CSV index file and return it.

        Subjects already in the index keep their position (and therefore
        their subject ID); subjects only present in the corpus are appended
        after them."""
        previous = self.subjects
        incoming = SubjectIndexFile()
        incoming.load_subjects(subject_corpus)
        merged = SubjectIndexFile()

        # first pass: walk the old index in order so positions are preserved
        for known in previous:
            if not incoming.contains_uri(known.uri):
                # subject removed from new corpus: keep a placeholder that
                # has the URI but neither labels nor notation
                replacement = annif.corpus.Subject(
                    uri=known.uri, labels=None, notation=None
                )
            else:
                replacement = incoming[incoming.by_uri(known.uri)]
            merged.append(replacement)

        # second pass: subjects the old index did not know go to the end
        for candidate in incoming:
            if previous.contains_uri(candidate.uri):
                continue
            merged.append(candidate)

        annif.util.atomic_save(merged, self.datadir, self.INDEX_FILENAME_CSV)
        return merged

    @property
    def subjects(self) -> SubjectIndex:
        """return the subject index, loading it from the CSV index file on
        first access; raises NotInitializedException if that file is missing"""
        if self._subjects is not None:
            return self._subjects

        path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        if not os.path.exists(path):
            raise NotInitializedException("subject file {} not found".format(path))

        logger.debug("loading subjects from %s", path)
        self._subjects = SubjectIndexFile.load(path)
        return self._subjects

    @property
    def skos(self) -> SubjectFileSKOS:
        """return the subject vocabulary from SKOS file"""
        cached = self._skos_vocab
        if cached is not None:
            return cached

        # preferred source: the graph dump file, if there is one
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f"loading graph dump from {dumppath}")
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug("could not load graph dump, using turtle file")
            else:
                return self._skos_vocab

        # no usable dump: fall back to parsing the Turtle file
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if not os.path.exists(path):
            raise NotInitializedException(f"graph file {path} not found")

        logger.debug(f"loading graph from {path}")
        self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
        # save_skos is handed the Turtle path; the intent is to store the
        # dump file so it can be used next time
        self._skos_vocab.save_skos(path)
        return self._skos_vocab

    def __len__(self) -> int:
        """return the length of the subject index"""
        return len(self.subjects)

    @property
    def languages(self) -> list[str]:
        """return the languages of the subject index, or an empty list if the
        subject index has not been initialized"""
        try:
            return self.subjects.languages
        except NotInitializedException:
            return []

    def load_vocabulary(
        self,
        subject_corpus: SubjectCorpus,
        force: bool = False,
    ) -> None:
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely."""

        index_path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        if force or not os.path.exists(index_path):
            logger.info("creating subject index")
            self._subjects = self._create_subject_index(subject_corpus)
        else:
            logger.info("updating existing subject index")
            self._subjects = self._update_subject_index(subject_corpus)

        skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        logger.info(f"saving vocabulary into SKOS file {skosfile}")
        subject_corpus.save_skos(skosfile)

    def as_graph(self) -> Graph:
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph

    def dump(self) -> dict[str, str | list | int | bool]:
        """return this vocabulary as a dict"""

        try:
            languages = sorted(self.languages)
            size = len(self)
        except NotInitializedException:
            # subject index not available: report the vocabulary as unloaded
            languages, size, loaded = [], None, False
        else:
            loaded = True

        return {
            "vocab_id": self.vocab_id,
            "languages": languages,
            "size": size,
            "loaded": loaded,
        }
164
165
166
class SubjectIndex(metaclass=abc.ABCMeta):
    """Base class for an index that remembers the associations between
    integer subject IDs and their URIs and labels."""

    @abc.abstractmethod
    def __len__(self) -> int:
        """return the number of subject IDs in the index"""

    @property
    @abc.abstractmethod
    def languages(self) -> list[str] | None:
        """return the label languages of the index, or None if they have not
        been determined"""

    @abc.abstractmethod
    def __getitem__(self, subject_id: int) -> Subject:
        """return the subject stored under the given subject ID"""

    @abc.abstractmethod
    def contains_uri(self, uri: str) -> bool:
        """return True if the index has a subject with the given URI"""

    @abc.abstractmethod
    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
        """return the subject ID of a subject by its URI, or None if not found.
        If warnings=True, log a warning message if the URI cannot be found."""

    @abc.abstractmethod
    def by_label(self, label: str | None, language: str) -> int | None:
        """return the subject ID of a subject by its label in a given
        language"""

    # declared as a property so that the abstract interface matches the
    # concrete implementations, which are read as `index.active`
    @property
    @abc.abstractmethod
    def active(self) -> list[tuple[int, Subject]]:
        """return a list of (subject_id, Subject) tuples of all subjects that
        are available for use"""
204
205
206
class SubjectIndexFile(SubjectIndex):
    """SubjectIndex implementation backed by a file."""

    def __init__(self) -> None:
        """Create an empty index."""
        self._subjects = []  # Subject objects; list position is the subject ID
        self._uri_idx = {}  # URI -> subject ID
        self._label_idx = {}  # (label, language) -> subject ID
        self._languages = None  # stays None until load_subjects()/append() sets it

    def load_subjects(self, corpus: SubjectCorpus) -> None:
        """Initialize the subject index from a subject corpus"""

        self._languages = corpus.languages
        for subject in corpus.subjects:
            self.append(subject)

    def __len__(self) -> int:
        """return the number of subjects stored, including label-less ones"""
        return len(self._subjects)

    @property
    def languages(self) -> list[str] | None:
        """return the label languages, or None if none have been set yet"""
        return self._languages

    def __getitem__(self, subject_id: int) -> Subject:
        """return the subject stored at position subject_id"""
        return self._subjects[subject_id]

    def append(self, subject: Subject) -> None:
        """Add a subject at the end of the index; its subject ID is the
        index length before the call.

        If no languages are known yet, they are taken from the labels of the
        first subject that has any. An existing URI or (label, language)
        entry is overwritten so that it points to the newly added subject."""
        if self._languages is None and subject.labels is not None:
            self._languages = list(subject.labels.keys())

        subject_id = len(self._subjects)
        self._uri_idx[subject.uri] = subject_id
        if subject.labels:
            for lang, label in subject.labels.items():
                self._label_idx[(label, lang)] = subject_id
        self._subjects.append(subject)

    def contains_uri(self, uri: str) -> bool:
        """return True if a subject with the given URI has been added"""
        return uri in self._uri_idx

    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
        """return the subject ID of a subject by its URI, or None if not found.
        If warnings=True, log a warning message if the URI cannot be found."""
        try:
            return self._uri_idx[uri]
        except KeyError:
            if warnings:
                logger.warning("Unknown subject URI <%s>", uri)
            return None

    def by_label(self, label: str | None, language: str) -> int | None:
        """return the subject ID of a subject by its label in a given
        language, or None (after logging a warning) if there is no match"""
        try:
            return self._label_idx[(label, language)]
        except KeyError:
            logger.warning('Unknown subject label "%s"@%s', label, language)
            return None

    @property
    def active(self) -> list[tuple[int, Subject]]:
        """return a list of (subject_id, Subject) tuples of all subjects
        whose labels are not None"""
        return [
            (subj_id, subject)
            for subj_id, subject in enumerate(self._subjects)
            if subject.labels is not None
        ]

    def save(self, path: str) -> None:
        """Save this subject index into a file with the given path name.

        The file is CSV with the columns "uri" and "notation" followed by
        one "label_<lang>" column per language of the index."""

        # an index that never saw a labelled subject has no languages (None);
        # write it with just the fixed columns instead of failing
        languages = [] if self._languages is None else self._languages
        fieldnames = ["uri", "notation"] + [f"label_{lang}" for lang in languages]

        with open(path, "w", encoding="utf-8", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for subject in self:
                # a missing notation is written as an empty cell
                row = {"uri": subject.uri, "notation": subject.notation or ""}
                if subject.labels:
                    for lang, label in subject.labels.items():
                        row[f"label_{lang}"] = label
                writer.writerow(row)

    @classmethod
    def load(cls, path: str) -> SubjectIndex:
        """Load a subject index from a CSV file and return it."""

        corpus = annif.corpus.SubjectFileCSV(path)
        subject_index = cls()
        subject_index.load_subjects(corpus)
        return subject_index
292
293
294
class SubjectIndexFilter(SubjectIndex):
    """SubjectIndex implementation that filters another SubjectIndex based
    on a list of subject URIs to exclude.

    Subject IDs are those of the wrapped index; excluded subjects are hidden
    rather than removed, so the length is unchanged."""

    def __init__(self, subject_index: SubjectIndex, exclude: list[str]) -> None:
        """Wrap subject_index, hiding every subject whose URI is in exclude."""
        self._subject_index = subject_index
        self._exclude = set(exclude)  # set for fast membership tests

    def __len__(self) -> int:
        """return the length of the wrapped index (excluded subjects are
        still counted)"""
        return len(self._subject_index)

    @property
    def languages(self) -> list[str] | None:
        """return the languages of the wrapped index"""
        return self._subject_index.languages

    def __getitem__(self, subject_id: int) -> Subject | None:
        """return the subject with the given ID from the wrapped index, or
        None if that subject is excluded"""
        subject = self._subject_index[subject_id]
        if subject and subject.uri not in self._exclude:
            return subject
        return None

    def contains_uri(self, uri: str) -> bool:
        """return True if the URI is in the wrapped index and not excluded"""
        if uri in self._exclude:
            return False
        return self._subject_index.contains_uri(uri)

    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
        """return the subject ID of a subject by its URI, or None if not found.
        If warnings=True, log a warning message if the URI cannot be found."""

        if uri in self._exclude:
            return None
        return self._subject_index.by_uri(uri, warnings)

    def by_label(self, label: str | None, language: str) -> int | None:
        """return the subject ID of a subject by its label in a given
        language, or None if the label is unknown or the subject is excluded"""

        subject_id = self._subject_index.by_label(label, language)
        if subject_id is None:
            # unknown label: there is no subject to fetch, and indexing the
            # wrapped index with None would raise instead of returning None
            return None
        subject = self._subject_index[subject_id]
        if subject is not None and subject.uri not in self._exclude:
            return subject_id
        return None

    @property
    def active(self) -> list[tuple[int, Subject]]:
        """return a list of (subject_id, Subject) tuples of all subjects that
        are available for use"""

        return [
            (subject_id, subject)
            for subject_id, subject in self._subject_index.active
            if subject.uri not in self._exclude
        ]
348