annif.vocab.vocab   A
last analyzed

Complexity

Total Complexity 25

Size/Duplication

Total Lines 160
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 110
dl 0
loc 160
rs 10
c 0
b 0
f 0
wmc 25

10 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifVocabulary._update_subject_index() 0 17 5
A AnnifVocabulary.load_vocabulary() 0 21 3
A AnnifVocabulary.subjects() 0 10 3
A AnnifVocabulary.languages() 0 6 2
A AnnifVocabulary.as_graph() 0 3 1
B AnnifVocabulary.skos() 0 28 6
A AnnifVocabulary.__init__() 0 4 1
A AnnifVocabulary.dump() 0 17 2
A AnnifVocabulary._create_subject_index() 0 5 1
A AnnifVocabulary.__len__() 0 2 1
1
"""Vocabulary management functionality for Annif"""
2
3
from __future__ import annotations
4
5
import os.path
6
from typing import TYPE_CHECKING
7
8
import annif
9
import annif.corpus
10
import annif.util
11
from annif.datadir import DatadirMixin
12
from annif.exception import NotInitializedException
13
14
from .skos import VocabFileSKOS
15
from .subject_index import SubjectIndexFile
16
from .types import Subject, SubjectIndex, VocabSource
17
18
if TYPE_CHECKING:
19
    from rdflib.graph import Graph
20
21
22
# Module-wide logger: this is the logger object exposed by the annif package,
# not a new per-module logger.
logger = annif.logger
# NOTE(review): the bare number below is line-number residue from the page
# this file was extracted from; as an expression statement it has no effect.
23
# Attach annif.util.DuplicateFilter to the shared logger. Presumably this
# suppresses repeated identical messages -- confirm against annif.util.
logger.addFilter(annif.util.DuplicateFilter())
24
25
26
class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    The vocabulary is persisted under the data directory set up by
    DatadirMixin in up to three files: a CSV subject index
    (INDEX_FILENAME_CSV), a SKOS/Turtle file (INDEX_FILENAME_TTL) and a
    graph dump (INDEX_FILENAME_DUMP). Both the subject index and the SKOS
    vocabulary are loaded lazily on first access and cached on the instance.
    """

    # defaults for uninitialized instances
    _subjects = None

    # constants
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    def __init__(self, vocab_id: str, datadir: str) -> None:
        """Set up the vocabulary identified by vocab_id under datadir.

        Nothing is read from disk here; the subject index and the SKOS
        vocabulary are loaded on demand by the corresponding properties."""
        DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
        self.vocab_id = vocab_id
        self._skos_vocab = None

    def _create_subject_index(self, vocab_source: VocabSource) -> SubjectIndex:
        """Build a fresh subject index from vocab_source, save it as the CSV
        index file in the data directory and return it."""
        subjects = SubjectIndexFile()
        subjects.load_subjects(vocab_source)
        annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return subjects

    def _update_subject_index(self, vocab_source: VocabSource) -> SubjectIndex:
        """Merge the subjects of vocab_source into the existing subject index,
        save the result as the CSV index file and return it.

        Every subject of the existing index keeps its position in the
        updated index. Subjects that are missing from vocab_source are kept
        as placeholders carrying only their URI, and subjects that are new
        in vocab_source are appended at the end.

        Raises NotInitializedException (via the subjects property) if the
        existing index file cannot be found."""
        old_subjects = self.subjects
        new_subjects = SubjectIndexFile()
        new_subjects.load_subjects(vocab_source)
        updated_subjects = SubjectIndexFile()

        # first pass: walk the old index in order so positions are preserved
        for old_subject in old_subjects:
            if new_subjects.contains_uri(old_subject.uri):
                new_subject = new_subjects[new_subjects.by_uri(old_subject.uri)]
            else:  # subject removed from new corpus
                new_subject = Subject(uri=old_subject.uri, labels=None, notation=None)
            updated_subjects.append(new_subject)
        # second pass: append the subjects that did not exist before
        for new_subject in new_subjects:
            if not old_subjects.contains_uri(new_subject.uri):
                updated_subjects.append(new_subject)
        annif.util.atomic_save(updated_subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return updated_subjects

    @property
    def subjects(self) -> SubjectIndex:
        """return the subject index, loading it from the CSV index file on
        first access

        Raises NotInitializedException if the index file does not exist."""
        if self._subjects is None:
            path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
            if os.path.exists(path):
                logger.debug("loading subjects from %s", path)
                self._subjects = SubjectIndexFile.load(path)
            else:
                raise NotInitializedException(f"subject file {path} not found")
        return self._subjects

    @property
    def skos(self) -> VocabFileSKOS:
        """return the subject vocabulary from SKOS file

        The graph dump is tried first; if it is missing or cannot be loaded,
        the Turtle file is parsed instead. The result is cached on the
        instance. Raises NotInitializedException if the Turtle file is
        needed but does not exist."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f"loading graph dump from {dumppath}")
            try:
                self._skos_vocab = VocabFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug("could not load graph dump, using turtle file")
            else:
                return self._skos_vocab

        # graph dump file not found (or unusable) - parse ttl file instead
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if os.path.exists(path):
            logger.debug(f"loading graph from {path}")
            self._skos_vocab = VocabFileSKOS(path)
            # store the dump file so we can use it next time
            # NOTE(review): save_skos is handed the Turtle path here;
            # presumably it writes the dump alongside it -- confirm in
            # VocabFileSKOS.save_skos.
            self._skos_vocab.save_skos(path)
            return self._skos_vocab

        raise NotInitializedException(f"graph file {path} not found")

    def __len__(self) -> int:
        """return the number of entries in the subject index

        Raises NotInitializedException if the index has not been created."""
        return len(self.subjects)

    @property
    def languages(self) -> list[str]:
        """return the languages of the subject index, or an empty list if
        the subject index has not been created yet"""
        try:
            return self.subjects.languages
        except NotInitializedException:
            return []

    def load_vocabulary(
        self,
        vocab_source: VocabSource,
        force: bool = False,
    ) -> None:
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely."""

        if not force and os.path.exists(
            os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        ):
            logger.info("updating existing subject index")
            self._subjects = self._update_subject_index(vocab_source)
        else:
            logger.info("creating subject index")
            self._subjects = self._create_subject_index(vocab_source)

        skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        logger.info(f"saving vocabulary into SKOS file {skosfile}")
        vocab_source.save_skos(skosfile)
        # The SKOS file on disk has just been replaced: drop any previously
        # cached vocabulary so that skos / as_graph() reload the new data
        # instead of serving the old graph.
        self._skos_vocab = None

    def as_graph(self) -> Graph:
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph

    def dump(self) -> dict[str, str | list | int | bool | None]:
        """return this vocabulary as a dict

        The dict has the keys "vocab_id", "languages" (sorted), "size" and
        "loaded". When the subject index has not been created, "languages"
        is empty, "size" is None and "loaded" is False."""

        try:
            languages = sorted(self.languages)
            # len() is what raises for an uninitialized vocabulary; the
            # languages property swallows that condition and returns [].
            size = len(self)
            loaded = True
        except NotInitializedException:
            languages = []
            size = None
            loaded = False

        return {
            "vocab_id": self.vocab_id,
            "languages": languages,
            "size": size,
            "loaded": loaded,
        }
161