Passed
Push — issue735-subject-filtering ( d4533d...f9dfa6 )
by Osma
03:38
created

annif.vocab.vocab.AnnifVocabulary.skos()   B

Complexity

Conditions 6

Size

Total Lines 28
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 19
nop 1
dl 0
loc 28
rs 8.5166
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
from __future__ import annotations
4
5
import os.path
6
from typing import TYPE_CHECKING
7
8
import annif
9
import annif.corpus
10
import annif.util
11
from annif.datadir import DatadirMixin
12
from annif.exception import NotInitializedException
13
14
from .subject_index import SubjectIndexFile
15
from .types import SubjectIndex
16
17
if TYPE_CHECKING:
18
    from rdflib.graph import Graph
19
20
    from annif.corpus.skos import SubjectFileSKOS
21
    from annif.corpus.subject import SubjectCorpus
22
23
24
# Reuse the logger object exposed by the top-level annif package rather than
# creating a module-specific one.
logger = annif.logger
25
# Attach a DuplicateFilter instance to that logger at import time.
# NOTE(review): presumably this suppresses repeated identical log messages --
# confirm against annif.util.DuplicateFilter.
logger.addFilter(annif.util.DuplicateFilter())
26
27
28
class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    All files are read from and written to ``self.datadir``.
    NOTE(review): ``datadir`` is presumably derived by DatadirMixin from the
    base data directory, the "vocabs" type name and the vocabulary id --
    confirm against annif.datadir.DatadirMixin.
    """

    # defaults for uninitialized instances
    _subjects: SubjectIndex | None = None
    _skos_vocab: SubjectFileSKOS | None = None

    # constants
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    def __init__(self, vocab_id: str, datadir: str) -> None:
        """Set up the vocabulary identified by vocab_id; nothing is loaded
        from disk until the subjects or skos properties are accessed."""
        DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
        self.vocab_id = vocab_id
        self._skos_vocab = None

    def _create_subject_index(self, subject_corpus: SubjectCorpus) -> SubjectIndex:
        """Build a new subject index from the given corpus, save it as the
        CSV index file and return it."""
        subjects = SubjectIndexFile()
        subjects.load_subjects(subject_corpus)
        annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return subjects

    def _update_subject_index(self, subject_corpus: SubjectCorpus) -> SubjectIndex:
        """Merge the given corpus into the existing subject index, save the
        result as the CSV index file and return it.

        Existing subjects keep their order. A subject that is missing from
        the new corpus is kept as an entry with only its URI (no labels or
        notation). Subjects that are new in the corpus are appended after
        all the existing ones.
        """
        old_subjects = self.subjects
        new_subjects = SubjectIndexFile()
        new_subjects.load_subjects(subject_corpus)
        updated_subjects = SubjectIndexFile()

        # first pass: walk the old index in order, refreshing or blanking
        # each entry
        for old_subject in old_subjects:
            if new_subjects.contains_uri(old_subject.uri):
                new_subject = new_subjects[new_subjects.by_uri(old_subject.uri)]
            else:  # subject removed from new corpus
                new_subject = annif.corpus.Subject(
                    uri=old_subject.uri, labels=None, notation=None
                )
            updated_subjects.append(new_subject)

        # second pass: append the subjects that did not exist before
        for new_subject in new_subjects:
            if not old_subjects.contains_uri(new_subject.uri):
                updated_subjects.append(new_subject)

        annif.util.atomic_save(updated_subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return updated_subjects

    @property
    def subjects(self) -> SubjectIndex:
        """return the subject index, loading it from the CSV index file on
        first access; raises NotInitializedException if the file is missing"""
        if self._subjects is None:
            path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
            if not os.path.exists(path):
                raise NotInitializedException(f"subject file {path} not found")
            logger.debug("loading subjects from %s", path)
            self._subjects = SubjectIndexFile.load(path)
        return self._subjects

    @property
    def skos(self) -> SubjectFileSKOS:
        """return the subject vocabulary from SKOS file

        The graph dump file is tried first and the Turtle file is used as a
        fallback. The result is cached on the instance. Raises
        NotInitializedException if the Turtle file is needed but missing.
        """
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f"loading graph dump from {dumppath}")
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug("could not load graph dump, using turtle file")
            else:
                return self._skos_vocab

        # no usable graph dump (missing or unloadable) - parse ttl file instead
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if not os.path.exists(path):
            raise NotInitializedException(f"graph file {path} not found")

        logger.debug(f"loading graph from {path}")
        self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
        # store the dump file so we can use it next time
        # NOTE(review): the Turtle path is passed here, not dumppath;
        # presumably save_skos() also writes the dump file next to it --
        # confirm against SubjectFileSKOS.save_skos before changing this.
        self._skos_vocab.save_skos(path)
        return self._skos_vocab

    def __len__(self) -> int:
        """return the number of entries in the subject index"""
        return len(self.subjects)

    @property
    def languages(self) -> list[str]:
        """return the languages of the subject index, or an empty list if
        the subject index has not been initialized"""
        try:
            return self.subjects.languages
        except NotInitializedException:
            return []

    def load_vocabulary(
        self,
        subject_corpus: SubjectCorpus,
        force: bool = False,
    ) -> None:
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely."""

        index_path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        if not force and os.path.exists(index_path):
            logger.info("updating existing subject index")
            self._subjects = self._update_subject_index(subject_corpus)
        else:
            logger.info("creating subject index")
            self._subjects = self._create_subject_index(subject_corpus)

        skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        logger.info(f"saving vocabulary into SKOS file {skosfile}")
        subject_corpus.save_skos(skosfile)

    def as_graph(self) -> Graph:
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph

    def dump(self) -> dict[str, str | list | int | bool | None]:
        """return this vocabulary as a dict

        When the subject index has not been initialized, "languages" is an
        empty list, "size" is None and "loaded" is False.
        """

        try:
            # sorted() already returns a new list
            languages = sorted(self.languages)
            # len() raises NotInitializedException when no index file exists
            size = len(self)
            loaded = True
        except NotInitializedException:
            languages = []
            size = None
            loaded = False

        return {
            "vocab_id": self.vocab_id,
            "languages": languages,
            "size": size,
            "loaded": loaded,
        }
165