annif.vocab.vocab - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

annif.vocab.vocab A
last analyzed 2025-08-06 10:43 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	160
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	110
dl	0
loc	160
rs	10
c	0
b	0
f	0
wmc	25

10 Methods

Rating	Name	Size	Complexity
A	AnnifVocabulary._update_subject_index()	17	5
A	AnnifVocabulary.load_vocabulary()	21	3
A	AnnifVocabulary.subjects()	10	3
A	AnnifVocabulary.languages()	6	2
A	AnnifVocabulary.as_graph()	3	1
B	AnnifVocabulary.skos()	28	6
A	AnnifVocabulary.__init__()	4	1
A	AnnifVocabulary.dump()	17	2
A	AnnifVocabulary._create_subject_index()	5	1
A	AnnifVocabulary.__len__()	2	1

"""Vocabulary management functionality for Annif"""

from __future__ import annotations

import os.path
from typing import TYPE_CHECKING

import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException

from .skos import VocabFileSKOS
from .subject_index import SubjectIndexFile
from .types import Subject, SubjectIndex, VocabSource

if TYPE_CHECKING:
    from rdflib.graph import Graph


# Module-level logger: reuse the logger object exposed by the annif package.
logger = annif.logger
# Attach a DuplicateFilter to that logger at import time. Presumably this
# suppresses repeated identical log messages - confirm against
# annif.util.DuplicateFilter.
logger.addFilter(annif.util.DuplicateFilter())


class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    The vocabulary keeps its files under ``self.datadir``: a subject index
    (``INDEX_FILENAME_CSV``), a SKOS/Turtle file (``INDEX_FILENAME_TTL``) and
    a graph dump (``INDEX_FILENAME_DUMP``). Both the subject index and the
    SKOS vocabulary are loaded lazily on first access and then cached on the
    instance.
    """

    # defaults for uninitialized instances
    _subjects = None

    # constants: names of the files stored in the vocabulary data directory
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    def __init__(self, vocab_id: str, datadir: str) -> None:
        """Set up a vocabulary with the given identifier.

        Nothing is read from disk here; the subject index and the SKOS
        vocabulary are loaded on demand by the corresponding properties.
        """
        DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
        self.vocab_id = vocab_id
        self._skos_vocab = None

    def _create_subject_index(self, vocab_source: VocabSource) -> SubjectIndex:
        """Build a fresh subject index from vocab_source, save it as the CSV
        index file in the data directory and return it."""
        subjects = SubjectIndexFile()
        subjects.load_subjects(vocab_source)
        annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return subjects

    def _update_subject_index(self, vocab_source: VocabSource) -> SubjectIndex:
        """Merge the subjects of vocab_source into the existing subject
        index, save the result as the CSV index file and return it.

        The merged index lists every subject of the existing index first, in
        its existing order, followed by the subjects that only occur in
        vocab_source. Raises NotInitializedException (via self.subjects) if
        there is no existing index to update.
        """
        old_subjects = self.subjects
        new_subjects = SubjectIndexFile()
        new_subjects.load_subjects(vocab_source)
        updated_subjects = SubjectIndexFile()

        for old_subject in old_subjects:
            if new_subjects.contains_uri(old_subject.uri):
                # subject still exists: take its data from the new corpus
                new_subject = new_subjects[new_subjects.by_uri(old_subject.uri)]
            else:
                # subject removed from new corpus: keep an entry with only
                # the URI (presumably so that the positions of the remaining
                # subjects in the index do not shift - TODO confirm)
                new_subject = Subject(uri=old_subject.uri, labels=None, notation=None)
            updated_subjects.append(new_subject)

        # subjects that were not in the old index are added at the end
        for new_subject in new_subjects:
            if not old_subjects.contains_uri(new_subject.uri):
                updated_subjects.append(new_subject)

        annif.util.atomic_save(updated_subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return updated_subjects

    @property
    def subjects(self) -> SubjectIndex:
        """The subject index of this vocabulary.

        Loaded from the CSV index file on first access and cached afterwards.
        Raises NotInitializedException if the index file does not exist.
        """
        if self._subjects is None:
            path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
            if not os.path.exists(path):
                raise NotInitializedException(f"subject file {path} not found")
            logger.debug("loading subjects from %s", path)
            self._subjects = SubjectIndexFile.load(path)
        return self._subjects

    @property
    def skos(self) -> VocabFileSKOS:
        """return the subject vocabulary from SKOS file

        The graph dump file is tried first; if it is missing or cannot be
        loaded, the Turtle file is parsed instead. The result is cached on the
        instance. Raises NotInitializedException if the Turtle file is needed
        but does not exist.
        """
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f"loading graph dump from {dumppath}")
            try:
                self._skos_vocab = VocabFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug("could not load graph dump, using turtle file")
            else:
                return self._skos_vocab

        # graph dump file not found or not loadable - parse ttl file instead
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if os.path.exists(path):
            logger.debug(f"loading graph from {path}")
            self._skos_vocab = VocabFileSKOS(path)
            # store the dump file so we can use it next time
            # NOTE(review): the Turtle path (not dumppath) is passed here;
            # presumably save_skos() derives the dump file location from it -
            # confirm against VocabFileSKOS.save_skos before changing this.
            self._skos_vocab.save_skos(path)
            return self._skos_vocab

        raise NotInitializedException(f"graph file {path} not found")

    def __len__(self) -> int:
        """Return the number of entries in the subject index.

        Raises NotInitializedException (via self.subjects) if the subject
        index has not been created yet.
        """
        return len(self.subjects)

    @property
    def languages(self) -> list[str]:
        """The languages reported by the subject index, or an empty list if
        the subject index has not been created yet."""
        try:
            return self.subjects.languages
        except NotInitializedException:
            return []

    def load_vocabulary(
        self,
        vocab_source: VocabSource,
        force: bool = False,
    ) -> None:
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely."""

        index_path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        if not force and os.path.exists(index_path):
            logger.info("updating existing subject index")
            self._subjects = self._update_subject_index(vocab_source)
        else:
            logger.info("creating subject index")
            self._subjects = self._create_subject_index(vocab_source)

        skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        logger.info(f"saving vocabulary into SKOS file {skosfile}")
        vocab_source.save_skos(skosfile)

        # The SKOS file on disk has just been replaced, so a graph cached by
        # an earlier access to self.skos would be stale; drop it so that the
        # next access reloads the vocabulary from disk.
        self._skos_vocab = None

    def as_graph(self) -> Graph:
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph

    def dump(self) -> dict[str, str | list | int | bool | None]:
        """return this vocabulary as a dict

        The dict has the keys "vocab_id", "languages" (sorted list), "size"
        (number of subjects, or None when the subject index is not available)
        and "loaded" (whether the subject index could be read).
        """

        try:
            # sorted() already returns a new list
            languages = sorted(self.languages)
            size = len(self)
            loaded = True
        except NotInitializedException:
            languages = []
            size = None
            loaded = False

        return {
            "vocab_id": self.vocab_id,
            "languages": languages,
            "size": size,
            "loaded": loaded,
        }


1			"""Vocabulary management functionality for Annif"""
2
3			from __future__ import annotations
4
5			import os.path
6			from typing import TYPE_CHECKING
7
8			import annif
9			import annif.corpus
10			import annif.util
11			from annif.datadir import DatadirMixin
12			from annif.exception import NotInitializedException
13
14			from .skos import VocabFileSKOS
15			from .subject_index import SubjectIndexFile
16			from .types import Subject, SubjectIndex, VocabSource
17
18			if TYPE_CHECKING:
19			from rdflib.graph import Graph
20
21
22			logger = annif.logger
23			logger.addFilter(annif.util.DuplicateFilter())
24
25
26			class AnnifVocabulary(DatadirMixin):
27			"""Class representing a subject vocabulary which can be used by multiple
28			Annif projects."""
29
30			# defaults for uninitialized instances
31			_subjects = None
32
33			# constants
34			INDEX_FILENAME_DUMP = "subjects.dump.gz"
35			INDEX_FILENAME_TTL = "subjects.ttl"
36			INDEX_FILENAME_CSV = "subjects.csv"
37
38			def __init__(self, vocab_id: str, datadir: str) -> None:
39			DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
40			self.vocab_id = vocab_id
41			self._skos_vocab = None
42
43			def _create_subject_index(self, vocab_source: VocabSource) -> SubjectIndex:
44			subjects = SubjectIndexFile()
45			subjects.load_subjects(vocab_source)
46			annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV)
47			return subjects
48
49			def _update_subject_index(self, vocab_source: VocabSource) -> SubjectIndex:
50			old_subjects = self.subjects
51			new_subjects = SubjectIndexFile()
52			new_subjects.load_subjects(vocab_source)
53			updated_subjects = SubjectIndexFile()
54
55			for old_subject in old_subjects:
56			if new_subjects.contains_uri(old_subject.uri):
57			new_subject = new_subjects[new_subjects.by_uri(old_subject.uri)]
58			else: # subject removed from new corpus
59			new_subject = Subject(uri=old_subject.uri, labels=None, notation=None)
60			updated_subjects.append(new_subject)
61			for new_subject in new_subjects:
62			if not old_subjects.contains_uri(new_subject.uri):
63			updated_subjects.append(new_subject)
64			annif.util.atomic_save(updated_subjects, self.datadir, self.INDEX_FILENAME_CSV)
65			return updated_subjects
66
67			@property
68			def subjects(self) -> SubjectIndex:
69			if self._subjects is None:
70			path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
71			if os.path.exists(path):
72			logger.debug("loading subjects from %s", path)
73			self._subjects = SubjectIndexFile.load(path)
74			else:
75			raise NotInitializedException("subject file {} not found".format(path))
76			return self._subjects
77
78			@property
79			def skos(self) -> VocabFileSKOS:
80			"""return the subject vocabulary from SKOS file"""
81			if self._skos_vocab is not None:
82			return self._skos_vocab
83
84			# attempt to load graph from dump file
85			dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
86			if os.path.exists(dumppath):
87			logger.debug(f"loading graph dump from {dumppath}")
88			try:
89			self._skos_vocab = VocabFileSKOS(dumppath)
90			except ModuleNotFoundError:
91			# Probably dump has been saved using a different rdflib version
92			logger.debug("could not load graph dump, using turtle file")
93			else:
94			return self._skos_vocab
95
96			# graph dump file not found - parse ttl file instead
97			path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
98			if os.path.exists(path):
99			logger.debug(f"loading graph from {path}")
100			self._skos_vocab = VocabFileSKOS(path)
101			# store the dump file so we can use it next time
102			self._skos_vocab.save_skos(path)
103			return self._skos_vocab
104
105			raise NotInitializedException(f"graph file {path} not found")
106
107			def __len__(self) -> int:
108			return len(self.subjects)
109
110			@property
111			def languages(self) -> list[str]:
112			try:
113			return self.subjects.languages
114			except NotInitializedException:
115			return []
116
117			def load_vocabulary(
118			self,
119			vocab_source: VocabSource,
120			force: bool = False,
121			) -> None:
122			"""Load subjects from a subject corpus and save them into one
123			or more subject index files as well as a SKOS/Turtle file for later
124			use. If force=True, replace the existing subject index completely."""
125
126			if not force and os.path.exists(
127			os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
128			):
129			logger.info("updating existing subject index")
130			self._subjects = self._update_subject_index(vocab_source)
131			else:
132			logger.info("creating subject index")
133			self._subjects = self._create_subject_index(vocab_source)
134
135			skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
136			logger.info(f"saving vocabulary into SKOS file {skosfile}")
137			vocab_source.save_skos(skosfile)
138
139			def as_graph(self) -> Graph:
140			"""return the vocabulary as an rdflib graph"""
141			return self.skos.graph
142
143			def dump(self) -> dict[str, str | list | int | bool]:
144			"""return this vocabulary as a dict"""
145
146			try:
147			languages = list(sorted(self.languages))
148			size = len(self)
149			loaded = True
150			except NotInitializedException:
151			languages = []
152			size = None
153			loaded = False
154
155			return {
156			"vocab_id": self.vocab_id,
157			"languages": languages,
158			"size": size,
159			"loaded": loaded,
160			}
161

NatLibFi / Annif

annif.vocab.vocab A last analyzed 2025-08-06 10:43 UTC

Complexity

Size/Duplication

Importance

10 Methods

Duplication Side-by-Side

Filter issues like

annif.vocab.vocab A
last analyzed 2025-08-06 10:43 UTC