annif.vocab.get_vocab() - Code Metrics - Inspection of "Make vocabularies multilingual" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#600)

by Osma

created 2022-08-05 14:48 UTC

annif.vocab.get_vocab() A

↳ Parent: annif.vocab

Complexity

Conditions

Size

Total Lines	9
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	8
nop	3
dl	0
loc	9
rs	10
c	0
b	0
f	0

"""Vocabulary management functionality for Annif"""

import os.path
import re
import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException
from annif.util import parse_args

logger = annif.logger


def get_vocab(vocab_spec, datadir, default_language):
    """Create an AnnifVocabulary from a vocabulary specification string.

    The specification consists of a vocabulary ID made of word characters,
    optionally followed by an argument string in parentheses, e.g. ``yso``
    or ``yso(fi)``. The argument string is handed to parse_args(); the first
    positional argument, if there is one, is used as the language of the
    vocabulary, otherwise default_language is used.

    Raises ValueError if vocab_spec as a whole does not fit this pattern."""

    # fullmatch() rather than match(): the entire specification has to be
    # well-formed, so trailing garbage or an unbalanced parenthesis is
    # reported instead of being silently dropped
    match = re.fullmatch(r'(\w+)(\((.*)\))?', vocab_spec)
    if match is None:
        raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")
    vocab_id = match.group(1)
    # group(3) is None when no parenthesized part was given; keyword
    # arguments are parsed but not used here
    posargs, _kwargs = parse_args(match.group(3))
    language = posargs[0] if posargs else default_language

    return AnnifVocabulary(vocab_id, datadir, language)


class AnnifVocabulary(DatadirMixin):

    """A subject vocabulary kept in its own data directory, which several
    Annif projects can share."""

    # class-level default: no subject index has been loaded yet
    _subjects = None

    def __init__(self, vocab_id, datadir, language):
        """Set up the vocabulary identified by vocab_id under datadir, using
        the given language for the subject index. Nothing is loaded yet."""
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self.vocab_id = vocab_id
        self.language = language
        self._skos_vocab = None

    @staticmethod
    def _index_filename(language):
        """Return the name of the subject index file for a language."""
        return "subjects.{}.tsv".format(language)

    def _create_subject_index(self, subject_corpus, language):
        """Build a new subject index from the corpus for one language, save
        it in the data directory and return it."""
        index = annif.corpus.SubjectIndex()
        index.load_subjects(subject_corpus, language)
        filename = self._index_filename(language)
        annif.util.atomic_save(index, self.datadir, filename)
        return index

    def _update_subject_index(self, subject_corpus, language):
        """Merge the corpus into the already stored subject index, save the
        result as the index file of the given language and return it.

        Subjects already present keep their position; their label and
        notation are refreshed from the corpus, or set to None when the
        corpus no longer contains them. Subjects new in the corpus are
        appended at the end."""
        # NOTE(review): the previous index comes from the `subjects`
        # property, i.e. the file for self.language rather than `language`
        # - confirm this is intended
        previous = self.subjects
        incoming = annif.corpus.SubjectIndex()
        incoming.load_subjects(subject_corpus, language)
        merged = annif.corpus.SubjectIndex()

        for uri, _old_label, _old_notation in previous:
            if incoming.contains_uri(uri):
                entry = incoming[incoming.by_uri(uri)]
                label, notation = entry[1:3]
            else:
                # subject is gone from the new corpus: keep its slot, blank
                label = notation = None
            merged.append(uri, label, notation)

        for uri, label, notation in incoming:
            if previous.contains_uri(uri):
                continue  # already handled in the first pass
            merged.append(uri, label, notation)

        filename = self._index_filename(language)
        annif.util.atomic_save(merged, self.datadir, filename)
        return merged

    @property
    def subjects(self):
        """The subject index for self.language, loaded from the data
        directory on first access and cached afterwards. Raises
        NotInitializedException if the index file does not exist."""
        if self._subjects is not None:
            return self._subjects

        filename = self._index_filename(self.language)
        path = os.path.join(self.datadir, filename)
        if not os.path.exists(path):
            raise NotInitializedException(
                "subject file {} not found".format(path))

        logger.debug('loading subjects from %s', path)
        self._subjects = annif.corpus.SubjectIndex.load(path)
        return self._subjects

    @property
    def skos(self):
        """The subject vocabulary as a SKOS subject file object, loaded on
        first access and cached afterwards. The graph dump is tried first,
        then the Turtle file. Raises NotInitializedException if neither can
        be used."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # first choice: the graph dump file
        dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
        if os.path.exists(dumppath):
            logger.debug(f'loading graph dump from {dumppath}')
            try:
                loaded = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # the dump was probably written with another rdflib version
                logger.debug('could not load graph dump, using turtle file')
            else:
                self._skos_vocab = loaded
                return loaded

        # no usable dump: fall back to parsing the Turtle file
        path = os.path.join(self.datadir, 'subjects.ttl')
        if not os.path.exists(path):
            raise NotInitializedException(f'graph file {path} not found')

        logger.debug(f'loading graph from {path}')
        self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
        # per the original note this stores the dump file for next time;
        # save_skos() is defined elsewhere - presumably it writes the dump
        # alongside the Turtle path it is given
        self._skos_vocab.save_skos(path, self.language)
        return self._skos_vocab

    def load_vocabulary(self, subject_corpus, default_language, force=False):
        """Store the subjects of a subject corpus as one subject index file
        per language, plus a SKOS/Turtle file for later use. An existing
        index file is updated in place unless force=True, in which case it
        is replaced completely."""

        # a corpus that reports no languages (not language-aware, or unable
        # to detect them) is loaded in the default language only
        languages = subject_corpus.languages or [default_language]

        for language in languages:
            if force or not os.path.exists(
                    os.path.join(self.datadir,
                                 self._index_filename(language))):
                subjects = self._create_subject_index(subject_corpus, language)
            else:
                logger.info('updating existing vocabulary')
                subjects = self._update_subject_index(subject_corpus, language)

            # only the default language index is cached on the instance
            if language == default_language:
                self._subjects = subjects

        ttl_path = os.path.join(self.datadir, 'subjects.ttl')
        subject_corpus.save_skos(ttl_path, default_language)

    def as_skos_file(self):
        """Open the SKOS/Turtle file of the vocabulary for binary reading
        and return the file object; closing it is up to the caller."""
        ttl_path = os.path.join(self.datadir, 'subjects.ttl')
        return open(ttl_path, 'rb')

    def as_graph(self):
        """Return the graph of the SKOS vocabulary (an rdflib graph)."""
        vocab = self.skos
        return vocab.graph


1		"""Vocabulary management functionality for Annif"""
2
3		import os.path
4		import re
5		import annif
6		import annif.corpus
7		import annif.util
8		from annif.datadir import DatadirMixin
9		from annif.exception import NotInitializedException
10		from annif.util import parse_args
11
12		logger = annif.logger
13
14
15		def get_vocab(vocab_spec, datadir, default_language):
16		match = re.match(r'(\w+)(\((.*)\))?', vocab_spec)
17		if match is None:
18		raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")
19		vocab_id = match.group(1)
20		posargs, kwargs = parse_args(match.group(3))
21		language = posargs[0] if posargs else default_language
22
23		return AnnifVocabulary(vocab_id, datadir, language)
24
25
26	View Code Duplication	class AnnifVocabulary(DatadirMixin):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
27		"""Class representing a subject vocabulary which can be used by multiple
28		Annif projects."""
29
30		# defaults for uninitialized instances
31		_subjects = None
32
33		def __init__(self, vocab_id, datadir, language):
34		DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
35		self.vocab_id = vocab_id
36		self.language = language
37		self._skos_vocab = None
38
39		@staticmethod
40		def _index_filename(language):
41		return f"subjects.{language}.tsv"
42
43		def _create_subject_index(self, subject_corpus, language):
44		subjects = annif.corpus.SubjectIndex()
45		subjects.load_subjects(subject_corpus, language)
46		annif.util.atomic_save(subjects, self.datadir,
47		self._index_filename(language))
48		return subjects
49
50		def _update_subject_index(self, subject_corpus, language):
51		old_subjects = self.subjects
52		new_subjects = annif.corpus.SubjectIndex()
53		new_subjects.load_subjects(subject_corpus, language)
54		updated_subjects = annif.corpus.SubjectIndex()
55
56		for uri, label, notation in old_subjects:
57		if new_subjects.contains_uri(uri):
58		label, notation = new_subjects[new_subjects.by_uri(uri)][1:3]
59		else: # subject removed from new corpus
60		label, notation = None, None
61		updated_subjects.append(uri, label, notation)
62		for uri, label, notation in new_subjects:
63		if not old_subjects.contains_uri(uri):
64		updated_subjects.append(uri, label, notation)
65		annif.util.atomic_save(updated_subjects, self.datadir,
66		self._index_filename(language))
67		return updated_subjects
68
69		@property
70		def subjects(self):
71		if self._subjects is None:
72		path = os.path.join(self.datadir,
73		self._index_filename(self.language))
74		if os.path.exists(path):
75		logger.debug('loading subjects from %s', path)
76		self._subjects = annif.corpus.SubjectIndex.load(path)
77		else:
78		raise NotInitializedException(
79		"subject file {} not found".format(path))
80		return self._subjects
81
82		@property
83		def skos(self):
84		"""return the subject vocabulary from SKOS file"""
85		if self._skos_vocab is not None:
86		return self._skos_vocab
87
88		# attempt to load graph from dump file
89		dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
90		if os.path.exists(dumppath):
91		logger.debug(f'loading graph dump from {dumppath}')
92		try:
93		self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
94		except ModuleNotFoundError:
95		# Probably dump has been saved using a different rdflib version
96		logger.debug('could not load graph dump, using turtle file')
97		else:
98		return self._skos_vocab
99
100		# graph dump file not found - parse ttl file instead
101		path = os.path.join(self.datadir, 'subjects.ttl')
102		if os.path.exists(path):
103		logger.debug(f'loading graph from {path}')
104		self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
105		# store the dump file so we can use it next time
106		self._skos_vocab.save_skos(path, self.language)
107		return self._skos_vocab
108
109		raise NotInitializedException(f'graph file {path} not found')
110
111		def load_vocabulary(self, subject_corpus, default_language, force=False):
112		"""Load subjects from a subject corpus and save them into one
113		or more subject index files as well as a SKOS/Turtle file for later
114		use. If force=True, replace the existing subject index completely."""
115
116		languages = subject_corpus.languages
117		if not languages:
118		# subject corpus isn't language-aware or can't detect languages
119		# default to language from project config instead
120		languages = [default_language]
121
122		for language in languages:
123		if not force and os.path.exists(
124		os.path.join(self.datadir,
125		self._index_filename(language))):
126		logger.info('updating existing vocabulary')
127		subjects = self._update_subject_index(subject_corpus, language)
128		else:
129		subjects = self._create_subject_index(subject_corpus, language)
130
131		if language == default_language:
132		self._subjects = subjects
133
134		subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'),
135		default_language)
136
137		def as_skos_file(self):
138		"""return the vocabulary as a file object, in SKOS/Turtle syntax"""
139		return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb')
140
141		def as_graph(self):
142		"""return the vocabulary as an rdflib graph"""
143		return self.skos.graph
144

NatLibFi / Annif

Pull Request — master (#600)

annif.vocab.get_vocab() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like