annif.vocab.AnnifVocabulary.subjects() - Code Metrics - Inspection of "Make vocabularies multilingual" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#600)

by Osma

created 2022-08-04 07:41 UTC

annif.vocab.AnnifVocabulary.subjects() A

↳ Parent: annif.vocab

Complexity

Conditions

Size

Total Lines	12
Code Lines	11

Duplication

Lines	12
Ratio	100 %

Importance

Changes

Metric	Value
cc	3
eloc	11
nop	1
dl	12
loc	12
rs	9.85
c	0
b	0
f	0

"""Vocabulary management functionality for Annif"""

import os.path
import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException

# Module-level alias for the logger object exposed by the annif package.
logger = annif.logger


class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    All files are read from and written to self.datadir, which is provided
    by DatadirMixin (initialized with the base data directory, 'vocabs' and
    the vocabulary id). The files used by this class are:

    * subjects.<language>.tsv -- one subject index per language
    * subjects.ttl            -- the vocabulary in SKOS/Turtle syntax
    * subjects.dump.gz        -- graph dump, tried before the Turtle file
    """

    # defaults for uninitialized instances
    _subjects = None

    def __init__(self, vocab_id, datadir, language):
        """Store the vocabulary settings. The subject index and the SKOS
        data are not loaded here; the `subjects` and `skos` properties load
        them on first access.

        vocab_id -- vocabulary identifier, also passed on to DatadirMixin
        datadir  -- base data directory, passed on to DatadirMixin
        language -- language of the subject index that `subjects` loads
        """
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self.vocab_id = vocab_id
        self.language = language
        self._skos_vocab = None

    @staticmethod
    def _index_filename(language):
        """return the file name of the subject index for the given
        language"""
        return f"subjects.{language}.tsv"

    def _create_subject_index(self, subject_corpus, language):
        """Build a new subject index from the subject corpus in the given
        language, save it into the data directory and return it."""
        subjects = annif.corpus.SubjectIndex()
        subjects.load_subjects(subject_corpus, language)
        annif.util.atomic_save(subjects, self.datadir,
                               self._index_filename(language))
        return subjects

    def _update_subject_index(self, subject_corpus, language):
        """Merge the subject corpus into the existing subject index, save
        the result under the index file name for the given language and
        return it.

        Subjects of the existing index are written first, in their existing
        order: their label and notation are taken from the new corpus, or
        set to None if the corpus no longer contains the URI. Subjects that
        only exist in the new corpus are appended after them."""
        # NOTE(review): the existing index is read through self.subjects,
        # i.e. the index of self.language, which is not necessarily the
        # `language` being updated - confirm this is intended.
        old_subjects = self.subjects
        new_subjects = annif.corpus.SubjectIndex()
        new_subjects.load_subjects(subject_corpus, language)
        updated_subjects = annif.corpus.SubjectIndex()

        for uri, label, notation in old_subjects:
            if new_subjects.contains_uri(uri):
                label, notation = new_subjects[new_subjects.by_uri(uri)][1:3]
            else:  # subject removed from new corpus
                label, notation = None, None
            updated_subjects.append(uri, label, notation)
        for uri, label, notation in new_subjects:
            if not old_subjects.contains_uri(uri):
                updated_subjects.append(uri, label, notation)
        annif.util.atomic_save(updated_subjects, self.datadir,
                               self._index_filename(language))
        return updated_subjects

    @property
    def subjects(self):
        """return the subject index for self.language, loading it from the
        data directory on first access and caching it afterwards. Raises
        NotInitializedException if the index file does not exist."""
        if self._subjects is None:
            path = os.path.join(self.datadir,
                                self._index_filename(self.language))
            if os.path.exists(path):
                logger.debug('loading subjects from %s', path)
                self._subjects = annif.corpus.SubjectIndex.load(path)
            else:
                raise NotInitializedException(
                    f"subject file {path} not found")
        return self._subjects

    @property
    def skos(self):
        """return the subject vocabulary from SKOS file

        The result is cached. The graph dump file is tried first; if it is
        missing or cannot be loaded, the Turtle file is parsed instead.
        Raises NotInitializedException if no Turtle file exists either."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
        if os.path.exists(dumppath):
            logger.debug('loading graph dump from %s', dumppath)
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug('could not load graph dump, using turtle file')
            else:
                return self._skos_vocab

        # graph dump file not found or not loadable - parse ttl file instead
        path = os.path.join(self.datadir, 'subjects.ttl')
        if os.path.exists(path):
            logger.debug('loading graph from %s', path)
            self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
            # store the dump file so we can use it next time
            # NOTE(review): save_skos is given the Turtle path; presumably
            # it writes the dump file alongside it - confirm in annif.corpus
            self._skos_vocab.save_skos(path, self.language)
            return self._skos_vocab

        raise NotInitializedException(f'graph file {path} not found')

    def load_vocabulary(self, subject_corpus, project_language, force=False):
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely.

        subject_corpus   -- corpus object providing `languages`, the data
                            for SubjectIndex.load_subjects() and save_skos()
        project_language -- language used when the corpus is not
                            language-aware; the index built for this
                            language becomes the cached `subjects` value
        force            -- if True, always create the index files from
                            scratch instead of updating existing ones
        """

        languages = subject_corpus.languages
        if languages is None:
            # subject corpus isn't language-aware, default to project language
            languages = [project_language]

        for language in languages:
            if not force and os.path.exists(
                    os.path.join(self.datadir,
                                 self._index_filename(language))):
                logger.info('updating existing vocabulary')
                subjects = self._update_subject_index(subject_corpus, language)
            else:
                subjects = self._create_subject_index(subject_corpus, language)

            if language == project_language:
                self._subjects = subjects

        subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'),
                                 project_language)
        # The Turtle file has just been rewritten, so drop any SKOS object
        # cached from the previous files; the `skos` property reloads it on
        # the next access instead of returning the outdated graph.
        self._skos_vocab = None

    def as_skos_file(self):
        """return the vocabulary as a file object, in SKOS/Turtle syntax

        The file is opened in binary mode and returned still open; closing
        it is left to the caller."""
        return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb')

    def as_graph(self):
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph


1		"""Vocabulary management functionality for Annif"""
2
3		import os.path
4		import annif
5		import annif.corpus
6		import annif.util
7		from annif.datadir import DatadirMixin
8		from annif.exception import NotInitializedException
9
10		logger = annif.logger
11
12
13	View Code Duplication	class AnnifVocabulary(DatadirMixin):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
14		"""Class representing a subject vocabulary which can be used by multiple
15		Annif projects."""
16
17		# defaults for uninitialized instances
18		_subjects = None
19
20		def __init__(self, vocab_id, datadir, language):
21		DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
22		self.vocab_id = vocab_id
23		self.language = language
24		self._skos_vocab = None
25
26		@staticmethod
27		def _index_filename(language):
28		return f"subjects.{language}.tsv"
29
30		def _create_subject_index(self, subject_corpus, language):
31		subjects = annif.corpus.SubjectIndex()
32		subjects.load_subjects(subject_corpus, language)
33		annif.util.atomic_save(subjects, self.datadir,
34		self._index_filename(language))
35		return subjects
36
37		def _update_subject_index(self, subject_corpus, language):
38		old_subjects = self.subjects
39		new_subjects = annif.corpus.SubjectIndex()
40		new_subjects.load_subjects(subject_corpus, language)
41		updated_subjects = annif.corpus.SubjectIndex()
42
43		for uri, label, notation in old_subjects:
44		if new_subjects.contains_uri(uri):
45		label, notation = new_subjects[new_subjects.by_uri(uri)][1:3]
46		else: # subject removed from new corpus
47		label, notation = None, None
48		updated_subjects.append(uri, label, notation)
49		for uri, label, notation in new_subjects:
50		if not old_subjects.contains_uri(uri):
51		updated_subjects.append(uri, label, notation)
52		annif.util.atomic_save(updated_subjects, self.datadir,
53		self._index_filename(language))
54		return updated_subjects
55
56		@property
57		def subjects(self):
58		if self._subjects is None:
59		path = os.path.join(self.datadir,
60		self._index_filename(self.language))
61		if os.path.exists(path):
62		logger.debug('loading subjects from %s', path)
63		self._subjects = annif.corpus.SubjectIndex.load(path)
64		else:
65		raise NotInitializedException(
66		"subject file {} not found".format(path))
67		return self._subjects
68
69		@property
70		def skos(self):
71		"""return the subject vocabulary from SKOS file"""
72		if self._skos_vocab is not None:
73		return self._skos_vocab
74
75		# attempt to load graph from dump file
76		dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
77		if os.path.exists(dumppath):
78		logger.debug(f'loading graph dump from {dumppath}')
79		try:
80		self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
81		except ModuleNotFoundError:
82		# Probably dump has been saved using a different rdflib version
83		logger.debug('could not load graph dump, using turtle file')
84		else:
85		return self._skos_vocab
86
87		# graph dump file not found - parse ttl file instead
88		path = os.path.join(self.datadir, 'subjects.ttl')
89		if os.path.exists(path):
90		logger.debug(f'loading graph from {path}')
91		self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
92		# store the dump file so we can use it next time
93		self._skos_vocab.save_skos(path, self.language)
94		return self._skos_vocab
95
96		raise NotInitializedException(f'graph file {path} not found')
97
98		def load_vocabulary(self, subject_corpus, project_language, force=False):
99		"""Load subjects from a subject corpus and save them into one
100		or more subject index files as well as a SKOS/Turtle file for later
101		use. If force=True, replace the existing subject index completely."""
102
103		languages = subject_corpus.languages
104		if languages is None:
105		# subject corpus isn't language-aware, default to project language
106		languages = [project_language]
107
108		for language in languages:
109		if not force and os.path.exists(
110		os.path.join(self.datadir,
111		self._index_filename(language))):
112		logger.info('updating existing vocabulary')
113		subjects = self._update_subject_index(subject_corpus, language)
114		else:
115		subjects = self._create_subject_index(subject_corpus, language)
116
117		if language == project_language:
118		self._subjects = subjects
119
120		subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'),
121		project_language)
122
123		def as_skos_file(self):
124		"""return the vocabulary as a file object, in SKOS/Turtle syntax"""
125		return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb')
126
127		def as_graph(self):
128		"""return the vocabulary as an rdflib graph"""
129		return self.skos.graph
130

NatLibFi / Annif

Pull Request — master (#600)

annif.vocab.AnnifVocabulary.subjects() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like