1
|
|
|
"""Vocabulary management functionality for Annif""" |
2
|
|
|
|
3
|
|
|
import os.path |
4
|
|
|
import re |
5
|
|
|
import annif |
6
|
|
|
import annif.corpus |
7
|
|
|
import annif.util |
8
|
|
|
from annif.datadir import DatadirMixin |
9
|
|
|
from annif.exception import NotInitializedException |
10
|
|
|
from annif.util import parse_args |
11
|
|
|
|
12
|
|
|
# module-level logger shared by this module, provided by the annif package
logger = annif.logger
13
|
|
|
|
14
|
|
|
|
15
|
|
|
def get_vocab(vocab_spec, datadir, default_language):
    """Create an AnnifVocabulary from a vocabulary specification string.

    The specification is either a plain vocabulary id (e.g. "yso") or an id
    followed by parenthesized arguments (e.g. "yso(en)"), where the first
    positional argument selects the language; otherwise *default_language*
    is used.

    :param vocab_spec: vocabulary specification string
    :param datadir: path to the data directory where vocabularies are stored
    :param default_language: language to use when the spec does not give one
    :returns: an AnnifVocabulary instance
    :raises ValueError: if vocab_spec cannot be parsed
    """
    match = re.match(r'(\w+)(\((.*)\))?', vocab_spec)
    if match is None:
        raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")
    vocab_id = match.group(1)
    # parse_args also returns keyword arguments, but none are supported
    # here; the underscore prefix marks them as intentionally unused
    posargs, _kwargs = parse_args(match.group(3))
    language = posargs[0] if posargs else default_language

    return AnnifVocabulary(vocab_id, datadir, language)
24
|
|
|
|
25
|
|
|
|
26
|
|
View Code Duplication |
class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects."""

    # defaults for uninitialized instances
    _subjects = None  # SubjectIndex, lazily loaded by the `subjects` property

    # constants: filenames used under this vocabulary's data directory
    INDEX_FILENAME_DUMP = "subjects.dump.gz"  # serialized graph dump (see `skos`)
    INDEX_FILENAME_TTL = "subjects.ttl"       # SKOS vocabulary in Turtle format
    INDEX_FILENAME_CSV = "subjects.csv"       # subject index in CSV format

    def __init__(self, vocab_id: str, datadir: str, language: str):
        """Initialize the vocabulary.

        :param vocab_id: identifier of the vocabulary; also names its
            subdirectory under <datadir>/vocabs/
        :param datadir: base data directory (passed to DatadirMixin)
        :param language: language of subject labels to use
        """
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self.vocab_id = vocab_id
        self.language = language
        self._skos_vocab = None  # cached SKOS graph, lazily loaded by `skos`

    def _create_subject_index(self, subject_corpus):
        """Build a fresh SubjectIndex from the corpus, persist it as CSV in
        the data directory and return it."""
        subjects = annif.corpus.SubjectIndex()
        subjects.load_subjects(subject_corpus)
        # atomic_save writes via a temp file so readers never see a
        # partially written index
        annif.util.atomic_save(subjects, self.datadir,
                               self.INDEX_FILENAME_CSV)
        return subjects

    def _update_subject_index(self, subject_corpus):
        """Merge the subjects from the corpus into the existing index,
        persist the merged index as CSV and return it.

        All previously known subjects keep their position in the index:
        subjects still present in the corpus get their updated data, while
        subjects missing from the corpus are retained as deprecated entries
        with labels=None and notation=None. Subjects that are new in the
        corpus are appended at the end.
        """
        old_subjects = self.subjects
        new_subjects = annif.corpus.SubjectIndex()
        new_subjects.load_subjects(subject_corpus)
        updated_subjects = annif.corpus.SubjectIndex()

        for old_subject in old_subjects:
            if new_subjects.contains_uri(old_subject.uri):
                new_subject = new_subjects[new_subjects.by_uri(
                    old_subject.uri)]
            else:  # subject removed from new corpus
                new_subject = annif.corpus.Subject(uri=old_subject.uri,
                                                   labels=None,
                                                   notation=None)
            updated_subjects.append(new_subject)
        for new_subject in new_subjects:
            if not old_subjects.contains_uri(new_subject.uri):
                updated_subjects.append(new_subject)
        annif.util.atomic_save(updated_subjects, self.datadir,
                               self.INDEX_FILENAME_CSV)
        return updated_subjects

    @property
    def subjects(self):
        """Return the SubjectIndex for this vocabulary, loading it from the
        CSV file on first access.

        :raises NotInitializedException: if the CSV file does not exist
            (i.e. the vocabulary has not been loaded yet)
        """
        if self._subjects is None:
            path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
            if os.path.exists(path):
                logger.debug('loading subjects from %s', path)
                self._subjects = annif.corpus.SubjectIndex.load(path)
            else:
                raise NotInitializedException(
                    "subject file {} not found".format(path))
        return self._subjects

    @property
    def skos(self):
        """return the subject vocabulary from SKOS file"""
        if self._skos_vocab is not None:
            return self._skos_vocab  # already loaded and cached

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f'loading graph dump from {dumppath}')
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug('could not load graph dump, using turtle file')
            else:
                return self._skos_vocab

        # graph dump file not found - parse ttl file instead
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if os.path.exists(path):
            logger.debug(f'loading graph from {path}')
            self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
            # store the dump file so we can use it next time
            # NOTE(review): save_skos is given the ttl path here — presumably
            # it derives the dump filename itself; confirm in SubjectFileSKOS
            self._skos_vocab.save_skos(path)
            return self._skos_vocab

        raise NotInitializedException(f'graph file {path} not found')

    def load_vocabulary(self, subject_corpus, force: bool = False):
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely."""

        if not force and os.path.exists(
                os.path.join(self.datadir, self.INDEX_FILENAME_CSV)):
            # an index already exists: merge instead of replacing, so
            # existing subject positions are preserved
            logger.info('updating existing vocabulary')
            self._subjects = self._update_subject_index(subject_corpus)
        else:
            self._subjects = self._create_subject_index(subject_corpus)

        # the Turtle file is always rewritten from the new corpus
        subject_corpus.save_skos(
            os.path.join(self.datadir, self.INDEX_FILENAME_TTL))

    def as_graph(self):
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph
132
|
|
|
|