annif.vocab.AnnifVocabulary.languages() - Code Metrics - Inspection of "Use black code style" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#640)

by Juho

created 2022-11-07 08:13 UTC

annif.vocab.AnnifVocabulary.languages() A

↳ Parent: annif.vocab

Complexity

Conditions

Size

Total Lines	3
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	3
nop	1
dl	0
loc	3
rs	10
c	0
b	0
f	0

"""Vocabulary management functionality for Annif"""

import os.path
import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException

# reuse the logger object exposed by the top-level annif package
logger = annif.logger


class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    The vocabulary is backed by files inside ``self.datadir``: a subject
    index (``INDEX_FILENAME_CSV``), a SKOS/Turtle file (``INDEX_FILENAME_TTL``)
    and a graph dump (``INDEX_FILENAME_DUMP``). Both the subject index and the
    SKOS vocabulary are loaded lazily on first access and then cached on the
    instance.
    """

    # defaults for uninitialized instances
    _subjects = None  # cached subject index, filled in by the `subjects` property
    _skos_vocab = None  # cached SKOS vocabulary, filled in by the `skos` property

    # constants
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    def __init__(self, vocab_id, datadir):
        """Set up the vocabulary identified by vocab_id under datadir.

        No files are read here; loading happens lazily in the `subjects` and
        `skos` properties."""
        # presumably this is what provides self.datadir - confirm in DatadirMixin
        DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
        self.vocab_id = vocab_id
        self._skos_vocab = None

    def _create_subject_index(self, subject_corpus):
        """Build a new subject index from subject_corpus, save it as the CSV
        index file in the data directory and return it."""
        subjects = annif.corpus.SubjectIndex()
        subjects.load_subjects(subject_corpus)
        annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return subjects

    def _update_subject_index(self, subject_corpus):
        """Merge subject_corpus into the existing subject index, save the
        result as the CSV index file and return it.

        Every subject of the existing index is kept, in its original order.
        A subject whose URI is missing from the new corpus is replaced by a
        placeholder with the same URI but no labels or notation (presumably
        so that the positions of existing subjects stay stable - confirm).
        Subjects that only exist in the new corpus are appended at the end.

        Raises NotInitializedException (via the `subjects` property) if the
        existing subject index file cannot be found."""
        old_subjects = self.subjects
        new_subjects = annif.corpus.SubjectIndex()
        new_subjects.load_subjects(subject_corpus)
        updated_subjects = annif.corpus.SubjectIndex()

        for old_subject in old_subjects:
            if new_subjects.contains_uri(old_subject.uri):
                # take the up-to-date version of a subject we already know
                new_subject = new_subjects[new_subjects.by_uri(old_subject.uri)]
            else:  # subject removed from new corpus
                new_subject = annif.corpus.Subject(
                    uri=old_subject.uri, labels=None, notation=None
                )
            updated_subjects.append(new_subject)
        for new_subject in new_subjects:
            if not old_subjects.contains_uri(new_subject.uri):
                updated_subjects.append(new_subject)
        annif.util.atomic_save(updated_subjects, self.datadir, self.INDEX_FILENAME_CSV)
        return updated_subjects

    @property
    def subjects(self):
        """return the subject index, loading it from the CSV index file on
        first access. Raises NotInitializedException if the file is missing."""
        if self._subjects is None:
            path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
            if os.path.exists(path):
                logger.debug("loading subjects from %s", path)
                self._subjects = annif.corpus.SubjectIndex.load(path)
            else:
                raise NotInitializedException("subject file {} not found".format(path))
        return self._subjects

    @property
    def skos(self):
        """return the subject vocabulary from SKOS file

        The graph dump file is tried first; if it is missing or cannot be
        loaded, the Turtle file is parsed instead. The result is cached on
        the instance. Raises NotInitializedException if no usable file is
        found."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f"loading graph dump from {dumppath}")
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug("could not load graph dump, using turtle file")
            else:
                return self._skos_vocab

        # graph dump file missing or not loadable - parse ttl file instead
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if os.path.exists(path):
            logger.debug(f"loading graph from {path}")
            self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
            # store the dump file so we can use it next time
            # NOTE(review): save_skos() is given the Turtle path here;
            # presumably it also writes the dump next to it - confirm
            self._skos_vocab.save_skos(path)
            return self._skos_vocab

        raise NotInitializedException(f"graph file {path} not found")

    def __len__(self):
        """return the number of subjects in the subject index"""
        return len(self.subjects)

    @property
    def languages(self):
        """return the `languages` attribute of the subject index"""
        return self.subjects.languages

    def load_vocabulary(self, subject_corpus, force=False):
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely.

        Without force, an already existing subject index is updated instead
        of being replaced. Both caches of this instance are refreshed, so
        `subjects`, `skos` and `as_graph()` reflect the newly loaded
        vocabulary afterwards."""

        if not force and os.path.exists(
            os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        ):
            logger.info("updating existing subject index")
            self._subjects = self._update_subject_index(subject_corpus)
        else:
            logger.info("creating subject index")
            self._subjects = self._create_subject_index(subject_corpus)

        skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        logger.info(f"saving vocabulary into SKOS file {skosfile}")
        subject_corpus.save_skos(skosfile)
        # the SKOS file has just been rewritten, so drop any vocabulary cached
        # from the previous file; it is reloaded on the next `skos` access
        self._skos_vocab = None

    def as_graph(self):
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph


1			"""Vocabulary management functionality for Annif"""
2
3			import os.path
4			import annif
5			import annif.corpus
6			import annif.util
7			from annif.datadir import DatadirMixin
8			from annif.exception import NotInitializedException
9
10			logger = annif.logger
11
12
13			class AnnifVocabulary(DatadirMixin):
14			"""Class representing a subject vocabulary which can be used by multiple
15			Annif projects."""
16
17			# defaults for uninitialized instances
18			_subjects = None
19
20			# constants
21			INDEX_FILENAME_DUMP = "subjects.dump.gz"
22			INDEX_FILENAME_TTL = "subjects.ttl"
23			INDEX_FILENAME_CSV = "subjects.csv"
24
25			def __init__(self, vocab_id, datadir):
26			DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
27			self.vocab_id = vocab_id
28			self._skos_vocab = None
29
30			def _create_subject_index(self, subject_corpus):
31			subjects = annif.corpus.SubjectIndex()
32			subjects.load_subjects(subject_corpus)
33			annif.util.atomic_save(subjects, self.datadir, self.INDEX_FILENAME_CSV)
34			return subjects
35
36			def _update_subject_index(self, subject_corpus):
37			old_subjects = self.subjects
38			new_subjects = annif.corpus.SubjectIndex()
39			new_subjects.load_subjects(subject_corpus)
40			updated_subjects = annif.corpus.SubjectIndex()
41
42			for old_subject in old_subjects:
43			if new_subjects.contains_uri(old_subject.uri):
44			new_subject = new_subjects[new_subjects.by_uri(old_subject.uri)]
45			else: # subject removed from new corpus
46			new_subject = annif.corpus.Subject(
47			uri=old_subject.uri, labels=None, notation=None
48			)
49			updated_subjects.append(new_subject)
50			for new_subject in new_subjects:
51			if not old_subjects.contains_uri(new_subject.uri):
52			updated_subjects.append(new_subject)
53			annif.util.atomic_save(updated_subjects, self.datadir, self.INDEX_FILENAME_CSV)
54			return updated_subjects
55
56			@property
57			def subjects(self):
58			if self._subjects is None:
59			path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
60			if os.path.exists(path):
61			logger.debug("loading subjects from %s", path)
62			self._subjects = annif.corpus.SubjectIndex.load(path)
63			else:
64			raise NotInitializedException("subject file {} not found".format(path))
65			return self._subjects
66
67			@property
68			def skos(self):
69			"""return the subject vocabulary from SKOS file"""
70			if self._skos_vocab is not None:
71			return self._skos_vocab
72
73			# attempt to load graph from dump file
74			dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
75			if os.path.exists(dumppath):
76			logger.debug(f"loading graph dump from {dumppath}")
77			try:
78			self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
79			except ModuleNotFoundError:
80			# Probably dump has been saved using a different rdflib version
81			logger.debug("could not load graph dump, using turtle file")
82			else:
83			return self._skos_vocab
84
85			# graph dump file not found - parse ttl file instead
86			path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
87			if os.path.exists(path):
88			logger.debug(f"loading graph from {path}")
89			self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
90			# store the dump file so we can use it next time
91			self._skos_vocab.save_skos(path)
92			return self._skos_vocab
93
94			raise NotInitializedException(f"graph file {path} not found")
95
96			def __len__(self):
97			return len(self.subjects)
98
99			@property
100			def languages(self):
101			return self.subjects.languages
102
103			def load_vocabulary(self, subject_corpus, force=False):
104			"""Load subjects from a subject corpus and save them into one
105			or more subject index files as well as a SKOS/Turtle file for later
106			use. If force=True, replace the existing subject index completely."""
107
108			if not force and os.path.exists(
109			os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
110			):
111			logger.info("updating existing subject index")
112			self._subjects = self._update_subject_index(subject_corpus)
113			else:
114			logger.info("creating subject index")
115			self._subjects = self._create_subject_index(subject_corpus)
116
117			skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
118			logger.info(f"saving vocabulary into SKOS file {skosfile}")
119			subject_corpus.save_skos(skosfile)
120
121			def as_graph(self):
122			"""return the vocabulary as an rdflib graph"""
123			return self.skos.graph
124

NatLibFi / Annif

Pull Request — master (#640)

annif.vocab.AnnifVocabulary.languages() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like