Passed
Pull Request — master (#614)
by Osma
02:59
created

annif.vocab   A

Complexity

Total Complexity 22

Size/Duplication

Total Lines 125
Duplicated Lines 89.6 %

Importance

Changes 0
Metric Value
eloc 86
dl 112
loc 125
rs 10
c 0
b 0
f 0
wmc 22

9 Methods

Rating   Name   Duplication   Size   Complexity  
A AnnifVocabulary.__len__() 2 2 1
A AnnifVocabulary.as_graph() 3 3 1
B AnnifVocabulary.skos() 28 28 6
A AnnifVocabulary.__init__() 4 4 1
A AnnifVocabulary._create_subject_index() 6 6 1
A AnnifVocabulary.languages() 3 3 1
A AnnifVocabulary.subjects() 11 11 3
B AnnifVocabulary._update_subject_index() 21 21 5
A AnnifVocabulary.load_vocabulary() 14 14 3

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
"""Vocabulary management functionality for Annif"""
2
3
import os.path
4
import annif
5
import annif.corpus
6
import annif.util
7
from annif.datadir import DatadirMixin
8
from annif.exception import NotInitializedException
9
10
# reuse the logger object exposed by the top-level annif package
logger = annif.logger
11
12
13 View Code Duplication
class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    The subject index is stored as a CSV file and the SKOS graph as a
    Turtle file in the vocabulary's data directory; a graph dump file, when
    present, is tried first when loading the graph. Both the subject index
    and the graph are loaded lazily and cached on the instance.
    """

    # defaults for uninitialized instances
    _subjects = None

    # constants
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    def __init__(self, vocab_id, datadir):
        """Set up the vocabulary identified by vocab_id.

        Nothing is read from disk here; the subject index and the SKOS
        graph are loaded on first access.
        """
        # presumably this makes self.datadir point at <datadir>/vocabs/<id>;
        # confirm against DatadirMixin
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self.vocab_id = vocab_id
        self._skos_vocab = None

    def _create_subject_index(self, subject_corpus):
        """Build a fresh subject index from subject_corpus, save it as the
        CSV index file and return it."""
        subjects = annif.corpus.SubjectIndex()
        subjects.load_subjects(subject_corpus)
        annif.util.atomic_save(subjects, self.datadir,
                               self.INDEX_FILENAME_CSV)
        return subjects

    def _update_subject_index(self, subject_corpus):
        """Merge subject_corpus into the existing subject index, save the
        result as the CSV index file and return it.

        Subjects of the old index are visited in order: one that still
        exists in the new corpus is replaced by its new version, one that
        has disappeared is replaced by a placeholder carrying only its URI.
        Subjects that are new in the corpus are appended afterwards.

        Raises NotInitializedException (via self.subjects) if there is no
        existing subject index file.
        """
        old_subjects = self.subjects
        new_subjects = annif.corpus.SubjectIndex()
        new_subjects.load_subjects(subject_corpus)
        updated_subjects = annif.corpus.SubjectIndex()

        for old_subject in old_subjects:
            if new_subjects.contains_uri(old_subject.uri):
                new_subject = new_subjects[new_subjects.by_uri(
                    old_subject.uri)]
            else:  # subject removed from new corpus
                # keep an entry for the URI, but without labels or notation
                new_subject = annif.corpus.Subject(uri=old_subject.uri,
                                                   labels=None,
                                                   notation=None)
            updated_subjects.append(new_subject)
        # subjects that did not exist in the old index go last
        for new_subject in new_subjects:
            if not old_subjects.contains_uri(new_subject.uri):
                updated_subjects.append(new_subject)
        annif.util.atomic_save(updated_subjects, self.datadir,
                               self.INDEX_FILENAME_CSV)
        return updated_subjects

    @property
    def subjects(self):
        """return the subject index, loading it from the CSV index file on
        first access

        Raises NotInitializedException if the index file does not exist.
        """
        if self._subjects is None:
            path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
            if os.path.exists(path):
                logger.debug('loading subjects from %s', path)
                self._subjects = annif.corpus.SubjectIndex.load(path)
            else:
                raise NotInitializedException(
                    "subject file {} not found".format(path))
        return self._subjects

    @property
    def skos(self):
        """return the subject vocabulary from SKOS file

        The graph dump file is tried first; if it is missing or cannot be
        loaded, the Turtle file is parsed instead. The result is cached on
        the instance.

        Raises NotInitializedException if the Turtle file does not exist
        (and no usable dump was found).
        """
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug('loading graph dump from %s', dumppath)
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug('could not load graph dump, using turtle file')
            else:
                return self._skos_vocab

        # graph dump file missing or unusable - parse ttl file instead
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if os.path.exists(path):
            logger.debug('loading graph from %s', path)
            self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
            # store the dump file so we can use it next time
            # NOTE(review): save_skos is given the Turtle path here;
            # presumably it also writes the dump file next to it - confirm
            self._skos_vocab.save_skos(path)
            return self._skos_vocab

        raise NotInitializedException(f'graph file {path} not found')

    def __len__(self):
        """return the number of subjects in the subject index"""
        return len(self.subjects)

    @property
    def languages(self):
        """return the languages attribute of the subject index"""
        return self.subjects.languages

    def load_vocabulary(self, subject_corpus, force=False):
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely."""

        if not force and os.path.exists(
                os.path.join(self.datadir, self.INDEX_FILENAME_CSV)):
            logger.info('updating existing vocabulary')
            self._subjects = self._update_subject_index(subject_corpus)
        else:
            self._subjects = self._create_subject_index(subject_corpus)

        subject_corpus.save_skos(
            os.path.join(self.datadir, self.INDEX_FILENAME_TTL))
        # the SKOS file on disk has just been rewritten, so drop any graph
        # cached from the previous version; it is reloaded on next access
        self._skos_vocab = None

    def as_graph(self):
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph
125