Passed
Pull Request — master (#600)
by Osma
02:56
created

annif.vocab.AnnifVocabulary.load_vocabulary()   B

Complexity

Conditions 6

Size

Total Lines 24
Code Lines 15

Duplication

Lines 24
Ratio 100 %

Importance

Changes 0
Metric Value
cc 6
eloc 15
nop 4
dl 24
loc 24
rs 8.6666
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
import os.path
4
import annif
5
import annif.corpus
6
import annif.util
7
from annif.datadir import DatadirMixin
8
from annif.exception import NotInitializedException
9
10
logger = annif.logger
11
12
13 View Code Duplication
class AnnifVocabulary(DatadirMixin):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
14
    """Class representing a subject vocabulary which can be used by multiple
15
    Annif projects."""
16
17
    # defaults for uninitialized instances
18
    _subjects = None
19
20
    def __init__(self, vocab_id, datadir, language):
        """Set up a vocabulary with the given id, data directory and
        language.

        The datadir, the literal 'vocabs' and the vocab_id are handed over
        to DatadirMixin; the SKOS vocabulary starts out unloaded (None)."""
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        # no SKOS vocabulary has been loaded yet
        self._skos_vocab = None
        self.language = language
        self.vocab_id = vocab_id
25
26
    @staticmethod
27
    def _index_filename(language):
28
        return f"subjects.{language}.tsv"
29
30
    def _create_subject_index(self, subject_corpus, language):
        """Build a new subject index from the subject corpus for the given
        language, save it into the data directory and return it."""
        index = annif.corpus.SubjectIndex()
        index.load_subjects(subject_corpus, language)

        # written under the per-language index file name
        target_dir = self.datadir
        filename = self._index_filename(language)
        annif.util.atomic_save(index, target_dir, filename)
        return index
36
37
    def _update_subject_index(self, subject_corpus, language):
        """Merge the subjects of the corpus into the existing subject index,
        save the result into the data directory and return it.

        Subjects already present keep their position; their label and
        notation are refreshed from the corpus, or set to None when the
        subject is no longer in the corpus. Subjects that are new in the
        corpus are appended after the existing ones."""
        # NOTE(review): the existing index comes from self.subjects, which
        # is read using self.language rather than the language argument -
        # confirm this is intended when the two differ
        previous = self.subjects
        incoming = annif.corpus.SubjectIndex()
        incoming.load_subjects(subject_corpus, language)
        merged = annif.corpus.SubjectIndex()

        # first pass: every previously known subject, in its old order
        for uri, label, notation in previous:
            # assume the subject was removed from the new corpus ...
            label = notation = None
            if incoming.contains_uri(uri):
                # ... unless the corpus still has it: take the fresh values
                label, notation = incoming[incoming.by_uri(uri)][1:3]
            merged.append(uri, label, notation)

        # second pass: subjects that did not exist before
        for uri, label, notation in incoming:
            if previous.contains_uri(uri):
                continue
            merged.append(uri, label, notation)

        target_dir = self.datadir
        filename = self._index_filename(language)
        annif.util.atomic_save(merged, target_dir, filename)
        return merged
55
56
    @property
57
    def subjects(self):
58
        if self._subjects is None:
59
            path = os.path.join(self.datadir,
60
                                self._index_filename(self.language))
61
            if os.path.exists(path):
62
                logger.debug('loading subjects from %s', path)
63
                self._subjects = annif.corpus.SubjectIndex.load(path)
64
            else:
65
                raise NotInitializedException(
66
                    "subject file {} not found".format(path))
67
        return self._subjects
68
69
    @property
70
    def skos(self):
71
        """return the subject vocabulary from SKOS file"""
72
        if self._skos_vocab is not None:
73
            return self._skos_vocab
74
75
        # attempt to load graph from dump file
76
        dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
77
        if os.path.exists(dumppath):
78
            logger.debug(f'loading graph dump from {dumppath}')
79
            try:
80
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath)
81
            except ModuleNotFoundError:
82
                # Probably dump has been saved using a different rdflib version
83
                logger.debug('could not load graph dump, using turtle file')
84
            else:
85
                return self._skos_vocab
86
87
        # graph dump file not found - parse ttl file instead
88
        path = os.path.join(self.datadir, 'subjects.ttl')
89
        if os.path.exists(path):
90
            logger.debug(f'loading graph from {path}')
91
            self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
92
            # store the dump file so we can use it next time
93
            self._skos_vocab.save_skos(path, self.language)
94
            return self._skos_vocab
95
96
        raise NotInitializedException(f'graph file {path} not found')
97
98
    def load_vocabulary(self, subject_corpus, project_language, force=False):
        """Load subjects from a subject corpus and save them into one
        or more subject index files as well as a SKOS/Turtle file for later
        use. If force=True, replace the existing subject index completely.

        When the corpus reports no languages (None), only the project
        language is processed. The index built for the project language
        becomes the in-memory subject index of this vocabulary."""

        corpus_languages = subject_corpus.languages
        # subject corpus isn't language-aware, default to project language
        languages = ([project_language] if corpus_languages is None
                     else corpus_languages)

        for language in languages:
            # build from scratch when forced or when no index exists yet;
            # the existence check is skipped entirely when force is set
            if force or not os.path.exists(
                    os.path.join(self.datadir,
                                 self._index_filename(language))):
                index = self._create_subject_index(subject_corpus, language)
            else:
                logger.info('updating existing vocabulary')
                index = self._update_subject_index(subject_corpus, language)

            if language == project_language:
                self._subjects = index

        ttl_path = os.path.join(self.datadir, 'subjects.ttl')
        subject_corpus.save_skos(ttl_path, project_language)
122
123
    def as_skos_file(self):
        """return the vocabulary as a file object, in SKOS/Turtle syntax

        The 'subjects.ttl' file of the data directory is opened in binary
        read mode and returned still open; nothing here closes it."""
        ttl_path = os.path.join(self.datadir, 'subjects.ttl')
        return open(ttl_path, 'rb')
126
127
    def as_graph(self):
128
        """return the vocabulary as an rdflib graph"""
129
        return self.skos.graph
130