Passed
Pull Request — master (#600)
by Osma
02:56
created

annif.vocab.AnnifVocabulary.subjects()   A

Complexity

Conditions 3

Size

Total Lines 12
Code Lines 11

Duplication

Lines 12
Ratio 100 %

Importance

Changes 0
Metric Value
cc 3
eloc 11
nop 1
dl 12
loc 12
rs 9.85
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
import os.path
4
import re
5
import annif
6
import annif.corpus
7
import annif.util
8
from annif.datadir import DatadirMixin
9
from annif.exception import NotInitializedException
10
from annif.util import parse_args
11
12
# Module-level alias for the logger object exposed as annif.logger
logger = annif.logger
13
14
15
def get_vocab(vocab_spec, datadir, default_language):
    """Create an AnnifVocabulary from a vocabulary specification string.

    The specification consists of a vocabulary ID made of word characters,
    optionally followed by a parenthesized argument string, for example
    ``yso`` or ``yso(en)``. The argument string is handed to parse_args();
    the first positional argument, if any, is used as the vocabulary
    language, otherwise default_language is used.

    Raises ValueError if vocab_spec as a whole does not follow this syntax.
    """
    # fullmatch instead of match: a prefix-only match would silently accept
    # specifications with trailing garbage (e.g. "yso-en" or "yso(en") and
    # quietly resolve them to the vocabulary "yso"
    match = re.fullmatch(r'(\w+)(\((.*)\))?', vocab_spec)
    if match is None:
        raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")
    vocab_id = match.group(1)
    # group(3) is None when no parenthesized part was given; keyword
    # arguments are not used here
    posargs, _ = parse_args(match.group(3))
    language = posargs[0] if posargs else default_language

    return AnnifVocabulary(vocab_id, datadir, language)
24
25
26 View Code Duplication
class AnnifVocabulary(DatadirMixin):
    """A subject vocabulary that may be shared by several Annif projects.

    The vocabulary keeps its files (per-language subject index files and a
    SKOS/Turtle file) under its own data directory."""

    # class-level default so that the attribute exists before any subject
    # index has been loaded
    _subjects = None

    def __init__(self, vocab_id, datadir, language):
        """Set up the vocabulary identified by vocab_id; its data directory
        is initialized via DatadirMixin with the type name 'vocabs'."""
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self._skos_vocab = None
        self.language = language
        self.vocab_id = vocab_id

    @staticmethod
    def _index_filename(language):
        """Return the file name of the subject index for the language."""
        return f"subjects.{language}.tsv"

    def _create_subject_index(self, subject_corpus, language):
        """Build a new subject index from the corpus for the given language,
        save it into the data directory and return it."""
        index = annif.corpus.SubjectIndex()
        index.load_subjects(subject_corpus, language)
        annif.util.atomic_save(
            index, self.datadir, self._index_filename(language))
        return index

    def _update_subject_index(self, subject_corpus, language):
        """Merge the subjects of the corpus into the existing subject index,
        save the result into the data directory and return it.

        Existing entries keep their position. An entry whose URI is absent
        from the corpus is kept with label and notation set to None. URIs
        that are new in the corpus are appended after the existing ones.
        The existing index is the one returned by the subjects property."""
        existing = self.subjects
        incoming = annif.corpus.SubjectIndex()
        incoming.load_subjects(subject_corpus, language)
        merged = annif.corpus.SubjectIndex()

        # first pass: carry over every existing URI, refreshing its data
        for uri, _old_label, _old_notation in existing:
            if not incoming.contains_uri(uri):
                # subject removed from new corpus
                label = notation = None
            else:
                position = incoming.by_uri(uri)
                # fields 1-2 of the entry are used as (label, notation)
                label, notation = incoming[position][1:3]
            merged.append(uri, label, notation)

        # second pass: add the subjects that were not known before
        for uri, label, notation in incoming:
            if existing.contains_uri(uri):
                continue
            merged.append(uri, label, notation)

        annif.util.atomic_save(
            merged, self.datadir, self._index_filename(language))
        return merged

    @property
    def subjects(self):
        """The subject index for the vocabulary language, loaded from the
        data directory on first access and cached afterwards.

        Raises NotInitializedException if the index file does not exist."""
        if self._subjects is not None:
            return self._subjects

        index_path = os.path.join(
            self.datadir, self._index_filename(self.language))
        if not os.path.exists(index_path):
            raise NotInitializedException(
                "subject file {} not found".format(index_path))

        logger.debug('loading subjects from %s', index_path)
        self._subjects = annif.corpus.SubjectIndex.load(index_path)
        return self._subjects

    @property
    def skos(self):
        """The subject vocabulary as a SubjectFileSKOS object, loaded on
        first access and cached afterwards.

        The graph dump file is tried first; if it is missing or cannot be
        loaded, the Turtle file is parsed instead. Raises
        NotInitializedException if the Turtle file does not exist."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # try the graph dump file first
        dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
        if os.path.exists(dumppath):
            logger.debug(f'loading graph dump from {dumppath}')
            try:
                from_dump = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug('could not load graph dump, using turtle file')
            else:
                self._skos_vocab = from_dump
                return from_dump

        # no usable graph dump - fall back to parsing the ttl file
        path = os.path.join(self.datadir, 'subjects.ttl')
        if not os.path.exists(path):
            raise NotInitializedException(f'graph file {path} not found')

        logger.debug(f'loading graph from {path}')
        from_turtle = annif.corpus.SubjectFileSKOS(path)
        self._skos_vocab = from_turtle
        # store the dump file so we can use it next time
        from_turtle.save_skos(path, self.language)
        return from_turtle

    def load_vocabulary(self, subject_corpus, default_language, force=False):
        """Read the subjects of a subject corpus and store them as one
        subject index file per language plus a SKOS/Turtle file.

        An already existing subject index is updated, unless force=True,
        in which case it is replaced completely."""

        # subject corpus may not be language-aware or able to detect its
        # languages; then fall back to the given default language
        languages = subject_corpus.languages or [default_language]

        for lang in languages:
            if force or not os.path.exists(
                    os.path.join(self.datadir, self._index_filename(lang))):
                index = self._create_subject_index(subject_corpus, lang)
            else:
                logger.info('updating existing vocabulary')
                index = self._update_subject_index(subject_corpus, lang)

            # only the index of the default language is kept in memory
            if lang == default_language:
                self._subjects = index

        turtle_path = os.path.join(self.datadir, 'subjects.ttl')
        subject_corpus.save_skos(turtle_path, default_language)

    def as_skos_file(self):
        """Open the SKOS/Turtle file of the vocabulary in binary read mode
        and return the file object."""
        turtle_path = os.path.join(self.datadir, 'subjects.ttl')
        return open(turtle_path, 'rb')

    def as_graph(self):
        """Return the graph attribute of the SKOS vocabulary (an rdflib
        graph)."""
        vocab = self.skos
        return vocab.graph
144