Passed
Pull Request — master (#608)
by Osma
02:49
created

annif.vocab.AnnifVocabulary._index_filename()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 3

Duplication

Lines 3
Ratio 100 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 1
dl 3
loc 3
rs 10
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
import os.path
4
import re
5
import annif
6
import annif.corpus
7
import annif.util
8
from annif.datadir import DatadirMixin
9
from annif.exception import NotInitializedException
10
from annif.util import parse_args
11
12
logger = annif.logger
13
14
15
def get_vocab(vocab_spec, datadir, default_language):
    """Create an AnnifVocabulary from a vocabulary specification string.

    The specification is a vocabulary identifier made of word characters,
    optionally followed by a parenthesized argument list, e.g. ``yso`` or
    ``yso(en)``. The first positional argument, when present, selects the
    language; otherwise default_language is used.

    Raises ValueError if vocab_spec as a whole does not have that form.
    """
    # fullmatch instead of match: a prefix-only match would silently accept
    # malformed specifications such as "yso(en" or "my-vocab" and discard
    # everything after the part that happened to match
    match = re.fullmatch(r'(\w+)(\((.*)\))?', vocab_spec)
    if match is None:
        raise ValueError(f"Invalid vocabulary specification: {vocab_spec}")
    vocab_id = match.group(1)
    # group(3) is None when no parenthesized part was given; this assumes
    # parse_args accepts None -- TODO confirm against annif.util.parse_args.
    # Keyword arguments are parsed but not used here.
    posargs, _ = parse_args(match.group(3))
    language = posargs[0] if posargs else default_language

    return AnnifVocabulary(vocab_id, datadir, language)
24
25
26 View Code Duplication
class AnnifVocabulary(DatadirMixin):
    """A subject vocabulary that can be shared by several Annif projects.

    The vocabulary is stored in a per-vocabulary data directory as a subject
    index file plus a SKOS/Turtle file (and a graph dump file); both the
    subject index and the SKOS vocabulary are loaded lazily and cached on
    the instance."""

    # file names used inside the vocabulary data directory
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    # class-level default: subject index not loaded yet
    _subjects = None

    def __init__(self, vocab_id, datadir, language):
        """Set up the vocabulary with its identifier, the base data
        directory and its language. Nothing is loaded at this point."""
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self._skos_vocab = None
        self.language = language
        self.vocab_id = vocab_id

    def _create_subject_index(self, subject_corpus):
        """Build a new subject index from the given corpus, save it as the
        index file in the data directory and return it."""
        index = annif.corpus.SubjectIndex()
        index.load_subjects(subject_corpus)
        annif.util.atomic_save(
            index, self.datadir, self.INDEX_FILENAME_CSV)
        return index

    def _update_subject_index(self, subject_corpus):
        """Merge the given corpus into the existing subject index, save the
        merged index as the index file and return it.

        Every URI of the existing index keeps its place in iteration order:
        it gets the entry from the new corpus if the URI is still there,
        otherwise an entry with no labels and no notation. Subjects that
        only exist in the new corpus are added after those."""
        existing = self.subjects
        incoming = annif.corpus.SubjectIndex()
        incoming.load_subjects(subject_corpus)
        merged = annif.corpus.SubjectIndex()

        # first pass: one entry per previously known URI
        for known in existing:
            uri = known.uri
            if not incoming.contains_uri(uri):
                # subject removed from new corpus
                entry = annif.corpus.Subject(
                    uri=uri, labels=None, notation=None)
            else:
                entry = incoming[incoming.by_uri(uri)]
            merged.append(entry)

        # second pass: subjects that were not in the old index
        for candidate in incoming:
            if existing.contains_uri(candidate.uri):
                continue
            merged.append(candidate)

        annif.util.atomic_save(
            merged, self.datadir, self.INDEX_FILENAME_CSV)
        return merged

    @property
    def subjects(self):
        """return the subject index, loading it from the index file on
        first access; raises NotInitializedException if the file does not
        exist"""
        if self._subjects is not None:
            return self._subjects

        path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        if not os.path.exists(path):
            raise NotInitializedException(
                "subject file {} not found".format(path))

        logger.debug('loading subjects from %s', path)
        self._subjects = annif.corpus.SubjectIndex.load(path)
        return self._subjects

    @property
    def skos(self):
        """return the subject vocabulary from SKOS file

        The dump file is tried first, then the Turtle file; the result is
        cached. Raises NotInitializedException if no usable file exists."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # first choice: the graph dump file, if there is one
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f'loading graph dump from {dumppath}')
            try:
                from_dump = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib
                # version; fall through to the turtle file below
                logger.debug('could not load graph dump, using turtle file')
            else:
                self._skos_vocab = from_dump
                return from_dump

        # no usable dump - the turtle file is the only remaining source
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if not os.path.exists(path):
            raise NotInitializedException(f'graph file {path} not found')

        logger.debug(f'loading graph from {path}')
        from_ttl = annif.corpus.SubjectFileSKOS(path)
        self._skos_vocab = from_ttl
        # NOTE(review): save_skos is given the turtle path; presumably it
        # also writes the dump file for faster loading next time - verify
        # against SubjectFileSKOS.save_skos
        from_ttl.save_skos(path)
        return from_ttl

    def load_vocabulary(self, subject_corpus, force=False):
        """Load subjects from a subject corpus and store them in the
        subject index file, then save the corpus as a SKOS/Turtle file for
        later use. An existing subject index is updated unless force=True,
        in which case it is replaced completely."""

        if force or not os.path.exists(
                os.path.join(self.datadir, self.INDEX_FILENAME_CSV)):
            self._subjects = self._create_subject_index(subject_corpus)
        else:
            logger.info('updating existing vocabulary')
            self._subjects = self._update_subject_index(subject_corpus)

        ttl_target = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        subject_corpus.save_skos(ttl_target)

    def as_graph(self):
        """return the vocabulary as an rdflib graph"""
        vocab = self.skos
        return vocab.graph
132