Passed
Pull Request — master (#536)
by Juho
03:39
created

annif.vocab.AnnifVocabulary.skos()   B

Complexity

Conditions 6

Size

Total Lines 30
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 21
nop 1
dl 0
loc 30
rs 8.4426
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
import os.path
4
import annif
5
import annif.corpus
6
import annif.util
7
from annif.datadir import DatadirMixin
8
from annif.exception import NotInitializedException
9
10
# reuse the logger object exposed by the annif package for all messages here
logger = annif.logger
11
12
13
class AnnifVocabulary(DatadirMixin):
    """Class representing a subject vocabulary which can be used by multiple
    Annif projects.

    The vocabulary works with three files under ``self.datadir``: the
    ``subjects`` index, the ``subjects.ttl`` SKOS/Turtle file and the
    ``subjects.dump.gz`` graph dump. The subject index and the SKOS
    vocabulary are both loaded on first access and then cached on the
    instance.
    """

    # defaults for uninitialized instances
    _subjects = None

    def __init__(self, vocab_id, datadir, language):
        """Set up the vocabulary identified by vocab_id.

        The datadir is handed to DatadirMixin together with the 'vocabs'
        type name and the vocabulary id; nothing is loaded from disk yet.
        """
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self.vocab_id = vocab_id
        self.language = language
        self._skos_vocab = None

    def _create_subject_index(self, subject_corpus):
        """build a new subject index from the corpus and save it as the
        'subjects' file in the data directory"""
        self._subjects = annif.corpus.SubjectIndex(subject_corpus)
        annif.util.atomic_save(self._subjects, self.datadir, 'subjects')

    def _update_subject_index(self, subject_corpus):
        """merge the subjects of the corpus into the existing subject index
        and save the result as the 'subjects' file

        Every subject of the existing index is kept, in its original order:
        subjects that still exist in the new corpus get their label and
        notation refreshed, subjects that were removed get None for both.
        Subjects that only exist in the new corpus are appended at the end.
        """
        old_subjects = self.subjects
        new_subjects = annif.corpus.SubjectIndex(subject_corpus)
        updated_subjects = annif.corpus.SubjectIndex()

        # NOTE(review): old entries are never dropped, presumably so that
        # existing positions in the index stay stable - confirm with callers
        for uri, _old_label, _old_notation in old_subjects:
            if new_subjects.contains_uri(uri):
                label, notation = new_subjects[new_subjects.by_uri(uri)][1:3]
            else:  # subject removed from new corpus
                label, notation = None, None
            updated_subjects.append(uri, label, notation)

        for uri, label, notation in new_subjects:
            if not old_subjects.contains_uri(uri):
                updated_subjects.append(uri, label, notation)

        self._subjects = updated_subjects
        annif.util.atomic_save(self._subjects, self.datadir, 'subjects')

    @property
    def subjects(self):
        """return the subject index, loading it from the 'subjects' file on
        first access

        Raises NotInitializedException if the file does not exist.
        """
        if self._subjects is None:
            path = os.path.join(self.datadir, 'subjects')
            if os.path.exists(path):
                logger.debug('loading subjects from %s', path)
                self._subjects = annif.corpus.SubjectIndex.load(path)
            else:
                raise NotInitializedException(
                    "subject file {} not found".format(path))
        return self._subjects

    @property
    def skos(self):
        """return the subject vocabulary from SKOS file

        The graph dump file is tried first; if it is missing or cannot be
        loaded, the Turtle file is parsed instead. The result is cached on
        the instance. Raises NotInitializedException if the Turtle file does
        not exist either.
        """
        if self._skos_vocab is not None:
            return self._skos_vocab

        # attempt to load graph from dump file
        dumppath = os.path.join(self.datadir, 'subjects.dump.gz')
        if os.path.exists(dumppath):
            logger.debug('loading graph dump from %s', dumppath)
            try:
                self._skos_vocab = annif.corpus.SubjectFileSKOS(dumppath,
                                                                self.language)
            except ModuleNotFoundError:
                # Probably dump has been saved using a different rdflib version
                logger.debug('could not load graph dump, using turtle file')
            else:
                return self._skos_vocab

        # graph dump file not found or unusable - parse ttl file instead
        path = os.path.join(self.datadir, 'subjects.ttl')
        if os.path.exists(path):
            logger.debug('loading graph from %s', path)
            self._skos_vocab = annif.corpus.SubjectFileSKOS(path,
                                                            self.language)
            # store the dump file so we can use it next time
            # NOTE(review): save_skos is given the .ttl path; presumably it
            # writes the dump next to it - confirm in SubjectFileSKOS
            self._skos_vocab.save_skos(path, self.language)
            return self._skos_vocab

        raise NotInitializedException(f'graph file {path} not found')

    def load_vocabulary(self, subject_corpus, language):
        """load subjects from a subject corpus and save them into a
        SKOS/Turtle file for later use

        An existing 'subjects' index file is updated in place of being
        replaced; otherwise a new index is created.
        """

        if os.path.exists(os.path.join(self.datadir, 'subjects')):
            logger.info('updating existing vocabulary')
            self._update_subject_index(subject_corpus)
        else:
            self._create_subject_index(subject_corpus)
        # NOTE(review): a previously cached self._skos_vocab is not reset
        # here - confirm whether callers rely on as_graph() after a reload
        subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'),
                                 language)

    def as_skos_file(self):
        """return the vocabulary as a file object, in SKOS/Turtle syntax

        The file is opened in binary mode and is not closed here; closing it
        is left to the caller.
        """
        return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb')

    def as_graph(self):
        """return the vocabulary as an rdflib graph"""
        return self.skos.graph
109