Passed
Pull Request — master (#640)
by Juho
02:38
created

AnnifVocabulary._update_subject_index()   A

Complexity

Conditions 5

Size

Total Lines 19
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 16
nop 2
dl 0
loc 19
rs 9.1333
c 0
b 0
f 0
1
"""Vocabulary management functionality for Annif"""
2
3
import os.path
4
import annif
5
import annif.corpus
6
import annif.util
7
from annif.datadir import DatadirMixin
8
from annif.exception import NotInitializedException
9
10
# module-wide logger: reuse the logger object exposed by the annif package
logger = annif.logger
11
12
13
class AnnifVocabulary(DatadirMixin):
    """A subject vocabulary kept in its own data directory, so that it can
    be shared by multiple Annif projects."""

    # file names used inside the vocabulary data directory
    INDEX_FILENAME_DUMP = "subjects.dump.gz"
    INDEX_FILENAME_TTL = "subjects.ttl"
    INDEX_FILENAME_CSV = "subjects.csv"

    # class-level default; the `subjects` property treats None as
    # "subject index not loaded yet"
    _subjects = None

    def __init__(self, vocab_id, datadir):
        """Set up the vocabulary identified by vocab_id below datadir.

        The SKOS vocabulary is not loaded here; it is loaded on first
        access of the `skos` property."""
        DatadirMixin.__init__(self, datadir, "vocabs", vocab_id)
        self._skos_vocab = None
        self.vocab_id = vocab_id

    def _create_subject_index(self, subject_corpus):
        """Build a fresh subject index from subject_corpus, save it as the
        CSV index file in the data directory and return it."""
        fresh_index = annif.corpus.SubjectIndex()
        fresh_index.load_subjects(subject_corpus)
        annif.util.atomic_save(fresh_index, self.datadir, self.INDEX_FILENAME_CSV)
        return fresh_index

    def _update_subject_index(self, subject_corpus):
        """Merge the subjects of subject_corpus into the existing index.

        Subjects already in the existing index come first, in their existing
        order, taking their data from the new corpus; those missing from the
        new corpus are kept as a Subject carrying only the URI. Subjects
        that appear only in the new corpus are appended afterwards. The
        merged index is saved as the CSV index file and returned."""
        # read the existing index first: this raises if it cannot be loaded
        previous = self.subjects
        incoming = annif.corpus.SubjectIndex()
        incoming.load_subjects(subject_corpus)
        merged = annif.corpus.SubjectIndex()

        # pass 1: walk the existing subjects in order
        for known in previous:
            uri = known.uri
            if incoming.contains_uri(uri):
                entry = incoming[incoming.by_uri(uri)]
            else:
                # subject removed from new corpus: keep a bare entry
                entry = annif.corpus.Subject(uri=uri, labels=None, notation=None)
            merged.append(entry)

        # pass 2: add the subjects that did not exist before
        for candidate in incoming:
            if previous.contains_uri(candidate.uri):
                continue
            merged.append(candidate)

        annif.util.atomic_save(merged, self.datadir, self.INDEX_FILENAME_CSV)
        return merged

    @property
    def subjects(self):
        """The subject index of this vocabulary, loaded from the CSV index
        file on first access and cached afterwards.

        Raises NotInitializedException if the index file does not exist."""
        if self._subjects is not None:
            return self._subjects

        path = os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        if not os.path.exists(path):
            raise NotInitializedException("subject file {} not found".format(path))

        logger.debug("loading subjects from %s", path)
        self._subjects = annif.corpus.SubjectIndex.load(path)
        return self._subjects

    @property
    def skos(self):
        """The subject vocabulary as a SubjectFileSKOS object, loaded on
        first access and cached afterwards.

        The graph dump file is tried first, then the Turtle file. Raises
        NotInitializedException if no usable file is found."""
        if self._skos_vocab is not None:
            return self._skos_vocab

        # first choice: the graph dump file
        dumppath = os.path.join(self.datadir, self.INDEX_FILENAME_DUMP)
        if os.path.exists(dumppath):
            logger.debug(f"loading graph dump from {dumppath}")
            try:
                dump_vocab = annif.corpus.SubjectFileSKOS(dumppath)
            except ModuleNotFoundError:
                # the dump was probably saved using a different rdflib
                # version; fall through to the Turtle file below
                logger.debug("could not load graph dump, using turtle file")
            else:
                self._skos_vocab = dump_vocab
                return self._skos_vocab

        # second choice: parse the Turtle file
        path = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        if not os.path.exists(path):
            raise NotInitializedException(f"graph file {path} not found")

        logger.debug(f"loading graph from {path}")
        self._skos_vocab = annif.corpus.SubjectFileSKOS(path)
        # save_skos is expected to store a dump file for use next time
        self._skos_vocab.save_skos(path)
        return self._skos_vocab

    def __len__(self):
        """Return the length of the subject index."""
        subject_index = self.subjects
        return len(subject_index)

    @property
    def languages(self):
        """The `languages` attribute of the subject index."""
        subject_index = self.subjects
        return subject_index.languages

    def load_vocabulary(self, subject_corpus, force=False):
        """Load subjects from subject_corpus into the subject index and
        save the corpus as a SKOS/Turtle file for later use.

        An existing subject index is updated, unless force=True, in which
        case it is replaced completely. When no index exists yet, a new
        one is created."""
        if force or not os.path.exists(
            os.path.join(self.datadir, self.INDEX_FILENAME_CSV)
        ):
            logger.info("creating subject index")
            self._subjects = self._create_subject_index(subject_corpus)
        else:
            logger.info("updating existing subject index")
            self._subjects = self._update_subject_index(subject_corpus)

        skosfile = os.path.join(self.datadir, self.INDEX_FILENAME_TTL)
        logger.info(f"saving vocabulary into SKOS file {skosfile}")
        subject_corpus.save_skos(skosfile)

    def as_graph(self):
        """Return the `graph` attribute of the SKOS vocabulary (an rdflib
        graph)."""
        skos_vocab = self.skos
        return skos_vocab.graph
124