annif.vocab.skos.serialize_subjects_to_skos()   A
last analyzed

Complexity

Conditions 3

Size

Total Lines 28
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 20
nop 2
dl 0
loc 28
rs 9.4
c 0
b 0
f 0
1
"""Classes for supporting vocabulary files in SKOS/RDF format"""
2
3
from __future__ import annotations
4
5
import collections
6
import os.path
7
import shutil
8
from typing import TYPE_CHECKING
9
10
import rdflib
11
import rdflib.util
12
from rdflib.namespace import OWL, RDF, RDFS, SKOS
13
14
import annif.util
15
16
from .types import Subject, VocabSource
17
18
if TYPE_CHECKING:
19
    from collections.abc import Iterator, Sequence
20
21
    from rdflib.term import URIRef
22
23
24
def serialize_subjects_to_skos(subjects: Iterator, path: str) -> None:
    """Create a SKOS representation of the given subjects and serialize it
    into a SKOS/Turtle file with the given path name.

    Each subject becomes a skos:Concept carrying one skos:prefLabel per
    entry of its ``labels`` mapping (language code -> label) and, when the
    subject has a notation, one skos:notation. Besides the Turtle file, the
    graph is also dumped with joblib next to it: the dump file name is
    ``path`` with a trailing ".ttl" swapped for ".dump.gz" (or with
    ".dump.gz" appended when ``path`` does not end in ".ttl").
    """
    import joblib

    graph = rdflib.Graph()
    graph.namespace_manager.bind("skos", SKOS)
    for subject in subjects:
        concept = rdflib.URIRef(subject.uri)
        graph.add((concept, RDF.type, SKOS.Concept))
        for lang, label in subject.labels.items():
            graph.add((concept, SKOS.prefLabel, rdflib.Literal(label, lang)))
        # Skip subjects without a notation instead of wrapping None in a
        # Literal, so that no bogus skos:notation value ends up in the
        # vocabulary.
        if subject.notation is not None:
            graph.add((concept, SKOS.notation, rdflib.Literal(subject.notation)))
    graph.serialize(destination=path, format="turtle")

    # also dump the graph in joblib format which is faster to load
    # Only the file extension is rewritten: str.replace() would also touch
    # ".ttl" inside directory names, and would leave the path unchanged (so
    # the dump would overwrite the Turtle file) when there is no ".ttl".
    if path.endswith(".ttl"):
        dump_path = path[: -len(".ttl")] + ".dump.gz"
    else:
        dump_path = path + ".dump.gz"
    annif.util.atomic_save(graph, *os.path.split(dump_path), method=joblib.dump)
53
54
55
class VocabFileSKOS(VocabSource):
    """A subject corpus that uses SKOS files"""

    # Properties that are consulted, in this order, when collecting the
    # preferred labels of a concept.
    PREF_LABEL_PROPERTIES = (SKOS.prefLabel, RDFS.label)

    # Cache for the `languages` property; filled on first access.
    _languages = None

    def __init__(self, path: str) -> None:
        """Load the vocabulary graph from the file with the given path.

        A path ending in ".dump.gz" is loaded as a joblib dump; any other
        path is parsed with rdflib, using the format guessed from the path.
        """
        self.path = path
        if path.endswith(".dump.gz"):
            import joblib

            # NOTE(review): joblib.load unpickles the file, so dump files
            # must only come from a trusted source.
            self.graph = joblib.load(path)
        else:
            self.graph = rdflib.Graph()
            self.graph.parse(self.path, format=rdflib.util.guess_format(self.path))

    @property
    def languages(self) -> set[str]:
        """the set of language codes found on the labels (as given by
        PREF_LABEL_PROPERTIES) of the non-deprecated concepts; computed on
        first access and cached afterwards"""
        if self._languages is None:
            self._languages = {
                label.language
                for concept in self.concepts
                for label_type in self.PREF_LABEL_PROPERTIES
                for label in self.graph.objects(concept, label_type)
                if label.language is not None
            }
        return self._languages

    def _concept_labels(self, concept: URIRef) -> dict[str, str]:
        """return one label of the given concept for every language of the
        vocabulary, falling back to a label without language tag and
        finally to the qualified name of the concept"""
        by_lang = self.get_concept_labels(concept, self.PREF_LABEL_PROPERTIES)
        labels = {}
        for lang in self.languages:
            if by_lang[lang]:  # correct lang
                labels[lang] = by_lang[lang][0]
            elif by_lang[None]:  # no language
                labels[lang] = by_lang[None][0]
            else:
                # no usable label at all: use the qualified name instead
                labels[lang] = self.graph.namespace_manager.qname(concept)
        return labels

    @property
    def subjects(self) -> Iterator[Subject]:
        """iterate over the non-deprecated concepts as Subject objects; the
        notation is a string, or None when the concept has no
        skos:notation"""
        for concept in self.concepts:
            labels = self._concept_labels(concept)

            notation = self.graph.value(concept, SKOS.notation, None, any=True)
            if notation is not None:
                notation = str(notation)

            yield Subject(uri=str(concept), labels=labels, notation=notation)

    @property
    def concepts(self) -> Iterator[URIRef]:
        """iterate over the resources typed as skos:Concept, leaving out
        those marked with owl:deprecated true"""
        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                continue
            yield concept

    def get_concept_labels(
        self,
        concept: URIRef,
        label_types: Sequence[URIRef],
    ) -> collections.defaultdict[str | None, list[str]]:
        """return all the labels of the given concept with the given label
        properties as a dict-like object where the keys are language codes
        and the values are lists of labels in that language"""
        labels_by_lang = collections.defaultdict(list)

        for label_type in label_types:
            for label in self.graph.objects(concept, label_type):
                # labels without a language tag are collected under None
                labels_by_lang[label.language].append(str(label))

        return labels_by_lang

    @staticmethod
    def is_rdf_file(path: str) -> bool:
        """return True if the path looks like an RDF file that can be loaded
        as SKOS"""

        fmt = rdflib.util.guess_format(path)
        return fmt is not None

    @staticmethod
    def _joblib_dump_path(path: str) -> str:
        """return the name of the joblib dump file that accompanies the
        Turtle file with the given path name"""
        # Only the file extension is rewritten: str.replace() would also
        # touch ".ttl" inside directory names, and would return the path
        # unchanged (so the dump would overwrite the Turtle file) when the
        # path contains no ".ttl".
        if path.endswith(".ttl"):
            return path[: -len(".ttl")] + ".dump.gz"
        return path + ".dump.gz"

    def save_skos(self, path: str) -> None:
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name. The graph is additionally dumped
        with joblib into a ".dump.gz" file next to it."""

        if self.path.endswith(".ttl"):
            # input is already in Turtle syntax, no need to reserialize
            # (and nothing to copy when source and target are the same file)
            if not os.path.exists(path) or not os.path.samefile(self.path, path):
                shutil.copyfile(self.path, path)
        else:
            # need to serialize into Turtle
            self.graph.serialize(destination=path, format="turtle")
        # also dump the graph in joblib format which is faster to load
        import joblib

        annif.util.atomic_save(
            self.graph,
            *os.path.split(self._joblib_dump_path(path)),
            method=joblib.dump,
        )
160