Passed
Pull Request — master (#608)
by Osma
02:49
created

SubjectFileSKOS._concept_labels()   A

Complexity

Conditions 3

Size

Total Lines 7
Code Lines 7

Duplication

Lines 7
Ratio 100 %

Importance

Changes 0
Metric Value
cc 3
eloc 7
nop 2
dl 7
loc 7
rs 10
c 0
b 0
f 0
1
"""Support for subjects loaded from a SKOS/RDF file"""
2
3
import collections
4
import os.path
5
import shutil
6
import joblib
7
import rdflib
8
import rdflib.util
9
from rdflib.namespace import SKOS, RDF, OWL, RDFS
10
import annif.util
11
from .types import Subject, SubjectCorpus
12
13
14 View Code Duplication
def _skos_dump_path(path):
    """Return the path of the joblib dump that accompanies the Turtle file
    at the given path. A trailing '.ttl' extension is swapped for
    '.dump.gz'; any other path gets '.dump.gz' appended, so the dump can
    never end up at the same location as the Turtle file itself."""

    base, ext = os.path.splitext(path)
    if ext == '.ttl':
        return base + '.dump.gz'
    return path + '.dump.gz'


def serialize_subjects_to_skos(subjects, path):
    """Create a SKOS representation of the given subjects and serialize it
    into a SKOS/Turtle file with the given path name.

    Each subject is expected to provide the attributes `uri`, `labels`
    (a dict mapping language codes to label strings) and `notation`
    (a string, or None when the subject has no notation).

    In addition to the Turtle file, the graph is dumped with joblib into a
    sibling file with the extension '.dump.gz'."""

    graph = rdflib.Graph()
    graph.namespace_manager.bind('skos', SKOS)
    for subject in subjects:
        concept = rdflib.URIRef(subject.uri)
        graph.add((concept, RDF.type, SKOS.Concept))
        for lang, label in subject.labels.items():
            graph.add((concept, SKOS.prefLabel, rdflib.Literal(label, lang)))
        # Only emit skos:notation when there is one; otherwise a missing
        # notation would be written out as a literal and would no longer
        # read back as None.
        if subject.notation is not None:
            graph.add((concept,
                       SKOS.notation,
                       rdflib.Literal(subject.notation)))
    graph.serialize(destination=path, format='turtle')
    # also dump the graph in joblib format which is faster to load
    annif.util.atomic_save(graph,
                           *os.path.split(_skos_dump_path(path)),
                           method=joblib.dump)
34
35
36 View Code Duplication
class SubjectFileSKOS(SubjectCorpus):
    """A subject corpus that uses SKOS files"""

    # properties that are consulted, in this order, when looking up the
    # preferred label of a concept
    PREF_LABEL_PROPERTIES = (SKOS.prefLabel, RDFS.label)

    # lazily computed set of label language codes; see the languages property
    _languages = None

    def __init__(self, path):
        """Load the vocabulary graph from the given path. A path ending in
        '.dump.gz' is loaded as a joblib dump; anything else is parsed with
        rdflib using a format guessed from the file name."""
        self.path = path
        if path.endswith('.dump.gz'):
            # NOTE(review): joblib dumps are pickle-based, so this must only
            # be used on dump files from a trusted source.
            self.graph = joblib.load(path)
        else:
            self.graph = rdflib.Graph()
            self.graph.parse(self.path,
                             format=rdflib.util.guess_format(self.path))

    @property
    def languages(self):
        """the set of language codes used in the preferred labels of the
        (non-deprecated) concepts; computed on first access and then
        cached on the instance"""
        if self._languages is None:
            self._languages = {label.language
                               for concept in self.concepts
                               for label_type in self.PREF_LABEL_PROPERTIES
                               for label in self.graph.objects(concept,
                                                               label_type)
                               if label.language is not None}
        return self._languages

    def _concept_labels(self, concept):
        """return a dict with one label for the given concept in each of the
        languages of the vocabulary. For each language, a label in that
        language is preferred, then a label without a language tag, and
        finally the qualified name of the concept is used as a fallback."""
        # by_lang is a defaultdict(list), so looking up a language with no
        # labels gives an empty (falsy) list instead of raising KeyError
        by_lang = self.get_concept_labels(concept,
                                          self.PREF_LABEL_PROPERTIES)
        return {lang: by_lang[lang][0] if by_lang[lang]  # correct lang
                else by_lang[None][0] if by_lang[None]  # no language
                else self.graph.namespace_manager.qname(concept)
                for lang in self.languages}

    @property
    def subjects(self):
        """generate a Subject for each non-deprecated concept, with its
        URI, its labels by language, and its notation (None if the concept
        has no skos:notation)"""
        for concept in self.concepts:
            labels = self._concept_labels(concept)

            notation = self.graph.value(concept, SKOS.notation, None, any=True)
            if notation is not None:
                notation = str(notation)

            yield Subject(uri=str(concept), labels=labels, notation=notation)

    @property
    def concepts(self):
        """generate the skos:Concept resources of the graph, skipping those
        marked as owl:deprecated true"""
        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                continue
            yield concept

    def get_concept_labels(self, concept, label_types):
        """return all the labels of the given concept with the given label
        properties as a dict-like object where the keys are language codes
        and the values are lists of labels in that language"""
        labels_by_lang = collections.defaultdict(list)

        for label_type in label_types:
            for label in self.graph.objects(concept, label_type):
                # labels without a language tag end up under the key None
                labels_by_lang[label.language].append(str(label))

        return labels_by_lang

    @staticmethod
    def is_rdf_file(path):
        """return True if the path looks like an RDF file that can be loaded
        as SKOS"""

        fmt = rdflib.util.guess_format(path)
        return fmt is not None

    @staticmethod
    def _dump_path(path):
        """return the path of the joblib dump that accompanies the Turtle
        file at the given path. A trailing '.ttl' extension is swapped for
        '.dump.gz'; any other path gets '.dump.gz' appended, so the dump
        can never end up at the same location as the Turtle file itself."""

        base, ext = os.path.splitext(path)
        if ext == '.ttl':
            return base + '.dump.gz'
        return path + '.dump.gz'

    def save_skos(self, path):
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name. The graph is also dumped with joblib
        into a sibling file with the extension '.dump.gz'."""

        if self.path.endswith('.ttl'):
            # input is already in Turtle syntax, no need to reserialize
            # (and nothing to copy if source and target are the same file)
            if not os.path.exists(path) or \
               not os.path.samefile(self.path, path):
                shutil.copyfile(self.path, path)
        else:
            # need to serialize into Turtle
            self.graph.serialize(destination=path, format='turtle')
        # also dump the graph in joblib format which is faster to load
        annif.util.atomic_save(self.graph,
                               *os.path.split(self._dump_path(path)),
                               method=joblib.dump)
126