Passed
Pull Request — master (#600)
by Osma
02:45
created

annif.corpus.skos.SubjectFileSKOS.save_skos()   A

Complexity

Conditions 4

Size

Total Lines 17
Code Lines 10

Duplication

Lines 17
Ratio 100 %

Importance

Changes 0
Metric Value
cc 4
eloc 10
nop 3
dl 17
loc 17
rs 9.9
c 0
b 0
f 0
1
"""Support for subjects loaded from a SKOS/RDF file"""
2
3
import os.path
4
import shutil
5
import joblib
6
import rdflib
7
import rdflib.util
8
from rdflib.namespace import SKOS, RDF, OWL, RDFS
9
import annif.util
10
from .types import Subject, SubjectCorpus
11
12
13 View Code Duplication
def serialize_subjects_to_skos(subjects, language, path):
    """Create a SKOS representation of the given subjects and serialize it
    into a SKOS/Turtle file with the given path name.

    Each subject becomes a skos:Concept with a skos:prefLabel tagged with
    the given language and, when the subject has a notation, a
    skos:notation. The same graph is also stored as a joblib dump next to
    the Turtle file: a trailing '.ttl' in path is swapped for '.dump.gz',
    otherwise '.dump.gz' is appended to path."""

    graph = rdflib.Graph()
    graph.namespace_manager.bind('skos', SKOS)
    for subject in subjects:
        concept = rdflib.URIRef(subject.uri)
        graph.add((concept, RDF.type, SKOS.Concept))
        graph.add((concept,
                   SKOS.prefLabel,
                   rdflib.Literal(subject.label, language)))
        # a subject without a notation must not get a literal built from
        # None as its skos:notation
        if subject.notation is not None:
            graph.add((concept,
                       SKOS.notation,
                       rdflib.Literal(subject.notation)))
    graph.serialize(destination=path, format='turtle')

    # also dump the graph in joblib format which is faster to load;
    # only a trailing '.ttl' is swapped so that directory names containing
    # '.ttl' stay intact and the dump can never be written over the Turtle
    # file that was just serialized
    if path.endswith('.ttl'):
        dump_path = path[:-len('.ttl')] + '.dump.gz'
    else:
        dump_path = path + '.dump.gz'
    annif.util.atomic_save(graph,
                           *os.path.split(dump_path),
                           method=joblib.dump)
32
33
34 View Code Duplication
class SubjectFileSKOS(SubjectCorpus):
    """A subject corpus that uses SKOS files"""

    # properties whose values are accepted as the preferred label of a
    # concept, in order of preference
    PREF_LABEL_PROPERTIES = (SKOS.prefLabel, RDFS.label)

    def __init__(self, path):
        """Load the graph from the given path: a joblib dump when the file
        name ends in '.dump.gz', otherwise an RDF file whose format is
        guessed from the file name."""

        self.path = path
        if path.endswith('.dump.gz'):
            # NOTE(review): joblib dumps are pickle based, so only dump
            # files from a trusted location should be loaded here
            self.graph = joblib.load(path)
        else:
            self.graph = rdflib.Graph()
            self.graph.parse(self.path,
                             format=rdflib.util.guess_format(self.path))

    @property
    def languages(self):
        """the set of language tags found on the preferred labels of the
        non-deprecated concepts"""

        return {label.language
                for concept in self.concepts
                for label_type in self.PREF_LABEL_PROPERTIES
                for label in self.graph.objects(concept, label_type)
                if label.language is not None}

    def subjects(self, language):
        """Generate a Subject for every non-deprecated concept, labelled in
        the given language when such a label is available."""

        for concept in self.concepts:
            labels = self.get_concept_labels(
                concept, self.PREF_LABEL_PROPERTIES, language)
            # Use first label if available, else use qualified name (from URI)
            label = (labels[0] if labels
                     else self.graph.namespace_manager.qname(concept))

            notation = self.graph.value(concept, SKOS.notation, None, any=True)
            if notation is not None:
                notation = str(notation)

            yield Subject(uri=str(concept), label=label, notation=notation,
                          text=None)

    @property
    def concepts(self):
        """generator of the skos:Concept resources in the graph, skipping
        those flagged with owl:deprecated true"""

        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                continue
            yield concept

    def get_concept_labels(self, concept, label_types, language):
        """Return the labels of the concept, taken from the given label
        properties, as a list of strings: labels in the given language
        first, followed by labels that carry no language tag."""

        all_labels = [label
                      for label_type in label_types
                      for label in self.graph.objects(concept, label_type)]

        # 1. Labels with the correct language tag
        same_lang_labels = [str(label)
                            for label in all_labels
                            if label.language == language]

        # 2. Labels without a language tag
        no_lang_labels = [str(label)
                          for label in all_labels
                          if label.language is None]

        # Return both kinds, but better ones (with the right language) first
        return same_lang_labels + no_lang_labels

    @staticmethod
    def is_rdf_file(path):
        """return True if the path looks like an RDF file that can be loaded
        as SKOS"""

        fmt = rdflib.util.guess_format(path)
        return fmt is not None

    @staticmethod
    def _dump_path(path):
        """return the path of the joblib dump that accompanies the Turtle
        file with the given path"""

        # swap only a trailing '.ttl'; replacing every occurrence would
        # rewrite directory names, and a path without '.ttl' would map to
        # itself so that the dump overwrites the Turtle file
        if path.endswith('.ttl'):
            return path[:-len('.ttl')] + '.dump.gz'
        return path + '.dump.gz'

    def save_skos(self, path, language):
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name.

        The graph is additionally stored as a joblib dump next to the
        Turtle file. The language parameter is not used by this
        implementation."""

        if self.path.endswith('.ttl'):
            # input is already in Turtle syntax, no need to reserialize
            if not os.path.exists(path) or \
               not os.path.samefile(self.path, path):
                shutil.copyfile(self.path, path)
        else:
            # need to serialize into Turtle
            self.graph.serialize(destination=path, format='turtle')
        # also dump the graph in joblib format which is faster to load
        annif.util.atomic_save(self.graph,
                               *os.path.split(self._dump_path(path)),
                               method=joblib.dump)
121