Passed
Pull Request — master (#600)
by Osma
02:56
created

annif.corpus.skos   A

Complexity

Total Complexity 22

Size/Duplication

Total Lines 125
Duplicated Lines 88 %

Importance

Changes 0
Metric Value
eloc 84
dl 110
loc 125
rs 10
c 0
b 0
f 0
wmc 22

1 Function

Rating   Name   Duplication   Size   Complexity  
A serialize_subjects_to_skos() 19 19 2

7 Methods

Rating   Name   Duplication   Size   Complexity  
A SubjectFileSKOS.save_skos() 17 17 4
A SubjectFileSKOS.subjects() 14 14 4
A SubjectFileSKOS.is_rdf_file() 7 7 1
A SubjectFileSKOS.concepts() 6 6 3
A SubjectFileSKOS.__init__() 8 8 2
A SubjectFileSKOS.get_concept_labels() 17 17 1
A SubjectFileSKOS.languages() 11 11 5

How to fix   Duplicated Code   

Duplicated Code

Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

1
"""Support for subjects loaded from a SKOS/RDF file"""
2
3
import os.path
4
import shutil
5
import joblib
6
import rdflib
7
import rdflib.util
8
from rdflib.namespace import SKOS, RDF, OWL, RDFS
9
import annif.util
10
from .types import Subject, SubjectCorpus
11
12
13 View Code Duplication
def serialize_subjects_to_skos(subjects, language, path):
    """Create a SKOS representation of the given subjects and serialize it
    into a SKOS/Turtle file with the given path name.

    Every subject becomes a skos:Concept identified by its ``uri``, with a
    skos:prefLabel built from its ``label`` and tagged with ``language``.
    A skos:notation is added only for subjects whose ``notation`` is set.

    The graph is also dumped in joblib format next to the Turtle file: a
    trailing '.ttl' in ``path`` is replaced by '.dump.gz'; if ``path`` has
    no such suffix, '.dump.gz' is appended instead.
    """

    graph = rdflib.Graph()
    graph.namespace_manager.bind('skos', SKOS)
    for subject in subjects:
        concept = rdflib.URIRef(subject.uri)
        graph.add((concept, RDF.type, SKOS.Concept))
        graph.add((concept,
                   SKOS.prefLabel,
                   rdflib.Literal(subject.label, language)))
        # A subject without a notation must not get a skos:notation triple;
        # wrapping None in a Literal would store a bogus value instead.
        if subject.notation is not None:
            graph.add((concept,
                       SKOS.notation,
                       rdflib.Literal(subject.notation)))
    graph.serialize(destination=path, format='turtle')
    # also dump the graph in joblib format which is faster to load
    # Only a trailing '.ttl' is swapped for '.dump.gz': a blanket
    # str.replace would also rewrite directory names containing '.ttl' and,
    # for paths without '.ttl', would make the dump overwrite the Turtle
    # file that was just written.
    base = path[:-len('.ttl')] if path.endswith('.ttl') else path
    annif.util.atomic_save(graph,
                           *os.path.split(base + '.dump.gz'),
                           method=joblib.dump)
32
33
34 View Code Duplication
class SubjectFileSKOS(SubjectCorpus):
    """A subject corpus that uses SKOS files"""

    # Properties consulted when looking for the preferred label of a concept
    PREF_LABEL_PROPERTIES = (SKOS.prefLabel, RDFS.label)

    def __init__(self, path):
        """Load the RDF graph from the given path.

        Paths ending in '.dump.gz' are loaded with joblib; anything else is
        parsed by rdflib using the format guessed from the file name.
        """
        self.path = path
        if path.endswith('.dump.gz'):
            # NOTE: joblib.load unpickles the file, so only dumps from a
            # trusted source should be loaded this way.
            self.graph = joblib.load(path)
        else:
            self.graph = rdflib.Graph()
            self.graph.parse(self.path,
                             format=rdflib.util.guess_format(self.path))

    @property
    def languages(self):
        """Return the set of language tags found on the preferred labels
        of the (non-deprecated) concepts."""
        langs = set()

        for concept in self.concepts:
            for label_type in self.PREF_LABEL_PROPERTIES:
                for label in self.graph.objects(concept, label_type):
                    if label.language is not None:
                        langs.add(label.language)

        return langs

    def subjects(self, language):
        """Yield a Subject for every (non-deprecated) concept, labelled in
        the given language where possible."""
        for concept in self.concepts:
            labels = self.get_concept_labels(
                concept, self.PREF_LABEL_PROPERTIES, language)
            # Use first label if available, else use qualified name (from URI)
            label = (labels[0] if labels
                     else self.graph.namespace_manager.qname(concept))

            notation = self.graph.value(concept, SKOS.notation, None, any=True)
            if notation is not None:
                notation = str(notation)

            yield Subject(uri=str(concept), label=label, notation=notation,
                          text=None)

    @property
    def concepts(self):
        """Yield every skos:Concept in the graph, skipping those marked
        with owl:deprecated true."""
        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                continue
            yield concept

    def get_concept_labels(self, concept, label_types, language):
        """Return the labels of a concept as a list of strings.

        Labels are collected from the given label properties. Those tagged
        with the requested language come first, followed by labels that
        have no language tag; labels in other languages are left out.
        """
        all_labels = [label
                      for label_type in label_types
                      for label in self.graph.objects(concept, label_type)]

        # 1. Labels with the correct language tag
        same_lang_labels = [str(label)
                            for label in all_labels
                            if label.language == language]

        # 2. Labels without a language tag
        no_lang_labels = [str(label)
                          for label in all_labels
                          if label.language is None]

        # Return both kinds, but better ones (with the right language) first
        return same_lang_labels + no_lang_labels

    @staticmethod
    def is_rdf_file(path):
        """return True if the path looks like an RDF file that can be loaded
        as SKOS"""

        fmt = rdflib.util.guess_format(path)
        return fmt is not None

    def save_skos(self, path, language):
        """Save the contents of the subject vocabulary into a SKOS/Turtle
        file with the given path name.

        The graph is also dumped in joblib format next to the Turtle file:
        a trailing '.ttl' in ``path`` is replaced by '.dump.gz'; if ``path``
        has no such suffix, '.dump.gz' is appended instead. The ``language``
        argument is not used by this implementation.
        """

        if self.path.endswith('.ttl'):
            # input is already in Turtle syntax, no need to reserialize
            if not os.path.exists(path) or \
               not os.path.samefile(self.path, path):
                shutil.copyfile(self.path, path)
        else:
            # need to serialize into Turtle
            self.graph.serialize(destination=path, format='turtle')
        # also dump the graph in joblib format which is faster to load
        # Only a trailing '.ttl' is swapped for '.dump.gz': a blanket
        # str.replace would also rewrite directory names containing '.ttl'
        # and, for paths without '.ttl', would make the dump overwrite the
        # Turtle file that was just written.
        base = path[:-len('.ttl')] if path.endswith('.ttl') else path
        annif.util.atomic_save(self.graph,
                               *os.path.split(base + '.dump.gz'),
                               method=joblib.dump)
125