RDFProvider.__init__()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 1
Metric Value
cc 2
c 3
b 0
f 1
dl 0
loc 6
rs 9.4285
1
# -*- coding: utf-8 -*-
2
3
'''
4
This module contains an RDFProvider, an implementation of the 
5
:class:`skosprovider.providers.VocabularyProvider` interface that uses a 
6
:class:`rdflib.graph.Graph` as input.
7
'''
8
9
import logging
10
import rdflib
11
from rdflib.term import Literal, URIRef
12
from skosprovider_rdf.utils import text_, _df_writexml
13
14
# Module-level logger named after this module.
log = logging.getLogger(__name__)
15
# NOTE(review): configuring logging at import time is unusual for a library
# module; per the stdlib docs this call targets the root logger. Confirm it
# is intentional and not a debugging leftover.
logging.basicConfig(level=logging.DEBUG)
16
17
from skosprovider.providers import MemoryProvider
18
from skosprovider.uri import (
19
    DefaultConceptSchemeUrnGenerator
20
)
21
from skosprovider.skos import (
22
    Concept,
23
    Collection,
24
    ConceptScheme,
25
    Label,
26
    Note,
27
    Source
28
)
29
30
from rdflib.namespace import RDF, SKOS, DC, DCTERMS
31
# Namespace for the ISO 25964 SKOS extension (skos-thes) terms, used by the
# provider for the subordinateArray and superOrdinate predicates.
SKOS_THES = rdflib.Namespace('http://purl.org/iso25964/skos-thes#')
32
33
from language_tags import tags
34
35
from xml.dom.minidom import DocumentFragment
36
# Replace DocumentFragment.writexml with the project's _df_writexml helper.
# This rebinds the method on the class itself, so it applies to every
# DocumentFragment in the process, not only those created by this module.
DocumentFragment.writexml = _df_writexml
37
38
39
class RDFProvider(MemoryProvider):
    '''
    A simple vocabulary provider that uses an :class:`rdflib.graph.Graph`
    as input. The provider expects a RDF graph with elements that represent
    the SKOS concepts and collections.

    Please be aware that this provider needs to load the entire graph in memory.
    '''

    def __init__(self, metadata, graph, **kwargs):
        '''
        Build the provider from a graph.

        :param metadata: Provider metadata, passed on to the parent class.
            Its `id` entry is used to generate a concept scheme URI when the
            graph declares no concept scheme.
        :param graph: The graph to read the concepts and collections from.
        :param kwargs: Extra keyword arguments for the parent class. When
            `concept_scheme` is absent it is derived from the graph.
        :raises RuntimeError: If no `concept_scheme` was passed and the graph
            contains more than one concept scheme.
        '''
        self.graph = graph
        if 'concept_scheme' not in kwargs:
            kwargs['concept_scheme'] = self._cs_from_graph(metadata)
        # The parent is initialised with an empty list; the real content can
        # only be built afterwards because _from_graph reads
        # self.concept_scheme.
        super(RDFProvider, self).__init__(metadata, [], **kwargs)
        self.list = self._from_graph()

    def _cs_from_graph(self, metadata):
        '''
        Determine the concept scheme described by the graph.

        :param metadata: Provider metadata; its `id` entry feeds the default
            URI generator when the graph holds no concept scheme.
        :returns: A :class:`skosprovider.skos.ConceptScheme`.
        :raises RuntimeError: If the graph contains more than one concept
            scheme.
        '''
        cslist = [
            ConceptScheme(
                uri=self.to_text(sub),
                labels=self._create_from_subject_typelist(sub, Label.valid_types),
                notes=self._create_from_subject_typelist(sub, Note.valid_types),
                sources=self._create_sources(sub),
                languages=self._create_languages(sub)
            )
            for sub in self.graph.subjects(RDF.type, SKOS.ConceptScheme)
        ]
        if not cslist:
            # Nothing declared in the graph: fall back to a generated URI.
            return ConceptScheme(
                uri=DefaultConceptSchemeUrnGenerator().generate(
                    id=metadata.get('id')
                )
            )
        if len(cslist) > 1:
            raise RuntimeError(
                'This RDF file contains more than one ConceptScheme.'
            )
        return cslist[0]

    def _from_graph(self):
        '''
        Read all SKOS concepts and collections from the graph.

        :returns: A :class:`list` holding one
            :class:`skosprovider.skos.Concept` per `skos:Concept` subject
            followed by one :class:`skosprovider.skos.Collection` per
            `skos:Collection` subject, with `member_of` filled in.
        '''
        clist = []
        for sub in self.graph.subjects(RDF.type, SKOS.Concept):
            uri = self.to_text(sub)
            # One entry per match type, looked up as skos:<type>Match.
            matches = {}
            for k in Concept.matchtypes:
                matches[k] = self._create_from_subject_predicate(
                    sub, URIRef(SKOS + k + 'Match')
                )
            clist.append(Concept(
                id=self._get_id_for_subject(sub, uri),
                uri=uri,
                concept_scheme=self.concept_scheme,
                labels=self._create_from_subject_typelist(sub, Label.valid_types),
                notes=self._create_from_subject_typelist(sub, Note.valid_types),
                sources=self._create_sources(sub),
                broader=self._create_from_subject_predicate(sub, SKOS.broader),
                narrower=self._create_from_subject_predicate(sub, SKOS.narrower),
                related=self._create_from_subject_predicate(sub, SKOS.related),
                member_of=[],
                subordinate_arrays=self._create_from_subject_predicate(
                    sub, SKOS_THES.subordinateArray
                ),
                matches=matches
            ))

        for sub in self.graph.subjects(RDF.type, SKOS.Collection):
            uri = self.to_text(sub)
            clist.append(Collection(
                id=self._get_id_for_subject(sub, uri),
                uri=uri,
                concept_scheme=self.concept_scheme,
                labels=self._create_from_subject_typelist(sub, Label.valid_types),
                notes=self._create_from_subject_typelist(sub, Note.valid_types),
                sources=self._create_sources(sub),
                members=self._create_from_subject_predicate(sub, SKOS.member),
                member_of=[],
                superordinates=self._create_from_subject_predicate(
                    sub, SKOS_THES.superOrdinate
                )
            ))
        # member_of starts out empty on every item and is derived from the
        # members of the collections once everything has been read.
        self._fill_member_of(clist)
        return clist

    def _fill_member_of(self, clist):
        '''
        Fill the `member_of` attribute of every item, in place.

        For each collection in `clist`, the collection's id is appended to
        the `member_of` list of every item whose id is in its `members`.

        :param clist: The concepts and collections to update.
        '''
        # Collections are visited in clist order so that member_of is filled
        # deterministically.
        collections = [c for c in clist if isinstance(c, Collection)]
        for col in collections:
            for c in clist:
                if c.id in col.members:
                    c.member_of.append(col.id)

    def _create_from_subject_typelist(self, subject, typelist):
        '''
        Collect the objects of several SKOS predicates for one subject.

        :param subject: Subject to read from.
        :param typelist: Names of SKOS terms, each resolved with `SKOS.term`.
        :returns: A :class:`list` with the results of
            :meth:`_create_from_subject_predicate` for every term, in
            `typelist` order.
        '''
        items = []
        for p in typelist:
            items.extend(
                self._create_from_subject_predicate(subject, SKOS.term(p))
            )
        return items

    def _get_id_for_subject(self, subject, uri):
        '''
        Determine the id for a subject.

        A `dcterms:identifier` is preferred over a `dc:identifier`; when the
        subject has neither, `uri` is returned unchanged.

        :param subject: Subject to look up the identifier for.
        :param uri: Fallback value.
        :returns: The identifier as text, or `uri`.
        '''
        for predicate in (DCTERMS.identifier, DC.identifier):
            if (subject, predicate, None) in self.graph:
                return self.to_text(
                    self.graph.value(
                        subject=subject, predicate=predicate, any=False
                    )
                )
        return uri

    def _create_from_subject_predicate(self, subject, predicate):
        '''
        Convert every object of (subject, predicate) in the graph.

        The part of the predicate after the last `#` decides the conversion:
        a valid label type gives a label, a valid note type gives a note, and
        anything else is treated as a reference and resolved to an id.

        :param subject: Subject to read from.
        :param predicate: Predicate to follow.
        :returns: A :class:`list` of converted objects.
        '''
        local_name = predicate.split('#')[-1]
        items = []
        for obj in self.graph.objects(subject, predicate):
            if Label.is_valid_type(local_name):
                item = self._create_label(obj, local_name)
            elif Note.is_valid_type(local_name):
                item = self._create_note(obj, local_name)
            else:
                item = self._get_id_for_subject(obj, self.to_text(obj))
            items.append(item)
        return items

    def _create_label(self, literal, type):
        '''
        Create a label from a literal.

        :param literal: Literal holding the label text.
        :param type: Type of the label.
        :returns: A :class:`skosprovider.skos.Label`.
        :raises ValueError: If `type` is not a valid label type.
        '''
        if not Label.is_valid_type(type):
            raise ValueError(
                'Type of Label is not valid.'
            )
        return Label(
            self.to_text(literal),
            type,
            self._get_language_from_literal(literal)
        )

    def _read_markupped_literal(self, literal):
        '''
        Split a literal into its content, language and markup.

        :param literal: Literal without a datatype, or with datatype
            `rdf:HTML`.
        :returns: A tuple `(content, language, markup)`. For a literal
            without datatype, `content` is the literal itself and `markup`
            is `None`. For `rdf:HTML`, `content` is the serialised fragment
            and `markup` is `'HTML'`.
        :raises ValueError: If the literal has any other datatype.
        '''
        if literal.datatype is None:
            return (literal, self._get_language_from_literal(literal), None)
        if literal.datatype != RDF.HTML:
            raise ValueError(
                'Unable to process literal of type %s.' % literal.datatype
            )
        # Work on a copy: the xml:lang attribute is removed from the fragment
        # below and the literal's own value should stay untouched.
        df = literal.value.cloneNode(True)
        first = df.firstChild
        if first and first.attributes and 'xml:lang' in first.attributes.keys():
            lang = self._scrub_language(first.attributes.get('xml:lang').value)
            del first.attributes['xml:lang']
        else:
            lang = 'und'
        return (df.toxml(), lang, 'HTML')

    def _create_note(self, literal, type):
        '''
        Create a note from a literal.

        :param literal: Literal holding the note.
        :param type: Type of the note.
        :returns: A :class:`skosprovider.skos.Note`.
        :raises ValueError: If `type` is not a valid note type, or if the
            literal has a datatype that cannot be processed.
        '''
        if not Note.is_valid_type(type):
            raise ValueError(
                'Type of Note is not valid.'
            )
        content, language, markup = self._read_markupped_literal(literal)
        return Note(self.to_text(content), type, language, markup)

    def _create_sources(self, subject):
        '''
        Create the sources for this subject.

        Every `dcterms:bibliographicCitation` of every `dcterms:source` of
        the subject becomes one source.

        :param subject: Subject to get the sources for.
        :returns: A :class:`list` of :class:`skosprovider.skos.Source` objects.
        '''
        ret = []
        for source in self.graph.objects(subject, DCTERMS.source):
            for citation in self.graph.objects(
                source, DCTERMS.bibliographicCitation
            ):
                ret.append(
                    Source(
                        self.to_text(citation),
                        'HTML' if citation.datatype == RDF.HTML else None
                    )
                )
        return ret

    def _create_languages(self, subject):
        '''
        Create the languages for this subject.

        Both `dcterms:language` and `dc:language` are read; every value is
        passed through :meth:`_scrub_language`.

        :param subject: Subject to get the languages for.
        :returns: A :class:`set` of language tags as text.
        '''
        ret = set()
        for predicate in (DCTERMS.language, DC.language):
            for language in self.graph.objects(subject, predicate):
                ret.add(self.to_text(self._scrub_language(language)))
        return ret

    def _scrub_language(self, language):
        '''
        Validate a language tag.

        :param language: Language tag to check with `tags.check`.
        :returns: `language` when the check passes, otherwise `'und'` after
            logging a warning.
        '''
        if tags.check(language):
            return language
        log.warning(
            'Encountered an invalid language %s. Falling back to "und".',
            language
        )
        return 'und'

    def _get_language_from_literal(self, data):
        '''
        Get the scrubbed language of a literal.

        :param data: Value to inspect.
        :returns: `None` when `data` is not a
            :class:`rdflib.term.Literal` or has no language, otherwise the
            language as text after :meth:`_scrub_language`.
        '''
        if not isinstance(data, Literal) or data.language is None:
            return None
        return self.to_text(self._scrub_language(data.language))

    def to_text(self, data):
        '''
        Convert data of binary type or literal type to text.

        :param data: Value to convert; it is encoded as UTF-8 and handed to
            `text_` together with the `'utf-8'` encoding name.
        :returns: Text representation of the data.
        '''
        return text_(data.encode('utf-8'), 'utf-8')
242