Test Failed
Push — master ( 9ea4cd...0ffab9 )
by Koen
03:34 queued 13s
created

atramhasis.scripts.dump_rdf   A

Complexity

Total Complexity 17

Size/Duplication

Total Lines 155
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 17
eloc 120
dl 0
loc 155
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F main() 0 134 17
1
import logging
2
import optparse
3
import os
4
import sys
5
import textwrap
6
import time
7
8
import transaction
9
from pyramid.paster import bootstrap, setup_logging
10
from rdflib.namespace import RDF, SKOS
11
from rdflib.term import URIRef
12
from skosprovider_rdf import utils
13
14
from atramhasis.data.datamanagers import CountsManager
15
from atramhasis.data.models import ConceptschemeCounts
16
from atramhasis.errors import SkosRegistryNotFoundException
17
18
log = logging.getLogger(__name__)
19
20
21
def _parse_args(argv):
    """Parse the command line arguments of the dump script.

    :param argv: Argument list without the program name.
    :returns: The ``(options, args)`` tuple produced by optparse.
    """
    description = """\
    Dump all conceptschemes to files. Will serialise as Turtle and RDF/XML format.
    """
    usage = "usage: %prog config_uri"
    parser = optparse.OptionParser(
        usage=usage,
        description=textwrap.dedent(description)
    )
    parser.add_option(
        '-l', '--location', dest='dump_location', type='string',
        help='Specify where to dump the conceptschemes. If not specified, this \
        is set to the atramhasis.dump_location from your ini file.'
    )
    parser.add_option(
        '-r', '--rdf2hdt', dest='rdf2hdt', type='string', default=False,
        help='Specify where the rdf2hdt command can be found. If not specified, this \
        is set to atramhasis.rdf2hdt from your ini file.'
    )
    return parser.parse_args(argv)


def _dump_provider(provider, dump_location):
    """Serialise one provider to Turtle and RDF/XML and count its triples.

    :param provider: Provider taken from the SKOS registry.
    :param dump_location: Directory the dump files are written to.
    :returns: Tuple ``(filename_ttl, filename_rdf, counts)`` where ``counts``
        is a dict with the keys ``conceptscheme_id``, ``triples``,
        ``conceptscheme_triples`` and ``avg_concept_triples``.
    """
    start_time = time.time()
    pid = provider.get_metadata()['id']
    filename = os.path.join(dump_location, '%s-full' % pid)
    filename_ttl = '%s.ttl' % filename
    filename_rdf = '%s.rdf' % filename
    log.info('Generating dump for %s' % pid)
    graph = utils.rdf_dumper(provider)
    triples = len(graph)
    log.info('Number of triples in Graph: %d' % triples)
    csuri = URIRef(provider.concept_scheme.uri)
    cs_triples = len(list(graph.predicate_objects(csuri)))
    log.info('Number of triples in Conceptscheme: %d' % cs_triples)
    count_concepts = len(list(graph.subjects(RDF.type, SKOS.Concept)))
    count_collections = len(list(graph.subjects(RDF.type, SKOS.Collection)))
    try:
        avg_concept_triples = ((triples - cs_triples) /
                               (count_concepts + count_collections))
    except ZeroDivisionError:
        # A scheme without concepts or collections has no meaningful average.
        avg_concept_triples = 0
    log.info('Average number of triples per concept: %d' % avg_concept_triples)
    counts = {
        'conceptscheme_id': pid,
        'triples': triples,
        'conceptscheme_triples': cs_triples,
        'avg_concept_triples': avg_concept_triples
    }
    log.info(f'Dumping {pid} to Turtle: {filename_ttl}')
    graph.serialize(destination=filename_ttl, format='turtle')
    log.info(f'Dumping {pid} to RDFxml: {filename_rdf}')
    graph.serialize(destination=filename_rdf, format='pretty-xml')
    # Drop the graph before reporting so it can be reclaimed before the next
    # provider is dumped.
    del graph
    log.info(f'--- {(time.time() - start_time)} seconds ---')
    return filename_ttl, filename_rdf, counts


def _convert_to_hdt(rdf2hdt, files, dump_location):
    """Run the rdf2hdt command on every dumped file.

    The Turtle file is tried first; when that fails the RDF/XML file of the
    same provider is tried instead.

    :param rdf2hdt: Path of the rdf2hdt executable.
    :param files: List of ``(filename_ttl, filename_rdf)`` tuples.
    :param dump_location: Directory the files were dumped to (only logged).
    """
    from subprocess import check_call, CalledProcessError
    parsing_error = False
    for ttl_file, rdf_file in files:
        log.info(f'Converting {ttl_file} to hdt')
        # Only swap the extension; str.replace would also rewrite a '.ttl'
        # that happens to occur in a directory name or provider id.
        hdt_file = os.path.splitext(ttl_file)[0] + '.hdt'
        try:
            check_call([rdf2hdt, '-f', 'turtle', ttl_file, hdt_file])
        except (FileNotFoundError, CalledProcessError) as e:
            # Turtle failed, let's try rdfxml.
            # NOTE(review): the error flag stays set even when the rdfxml
            # fallback succeeds - confirm that this is intended.
            parsing_error = True
            log.warning(f'rdf2hdt for file {ttl_file} failed with error {e}. Trying rdfxml...')
            try:
                check_call([rdf2hdt, '-f', 'rdfxml', rdf_file, hdt_file])
            except (FileNotFoundError, CalledProcessError) as e:
                # rdfxml failed
                log.error(f'rdfxml for file {ttl_file} failed with error {e}')
    if parsing_error:
        log.error('Error during rdf2hdt conversion. Check logs for more information.')
    else:
        log.info(f'All hdt files dumped to {dump_location}')


def _save_counts(request, counts):
    """Store the gathered triple counts inside a single transaction.

    :param request: Request from the bootstrapped environment; its registry
        provides the ``dbmaker`` session factory.
    :param counts: List of count dicts as returned by ``_dump_provider``.
    """
    with transaction.manager:
        dbsession = request.registry.dbmaker()
        manager = CountsManager(dbsession)
        for c in counts:
            cs_count = ConceptschemeCounts(
                conceptscheme_id=c['conceptscheme_id'],
                triples=c['triples'],
                conceptscheme_triples=c['conceptscheme_triples'],
                avg_concept_triples=c['avg_concept_triples']
            )
            manager.save(cs_count)


def _dump_all(env, config_uri, options):
    """Dump every non-external provider using a bootstrapped environment.

    :param env: Environment dict returned by ``bootstrap``.
    :param config_uri: The config uri given on the command line.
    :param options: Parsed optparse options.
    :returns: ``2`` when the dump location is not writable, else ``None``.
    :raises SkosRegistryNotFoundException: When the request has no SKOS
        registry.
    """
    setup_logging(config_uri)

    dump_location = options.dump_location
    if dump_location is None:
        dump_location = env['registry'].settings.get(
            'atramhasis.dump_location',
            os.path.abspath(os.path.dirname(config_uri))
        )
    if not os.access(dump_location, os.W_OK | os.X_OK):
        log.error('Dump location "' + dump_location + '" is not writable.')
        return 2

    rdf2hdt = options.rdf2hdt
    if not rdf2hdt:
        rdf2hdt = env['registry'].settings.get(
            'atramhasis.rdf2hdt',
            False
        )

    request = env['request']

    if hasattr(request, 'skos_registry') and request.skos_registry is not None:
        skos_registry = request.skos_registry
    else:
        raise SkosRegistryNotFoundException()  # pragma: no cover

    counts = []
    files = []

    for p in skos_registry.get_providers():
        # Providers tagged as external are not dumped.
        if 'external' in p.get_metadata()['subject']:
            continue
        filename_ttl, filename_rdf, provider_counts = _dump_provider(
            p, dump_location
        )
        files.append((filename_ttl, filename_rdf))
        counts.append(provider_counts)

    log.info('All files dumped to %s' % dump_location)

    if rdf2hdt:
        _convert_to_hdt(rdf2hdt, files, dump_location)

    _save_counts(request, counts)
    return None


def main():
    """Dump all conceptschemes to Turtle and RDF/XML files.

    Reads the config uri and options from ``sys.argv``. When an rdf2hdt
    command is configured the dumps are also converted to HDT. The triple
    counts of every dumped conceptscheme are saved to the database.

    :returns: ``2`` when no config uri was given or the dump location is not
        writable, otherwise ``None``.
    """
    options, args = _parse_args(sys.argv[1:])

    if not args:
        log.error('You must provide at least one argument.')
        return 2

    config_uri = args[0]

    env = bootstrap(config_uri)
    try:
        return _dump_all(env, config_uri, options)
    finally:
        # Always release the bootstrapped environment, also on the early
        # exits and when an exception aborts the dump.
        env['closer']()
155