|
1
|
|
|
import logging |
|
2
|
|
|
import optparse |
|
3
|
|
|
import os |
|
4
|
|
|
import sys |
|
5
|
|
|
import textwrap |
|
6
|
|
|
import time |
|
7
|
|
|
|
|
8
|
|
|
import transaction |
|
9
|
|
|
from pyramid.paster import bootstrap, setup_logging |
|
10
|
|
|
from rdflib.namespace import RDF, SKOS |
|
11
|
|
|
from rdflib.term import URIRef |
|
12
|
|
|
from skosprovider_rdf import utils |
|
13
|
|
|
|
|
14
|
|
|
from atramhasis.data.datamanagers import CountsManager |
|
15
|
|
|
from atramhasis.data.models import ConceptschemeCounts |
|
16
|
|
|
from atramhasis.errors import SkosRegistryNotFoundException |
|
17
|
|
|
|
|
18
|
|
|
log = logging.getLogger(__name__) |
|
19
|
|
|
|
|
20
|
|
|
|
|
21
|
|
|
def main():
    """
    Dump every non-external conceptscheme to Turtle and RDF/XML files.

    Command line: ``prog [-l DUMP_LOCATION] [-r RDF2HDT] config_uri``.

    For each provider in the SKOS registry whose metadata ``subject`` does
    not contain ``'external'``, the full graph is serialised to
    ``<dump_location>/<id>-full.ttl`` and ``<dump_location>/<id>-full.rdf``.
    When an rdf2hdt command is configured (option or ini setting) the dumps
    are also converted to ``.hdt``. Finally the triple counts gathered while
    dumping are saved through ``CountsManager`` inside one transaction.

    :returns: ``2`` when no config_uri was given or the dump location is not
        writable, otherwise ``None``.
    :raises SkosRegistryNotFoundException: when the bootstrapped request has
        no ``skos_registry``.
    """
    description = """\
    Dump all conceptschemes to files. Will serialise as Turtle and RDF/XML format.
    """
    usage = "usage: %prog config_uri"
    parser = optparse.OptionParser(
        usage=usage,
        description=textwrap.dedent(description)
    )
    # Adjacent string literals instead of a backslash continuation, so the
    # indentation of the source can never end up inside the help text.
    parser.add_option(
        '-l', '--location', dest='dump_location', type='string',
        help='Specify where to dump the conceptschemes. If not specified, this '
             'is set to the atramhasis.dump_location from your ini file.'
    )
    parser.add_option(
        '-r', '--rdf2hdt', dest='rdf2hdt', type='string', default=False,
        help='Specify where the rdf2hdt command can be found. If not specified, this '
             'is set to atramhasis.rdf2hdt from your ini file.'
    )

    options, args = parser.parse_args(sys.argv[1:])

    if not args:
        log.error('You must provide at least one argument.')
        return 2

    config_uri = args[0]

    env = bootstrap(config_uri)
    try:
        setup_logging(config_uri)
        settings = env['registry'].settings

        # Command line option wins; fall back to the ini setting and finally
        # to the directory that holds the config file.
        dump_location = options.dump_location
        if dump_location is None:
            dump_location = settings.get(
                'atramhasis.dump_location',
                os.path.abspath(os.path.dirname(config_uri))
            )
        if not os.access(dump_location, os.W_OK | os.X_OK):
            log.error('Dump location "' + dump_location + '" is not writable.')
            return 2

        rdf2hdt = options.rdf2hdt or settings.get('atramhasis.rdf2hdt', False)

        request = env['request']
        skos_registry = getattr(request, 'skos_registry', None)
        if skos_registry is None:
            raise SkosRegistryNotFoundException()  # pragma: no cover

        counts = []
        basenames = []
        for provider in skos_registry.get_providers():
            # Providers flagged as external are not dumped.
            if 'external' in provider.get_metadata()['subject']:
                continue
            basename, provider_counts = _dump_provider(provider, dump_location)
            basenames.append(basename)
            counts.append(provider_counts)

        log.info('All files dumped to %s' % dump_location)

        if rdf2hdt:
            _convert_to_hdt(rdf2hdt, basenames, dump_location)

        with transaction.manager:
            dbsession = request.registry.dbmaker()
            manager = CountsManager(dbsession)
            for c in counts:
                manager.save(ConceptschemeCounts(
                    conceptscheme_id=c['conceptscheme_id'],
                    triples=c['triples'],
                    conceptscheme_triples=c['conceptscheme_triples'],
                    avg_concept_triples=c['avg_concept_triples']
                ))
    finally:
        # Always release the bootstrapped environment, also on the early
        # return above and when dumping, converting or saving raises.
        env['closer']()


def _dump_provider(provider, dump_location):
    """
    Serialise one provider to Turtle and RDF/XML and gather its statistics.

    :param provider: Provider handed out by the SKOS registry.
    :param dump_location: Directory the dump files are written to.
    :returns: Tuple ``(basename, counts)``: ``basename`` is the dump path
        without extension (``.ttl`` and ``.rdf`` are written next to it),
        ``counts`` is a dict with the keys ``conceptscheme_id``, ``triples``,
        ``conceptscheme_triples`` and ``avg_concept_triples``.
    """
    start_time = time.time()
    pid = provider.get_metadata()['id']
    basename = os.path.join(dump_location, '%s-full' % pid)
    filename_ttl = '%s.ttl' % basename
    filename_rdf = '%s.rdf' % basename

    log.info('Generating dump for %s' % pid)
    graph = utils.rdf_dumper(provider)

    triples = len(graph)
    log.info('Number of triples in Graph: %d' % triples)

    # Triples that have the conceptscheme itself as subject.
    csuri = URIRef(provider.concept_scheme.uri)
    cs_triples = sum(1 for _ in graph.predicate_objects(csuri))
    log.info('Number of triples in Conceptscheme: %d' % cs_triples)

    count_concepts = sum(1 for _ in graph.subjects(RDF.type, SKOS.Concept))
    count_collections = sum(1 for _ in graph.subjects(RDF.type, SKOS.Collection))
    node_count = count_concepts + count_collections
    # A scheme without concepts or collections has no meaningful average.
    if node_count:
        avg_concept_triples = (triples - cs_triples) / node_count
    else:
        avg_concept_triples = 0
    log.info('Average number of triples per concept: %d' % avg_concept_triples)

    log.info(f'Dumping {pid} to Turtle: {filename_ttl}')
    graph.serialize(destination=filename_ttl, format='turtle')
    log.info(f'Dumping {pid} to RDFxml: {filename_rdf}')
    graph.serialize(destination=filename_rdf, format='pretty-xml')
    log.info(f'--- {(time.time() - start_time)} seconds ---')

    return basename, {
        'conceptscheme_id': pid,
        'triples': triples,
        'conceptscheme_triples': cs_triples,
        'avg_concept_triples': avg_concept_triples
    }


def _convert_to_hdt(rdf2hdt, basenames, dump_location):
    """
    Convert dumped files to HDT with the external rdf2hdt command.

    The Turtle dump is tried first; when that fails the RDF/XML dump is used
    as a fallback. Failures are logged and never raised.

    :param rdf2hdt: Path or name of the rdf2hdt executable.
    :param basenames: Dump paths without extension.
    :param dump_location: Directory of the dumps, only used for logging.
    """
    from subprocess import check_call, CalledProcessError

    parsing_error = False
    for basename in basenames:
        # Build every name from the extension-less base: replacing '.ttl'
        # inside the full path would also rewrite a directory that happens
        # to contain '.ttl'.
        ttl_file = f'{basename}.ttl'
        rdf_file = f'{basename}.rdf'
        hdt_file = f'{basename}.hdt'
        log.info(f'Converting {ttl_file} to hdt')
        try:
            check_call([rdf2hdt, '-f', 'turtle', ttl_file, hdt_file])
        except (FileNotFoundError, CalledProcessError) as e:
            # Turtle failed, let's try rdfxml
            parsing_error = True
            log.warning(f'rdf2hdt for file {ttl_file} failed with error {e}. Trying rdfxml...')
            try:
                check_call([rdf2hdt, '-f', 'rdfxml', rdf_file, hdt_file])
            except (FileNotFoundError, CalledProcessError) as e:
                # rdfxml failed
                log.error(f'rdfxml for file {ttl_file} failed with error {e}')
    if parsing_error:
        log.error('Error during rdf2hdt conversion. Check logs for more information.')
    else:
        log.info(f'All hdt files dumped to {dump_location}')
|
155
|
|
|
|