|
1
|
|
|
import argparse |
|
2
|
|
|
import csv |
|
3
|
|
|
import json |
|
4
|
|
|
import os |
|
5
|
|
|
import sys |
|
6
|
|
|
|
|
7
|
|
|
from rdflib import Graph |
|
8
|
|
|
from rdflib.util import SUFFIX_FORMAT_MAP |
|
9
|
|
|
from rdflib.util import guess_format |
|
10
|
|
|
from skosprovider.providers import DictionaryProvider |
|
11
|
|
|
from skosprovider.providers import SimpleCsvProvider |
|
12
|
|
|
from skosprovider.skos import ConceptScheme |
|
13
|
|
|
from skosprovider.uri import UriPatternGenerator |
|
14
|
|
|
from skosprovider_rdf.providers import RDFProvider |
|
15
|
|
|
from skosprovider_sqlalchemy.utils import import_provider |
|
16
|
|
|
from sqlalchemy import create_engine |
|
17
|
|
|
from sqlalchemy.engine import url |
|
18
|
|
|
from sqlalchemy.orm import sessionmaker |
|
19
|
|
|
|
|
20
|
|
|
from atramhasis.data.models import Provider |
|
21
|
|
|
from atramhasis.scripts.migrate_sqlalchemy_providers import json_serial |
|
22
|
|
|
|
|
23
|
|
|
|
|
24
|
|
|
def file_to_rdf_provider(**kwargs) -> RDFProvider:
    """
    Build an RDFProvider from the RDF file passed as ``input_file``.

    The provider id is the ``provider_id`` keyword when it is truthy,
    otherwise the upper-cased file name without its extension.
    """
    source_path = kwargs.get('input_file')
    stem, extension = os.path.splitext(os.path.basename(source_path))
    provider_id = kwargs.get("provider_id")
    if not provider_id:
        provider_id = stem.upper()
    rdf_graph = Graph()
    # The serialisation format is guessed from the file extension.
    rdf_graph.parse(source_path, format=guess_format(extension))
    metadata = {'id': provider_id}
    return RDFProvider(metadata, rdf_graph)
|
37
|
|
|
|
|
38
|
|
|
|
|
39
|
|
|
def _create_provider_kwargs(**kwargs): |
|
40
|
|
|
provider_kwargs = {} |
|
41
|
|
|
uri_pattern = kwargs.get('uri_pattern') |
|
42
|
|
|
if uri_pattern: |
|
43
|
|
|
provider_kwargs['uri_generator'] = UriPatternGenerator(uri_pattern) |
|
44
|
|
|
concept_scheme = kwargs.get('concept_scheme') |
|
45
|
|
|
if concept_scheme: |
|
46
|
|
|
provider_kwargs['concept_scheme'] = concept_scheme |
|
47
|
|
|
return provider_kwargs |
|
48
|
|
|
|
|
49
|
|
|
|
|
50
|
|
|
def file_to_csv_provider(**kwargs) -> SimpleCsvProvider:
    """
    Create CSV provider from the input file.

    Keyword arguments read here: ``input_file`` (path of the CSV file) and
    ``provider_id`` (optional provider id; when it is falsy the upper-cased
    file name without extension is used). All keyword arguments are also
    forwarded to ``_create_provider_kwargs`` to derive the optional
    ``uri_generator`` / ``concept_scheme`` arguments of the provider.
    """
    input_file = kwargs.get('input_file')
    input_name, _ = os.path.splitext(os.path.basename(input_file))
    meta_id = kwargs.get("provider_id") or input_name.upper()
    provider_kwargs = _create_provider_kwargs(**kwargs)
    # The csv module asks for files to be opened with newline='' so that it
    # can handle line endings (including those inside quoted fields) itself.
    with open(input_file, newline='') as ifile:
        reader = csv.reader(ifile)
        # Build the provider inside the ``with`` block so the file behind the
        # reader is still open while the reader is handed over.
        return SimpleCsvProvider(
            {'id': meta_id},
            reader,
            **provider_kwargs
        )
|
65
|
|
|
|
|
66
|
|
|
|
|
67
|
|
|
def file_to_json_provider(**kwargs) -> DictionaryProvider:
    """
    Build a DictionaryProvider from the JSON file passed as ``input_file``.

    The provider id is the ``provider_id`` keyword when it is truthy,
    otherwise the upper-cased file name without its extension.
    """
    source_path = kwargs.get('input_file')
    stem = os.path.splitext(os.path.basename(source_path))[0]
    provider_id = kwargs.get("provider_id") or stem.upper()
    extras = _create_provider_kwargs(**kwargs)
    with open(source_path) as handle:
        concepts = json.load(handle)
    metadata = {'id': provider_id}
    return DictionaryProvider(metadata, concepts, **extras)
|
82
|
|
|
|
|
83
|
|
|
|
|
84
|
|
|
# Registry of importable file types: the extensions each type claims and the
# factory function that turns such a file into a provider.
supported_types = {
    'JSON': {
        'extensions': ['.json'],
        'file_to_provider': file_to_json_provider,
    },
    'RDF': {
        'extensions': [f'.{suffix}' for suffix in SUFFIX_FORMAT_MAP],
        'file_to_provider': file_to_rdf_provider,
    },
    'CSV': {
        'extensions': ['.csv'],
        'file_to_provider': file_to_csv_provider,
    },
}

# Flat list of every accepted extension, in registry order.
supported_ext = [
    extension
    for type_info in supported_types.values()
    for extension in type_info['extensions']
]
|
101
|
|
|
|
|
102
|
|
|
|
|
103
|
|
|
def parse_argv_for_import(argv):
    """
    Parse and validate the command line arguments of the import script.

    :param argv: Argument list in ``sys.argv`` layout: the program name
        followed by the actual arguments.
    :return: The parsed ``argparse.Namespace``.
    :raises SystemExit: Raised by argparse on a usage error or ``-h``, and
        with exit code 1 when the input file or the connection string fails
        validation.
    """
    cmd = os.path.basename(argv[0])
    parser = argparse.ArgumentParser(
        description='Import file to a database',
        epilog=(
            f'example: {cmd} '
            'atramhasis/scripts/my_file '
            'urn:x-skosprovider:trees:%s '
            '--to sqlite:///atramhasis.sqlite '
            '--conceptscheme-label Labels '
            '--conceptscheme-uri urn:x-skosprovider:trees '
            '--create-provider '
            '--provider-id ERFGOEDTYPES '
            '--id-generation-strategy numeric'
        )
    )
    parser.add_argument(
        'input_file',
        type=str,
        help='local path to the input file',
    )
    parser.add_argument(
        'uri_pattern',
        type=str,
        help='URI pattern input for the URIGenerator',
    )
    parser.add_argument(
        '--to',
        dest='to',
        metavar='conn_string',
        type=str,
        help='Connection string of the output database',
        required=False,
        default='sqlite:///atramhasis.sqlite'
    )
    parser.add_argument(
        '--conceptscheme-label',
        dest='cs_label',
        type=str,
        help='Label of the conceptscheme',
        required=False,
        default=None
    )
    parser.add_argument(
        '--conceptscheme-uri',
        dest='cs_uri',
        type=str,
        help='URI of the conceptscheme',
        required=False,
        default=None
    )
    parser.add_argument(
        '--create-provider',
        dest='create_provider',
        default=True,
        action=argparse.BooleanOptionalAction,
        help='An optional parameter if given a provider is created. '
             'Use --no-create-provider to not create a provider',
    )
    parser.add_argument(
        '--provider-id',
        dest='provider_id',
        type=str,
        help='An optional string (eg. ERFGOEDTYPES) to be assigned to the provider id. '
             'If not specified, assign the conceptscheme id to the provider id',
        required=False,
        default=None
    )
    parser.add_argument(
        '--id-generation-strategy',
        dest='id_generation_strategy',
        type=str,
        # The previous help text was a copy of the uri_pattern help.
        help='Id generation strategy to store in the metadata of the created provider',
        required=False,
        choices=["numeric", "guid", "manual"],
        default="numeric"
    )
    # Parse the list that was handed in rather than silently falling back to
    # sys.argv, so callers passing their own argv get what they asked for.
    args = parser.parse_args(argv[1:])
    if not validate_file(args.input_file) or not validate_connection_string(args.to):
        sys.exit(1)
    return args
|
187
|
|
|
|
|
188
|
|
|
|
|
189
|
|
|
def validate_file(input_file):
    """
    Check that the input file exists and has a supported extension.

    Prints an explanatory message and returns False when the path does not
    exist or its extension is not listed in ``supported_ext``; returns True
    otherwise.
    """
    if not os.path.exists(input_file):
        print(f'The input file {input_file} does not exists')
        return False
    extension = os.path.splitext(input_file)[1]
    if extension in supported_ext:
        return True
    print(f'the input file {input_file} is not supported. Allowed extensions are: {supported_ext}')
    return False
|
198
|
|
|
|
|
199
|
|
|
|
|
200
|
|
|
def validate_connection_string(connection_string):
    """
    Validate the structure of a database connection string.

    A ``postgresql`` URL needs username, password, host, port and database;
    a ``sqlite`` URL needs a database. Anything else is rejected with a
    printed explanation.

    :param connection_string: connection string to check
    :return: True when the connection string is accepted, False otherwise
    """
    parsed = url.make_url(connection_string)
    driver = parsed.drivername
    if driver == 'postgresql':
        required_parts = (
            parsed.username,
            parsed.password,
            parsed.host,
            parsed.port,
            parsed.database,
        )
        if all(required_parts):
            return True
    elif driver == 'sqlite':
        if parsed.database:
            return True
    elif driver:
        print('The database driver ' + driver + ' is not supported.')
    # Reached for unsupported drivers as well as for incomplete URLs.
    print('Wrong structure of connection string "' + connection_string + '"')
    print('Structure: postgresql://username:password@host:port/db_name OR sqlite:///path/db_name.sqlite')
    return False
|
218
|
|
|
|
|
219
|
|
|
|
|
220
|
|
|
def conn_str_to_session(conn_str):
    """
    Open a session bound to an engine created from the connection string.

    The engine is created with ``echo=True``.
    """
    engine = create_engine(conn_str, echo=True)
    session_factory = sessionmaker(bind=engine)
    return session_factory()
|
229
|
|
|
|
|
230
|
|
|
|
|
231
|
|
|
def create_conceptscheme(conceptscheme_uri: str, conceptscheme_label: str) -> ConceptScheme:
    """
    Build a ConceptScheme with the given URI and a single label entry.
    """
    labels = [{'label': conceptscheme_label}]
    return ConceptScheme(uri=conceptscheme_uri, labels=labels)
|
239
|
|
|
|
|
240
|
|
|
|
|
241
|
|
|
def main(argv=sys.argv):
    """
    Import a conceptscheme file into a database and report the result.

    Documentation: import -h
    Run: import
      <path_input_file>
      <uri_pattern>
      --to <conn_string>
      --conceptscheme-uri <cs_uri>
      --conceptscheme-label <cs_label>
      --create-provider
      --provider-id <provider_id>
      --id-generation-strategy <numeric/guid/manual>

    example path_input_file:
     atramhasis/scripts/my_file

    structure conn_string:
     postgresql://username:password@host:port/db_name
     sqlite:///path/db_name.sqlite
    default conn_string:
     sqlite:///atramhasis.sqlite

    example conceptscheme_label
     My Conceptscheme
    default conceptscheme_label is the name of the file if a URI is specified.
    If no URI is specified, a conceptscheme will be imported from the input
    file. This only works for RDF files. For other file types (JSON and CSV)
    conceptscheme_uri is mandatory and conceptscheme_label is recommended.
    """

    # Import the data
    args = parse_argv_for_import(argv)
    input_name, input_ext = os.path.splitext(os.path.basename(args.input_file))
    session = conn_str_to_session(args.to)
    # Pick the factory of the first supported type that lists this extension.
    file_to_provider_function = [
        supported_types[filetype]['file_to_provider']
        for filetype in supported_types.keys()
        if input_ext in supported_types[filetype]['extensions']
    ][0]
    if args.cs_uri:
        cs_uri = args.cs_uri
        # Fall back to the capitalized file name when no label was given.
        cs_label = args.cs_label if args.cs_label else input_name.capitalize()
        # Stored on args so it reaches the factory through vars(args) below.
        args.concept_scheme = create_conceptscheme(cs_uri, cs_label)
    provider = file_to_provider_function(**vars(args))
    cs = import_provider(provider, session)
    if args.create_provider:
        db_provider = Provider()
        provider.metadata[
            'atramhasis.id_generation_strategy'
        ] = args.id_generation_strategy.upper()
        # Round-trip through JSON (with json_serial as fallback serializer)
        # before storing the metadata on the database provider.
        db_provider.meta = json.loads(json.dumps(provider.metadata, default=json_serial))
        db_provider.expand_strategy = 'RECURSE'
        db_provider.conceptscheme = cs
        db_provider.id = args.provider_id or cs.id
        db_provider.uri_pattern = args.uri_pattern
        # The conceptscheme is linked through db_provider.conceptscheme above,
        # so the id is not kept in the stored metadata as well.
        if 'conceptscheme_id' in db_provider.meta:
            del db_provider.meta['conceptscheme_id']
        session.add(db_provider)
    session.commit()

    # Get info to return to the user
    scheme_id = cs.id
    if not args.create_provider:
        # No provider row was created: print a registry snippet the user can
        # paste into their own configuration.
        prov_id = getattr(args, 'provider_id', None) or input_name.upper()
        print(
            "\n\n*** The import of conceptscheme {0} from the {1} file to {2} was succesful. ***\
\n\nTo use the data in Atramhasis, you must edit the file my_thesaurus/skos/__init__.py.\
\nAdd a configuration similar to:\
\n\ndef create_registry(request):\
\n\t# create the SKOS registry\
\n\tregistry = Registry(instance_scope='threaded_thread')\
\n\t{3} = SQLAlchemyProvider(\
\n\t\t{{'id': '{4}', 'conceptscheme_id': {5}}},\
\n\t\trequest.db\
\n\t)\
\n\tregistry.register_provider({6})\
\n\treturn registry\
\n\n".
            format(
                prov_id, args.input_file, args.to,
                prov_id.replace(' ', '_'), prov_id, scheme_id, prov_id.replace(' ', '_')
            )
        )
    else:
        prov_id = args.provider_id or cs.id
        msg = """
***
The import of conceptscheme {0} from the {1} file to {2} was succesful.
You can now continue through the Atramhasis UI.
***
"""
        print(msg.format(prov_id, args.input_file, args.to))
|
333
|
|
|
|
|
334
|
|
|
|
|
335
|
|
|
|
|
336
|
|
|
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
338
|
|
|
|