|
1
|
|
|
#!/usr/bin/env python3 |
|
2
|
|
|
# -*- coding: utf-8 -*- |
|
3
|
|
|
""" |
|
4
|
|
|
Created on Tue Feb 19 16:15:35 2019 |
|
5
|
|
|
|
|
6
|
|
|
@author: Paolo Cozzi <[email protected]> |
|
7
|
|
|
""" |
|
8
|
|
|
|
|
9
|
|
|
import re |
|
10
|
|
|
import json |
|
11
|
|
|
import logging |
|
12
|
|
|
import requests |
|
13
|
|
|
|
|
14
|
|
|
from django.db.models import Q |
|
15
|
|
|
from django.core.exceptions import ObjectDoesNotExist |
|
16
|
|
|
from django.utils.text import Truncator |
|
17
|
|
|
|
|
18
|
|
|
from image_validation import validation, ValidationResult |
|
19
|
|
|
from image_validation.static_parameters import ruleset_filename as \ |
|
20
|
|
|
IMAGE_RULESET |
|
21
|
|
|
|
|
22
|
|
|
from common.constants import BIOSAMPLE_URL |
|
23
|
|
|
from image_app.models import Name |
|
24
|
|
|
from biosample.helpers import parse_image_alias, get_model_object |
|
25
|
|
|
|
|
26
|
|
|
# Get an instance of a logger |
|
27
|
|
|
logger = logging.getLogger(__name__) |
|
28
|
|
|
|
|
29
|
|
|
|
|
30
|
|
|
# a class to deal with temporary issues from EBI servers |
|
31
|
|
|
class OntologyCacheError(Exception):
    """Signal a temporary issue with the EBI servers.

    Also covers issues with
    ``image_validation.use_ontology.OntologyCache`` objects.
    """
|
34
|
|
|
|
|
35
|
|
|
|
|
36
|
|
|
# a class to deal with errors in ruleset (that are not user errors but |
|
37
|
|
|
# errors within InjectTool and image_validation library) |
|
38
|
|
|
class RulesetError(Exception):
    """Signal an error found in the ruleset itself."""
|
40
|
|
|
|
|
41
|
|
|
|
|
42
|
|
|
class MetaDataValidation():
    """A class to deal with IMAGE-ValidationTool ruleset objects.

    The ruleset is read and checked when the object is instantiated; the
    other methods delegate to the ``image_validation.validation`` module.

    Attributes:
        ruleset: the object returned by ``validation.read_in_ruleset()``
            (``None`` until a ruleset has been read)
    """

    ruleset = None

    def __init__(self, ruleset_filename=IMAGE_RULESET):
        """Read the ruleset and check its rules.

        Args:
            ruleset_filename (str): the ruleset file to read (defaults to
                the one shipped with image_validation)

        Raises:
            OntologyCacheError: if reading the ruleset fails with a JSON
                decoding error
            RulesetError: if ``check_ruleset()`` reports any error
        """
        self.read_in_ruleset(ruleset_filename)

        # check validation rules
        ruleset_errors = self.check_ruleset()

        if ruleset_errors:
            raise RulesetError(
                "Error with ruleset: %s" % "; ".join(ruleset_errors))

    def read_in_ruleset(self, ruleset_filename):
        """Read a ruleset file and store it in ``self.ruleset``.

        Args:
            ruleset_filename (str): the ruleset file to read

        Raises:
            OntologyCacheError: if ``validation.read_in_ruleset()`` raises
                a ``json.JSONDecodeError``; the original exception is kept
                as the cause
        """
        try:
            self.ruleset = validation.read_in_ruleset(ruleset_filename)

        except json.JSONDecodeError as message:
            logger.error(
                "Error with 'https://www.ebi.ac.uk/ols/api/': %s", message)

            # chain the decoding error so that the real cause is not lost
            raise OntologyCacheError(
                "Issue with 'https://www.ebi.ac.uk/ols/api/'") from message

    def check_usi_structure(self, record):
        """Check data against USI rules.

        Args:
            record (list): the data to check; the underlying function
                needs its input as a list

        Returns:
            the result of ``validation.check_usi_structure()``
        """
        return validation.check_usi_structure(record)

    def check_ruleset(self):
        """Check the ruleset stored in ``self.ruleset``.

        Returns:
            the result of ``validation.check_ruleset()``; ``__init__``
            treats it as a collection of error strings
        """
        return validation.check_ruleset(self.ruleset)

    def check_duplicates(self, record):
        """Check duplicates in data.

        Args:
            record: the data to check, passed as is to image_validation

        Returns:
            the result of ``validation.check_duplicates()``
        """
        return validation.check_duplicates(record)

    def check_biosample_id_target(
            self, biosample_id, record_id, record_result):
        """
        Check if a target biosample_id exists or not. If the request to
        BioSamples doesn't return a 200 status, a warning
        ValidationResultColumn is added to record_result

        Args:
            biosample_id (str): the desired biosample id
            record_id (str): is the name of the object in the original data
                source
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            ValidationResult.ValidationResultRecord: an updated
            image_validation object
        """

        url = f"{BIOSAMPLE_URL}/{biosample_id}"

        # NOTE(review): this request has no timeout, so it could wait
        # indefinitely on an unresponsive server. Adding one would raise a
        # new exception type for callers - confirm before changing
        response = requests.get(url)

        if response.status_code != 200:
            record_result.add_validation_result_column(
                ValidationResult.ValidationResultColumn(
                    "Warning",
                    f"Fail to retrieve record {biosample_id} from "
                    "BioSamples as required in the relationship",
                    record_id,
                    'sampleRelationships'))

        return record_result

    def check_relationship(self, record, record_result):
        """
        Check relationship for an Animal/Sample record and return a list
        of dictionaries (to_biosample() objects) of related object

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            tuple: a two items tuple made of

            * list: a list of dictionaries of related objects
            * ValidationResult.ValidationResultRecord: an updated
              image_validation object
        """

        # get relationship from a to_biosample() dictionary object
        relationships = record.get('sampleRelationships', [])

        # as described in image_validation.Submission.Submission
        # same as record["title"], is the original name of the object id DS
        record_id = record['attributes']["Data source ID"][0]['value']

        # related objects (from UID goes here)
        related = []

        for relationship in relationships:
            if 'accession' in relationship:
                target = relationship['accession']

                # check biosample target and update record_result if necessary
                record_result = self.check_biosample_id_target(
                    target, record_id, record_result)

            # HINT: should I check aliases? they came from PK and are related
            # in the same submission. I can't have a sample without an animal
            # since animal is a foreign key of sample (which doesn't tolerate
            # NULL). Even mother and father are related through keys. If
            # missing, no information about mother and father could be
            # determined
            else:
                # could be a parent relationship for an animal, or the animal
                # where this sample comes from
                target = relationship['alias']

                # test for object existence in db. Use biosample.helpers
                # method to derive a model object from database, then get
                # its related data
                try:
                    material_obj = get_model_object(
                        *parse_image_alias(target))
                    related.append(material_obj.to_biosample())

                except ObjectDoesNotExist:
                    # the alias doesn't resolve to a database object: report
                    # it as a validation error and go on
                    record_result.add_validation_result_column(
                        ValidationResult.ValidationResultColumn(
                            "Error",
                            f"Could not locate the referenced record {target}",
                            record_id, 'sampleRelationships'))

        return related, record_result

    def validate(self, record):
        """
        Check attributes for record by calling image_validation methods

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object

        Returns:
            ValidationResult.ValidationResultRecord: an image_validation
            object
        """

        # this validated in general way
        result = self.ruleset.validate(record)

        # as defined in image_validation.Submission, I will skip further
        # validation check
        if result.get_overall_status() == "Error":
            logger.warning(
                "record: %s has errors. Skipping context validation",
                record["title"])

        else:
            # context validation evaluate relationships. Get them
            related, result = self.check_relationship(record, result)

            # this validate context (attributes that depends on another one)
            result = validation.context_validation(record, result, related)

        return result
|
210
|
|
|
|
|
211
|
|
|
|
|
212
|
|
|
class ValidationSummary():
    """A class to deal with error messages and submission.

    Validation messages of the names belonging to a submission are grouped
    into ``self.report``, a dictionary in which each key is a two items
    tuple and each value is ``{'count': <int>, 'ids': [<name id>, ...]}``.
    """

    # Patterns used to parse validation messages. They never change, so
    # they are compiled once for the whole class; they are still reachable
    # as instance attributes (self.pattern1, ...)
    pattern1 = re.compile(
        r"<([^>]*)> of field (.*) \bis \b(.*) for Record")

    # "\bwhich" and not "\which": the latter means "a word character (\w)
    # followed by 'hich'" and matches the word 'which' only by accident
    pattern2 = re.compile(
        r"(.*) for the field (.*) \bwhich \b(.*) for Record")

    pattern3 = re.compile(
        r"Provided value (.*) (does not match to the provided ontology)")

    def __init__(self, submission_obj):
        """Instantiate a report object from Submission

        Args:
            submission_obj: the submission used to filter Name objects
        """

        # get all names belonging to this submission
        self.names = Name.objects.select_related(
            "validationresult",
            "animal",
            "sample").filter(
                submission=submission_obj)

        # querysets are lazy: each count() below, and the iteration over
        # self.errors in process_errors(), executes its own query

        # count animal and samples
        self.n_animals = self.names.filter(animal__isnull=False).count()
        self.n_samples = self.names.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples in total",
            self.n_animals, self.n_samples)

        # count animal and samples with unknown validation
        self.n_animal_unknown = self.names.filter(
            animal__isnull=False, validationresult__isnull=True).count()
        self.n_sample_unknown = self.names.filter(
            sample__isnull=False, validationresult__isnull=True).count()

        logger.debug(
            "Got %s animal and %s samples with unknown validation",
            self.n_animal_unknown, self.n_sample_unknown)

        # filter names which have errors: discard the ones which passed
        # validation or which have no validation result at all
        self.errors = self.names.exclude(
            Q(validationresult__status="Pass") |
            Q(validationresult__isnull=True)
        )

        # count animal and samples with issues
        self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
        self.n_sample_issues = self.errors.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples with issues",
            self.n_animal_issues, self.n_sample_issues)

        # setting report dictionary
        self.report = {}

    def process_errors(self):
        """Process errors and gives hints

        Each message of each name in ``self.errors`` is tried against
        parse1, parse2 and parse3 (in this order, stopping at the first
        success). Messages that can't be parsed are tracked with an
        ``("unmanaged", <first 10 words of the message>)`` key.

        Returns:
            dict: the report, rebuilt from scratch at every call
        """

        # resetting report dictionary
        self.report = {}

        # TODO: track passed objects in report
        for error in self.errors:
            if not hasattr(error, 'validationresult'):
                logger.debug("Ignoring %s", error)
                continue

            for message in error.validationresult.messages:
                if (self.parse1(message, error.id) or
                        self.parse2(message, error.id) or
                        self.parse3(message, error.id)):
                    logger.debug("Processed message: %s", message)

                else:
                    logger.error("Cannot parse: '%s'", message)

                    # assign those values to report
                    key = ("unmanaged", Truncator(message).words(10))
                    self.__update_report(key, error.id)

        return self.report

    def parse1(self, message, error_id):
        """Parse a message using pattern1 and update the report.

        Args:
            message (str): a validation message
            error_id: the id tracked in the report for this message

        Returns:
            bool: True if the message matched (the report is updated with
            a ``(field, reason)`` key), False otherwise
        """
        match = self.pattern1.search(message)

        if match is None:
            return False

        value, field, reason = match.groups()
        logger.debug(
            "parse1: Got '%s','%s' and '%s'", value, field, reason)

        self.__update_report((field, reason), error_id)

        return True

    def parse2(self, message, error_id):
        """Parse a message using pattern2 and update the report.

        Args:
            message (str): a validation message
            error_id: the id tracked in the report for this message

        Returns:
            bool: True if the message matched (the report is updated with
            a ``(field, reason)`` key), False otherwise
        """
        match = self.pattern2.search(message)

        if match is None:
            return False

        reason, field, field_type = match.groups()
        logger.debug(
            "parse2: Got '%s','%s' and '%s'", reason, field, field_type)

        self.__update_report((field, reason), error_id)

        return True

    def parse3(self, message, error_id):
        """Parse a message using pattern3 and update the report.

        Args:
            message (str): a validation message
            error_id: the id tracked in the report for this message

        Returns:
            bool: True if the message matched (the report is updated with
            a ``(value, reason)`` key), False otherwise
        """
        match = self.pattern3.search(message)

        if match is None:
            return False

        value, reason = match.groups()
        logger.debug("parse3: Got '%s' and '%s'", value, reason)

        self.__update_report((value, reason), error_id)

        return True

    def __update_report(self, key, error_id):
        """Count one more occurrence of key and track the id it refers to.

        Args:
            key (tuple): the report key
            error_id: the id to append to the ids tracked for this key
        """
        entry = self.report.setdefault(key, {'count': 0, 'ids': []})
        entry['count'] += 1
        entry['ids'].append(error_id)
|
353
|
|
|
|