Passed
Pull Request — master (#26)
by Paolo
02:00
created

ValidationSummary.process_errors()   B

Complexity

Conditions 7

Size

Total Lines 27
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 15
dl 0
loc 27
rs 8
c 0
b 0
f 0
cc 7
nop 1
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Tue Feb 19 16:15:35 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import re
10
import json
11
import logging
12
import requests
13
14
from django.db.models import Q
15
from django.core.exceptions import ObjectDoesNotExist
16
from django.utils.text import Truncator
17
18
from image_validation import validation, ValidationResult
19
from image_validation.static_parameters import ruleset_filename as \
20
    IMAGE_RULESET
21
22
from common.constants import BIOSAMPLE_URL
23
from image_app.models import Name
24
from biosample.helpers import parse_image_alias, get_model_object
25
26
# Module-level logger, named after this module so that log records can be
# filtered/configured per module by the logging configuration
logger = logging.getLogger(__name__)
28
29
30
# a class to deal with temporary issues from EBI servers
31
class OntologyCacheError(Exception):
    """Identifies temporary issues with EBI servers and
    image_validation.use_ontology.OntologyCache objects"""
34
35
36
# a class to deal with errors in ruleset (that are not user errors but
37
# errors within InjectTool and image_validation library)
38
class RulesetError(Exception):
    """Identifies errors in ruleset"""
40
41
42
class MetaDataValidation():
    """A class to deal with IMAGE-ValidationTool ruleset objects

    On instantiation the ruleset is read through
    ``image_validation.validation.read_in_ruleset`` and checked with
    ``image_validation.validation.check_ruleset``.
    """

    # the ruleset object read by read_in_ruleset()
    ruleset = None

    # seconds to wait for BioSamples before giving up: without a timeout
    # requests.get() could block forever on an unresponsive server
    biosample_timeout = 30

    def __init__(self, ruleset_filename=IMAGE_RULESET):
        """
        Read the ruleset and check it

        Args:
            ruleset_filename (str): the ruleset file to read (defaults to
                the one shipped with image_validation)

        Raises:
            OntologyCacheError: if the ruleset can't be read because of a
                JSON decoding issue
            RulesetError: if check_ruleset() reports any error
        """

        self.read_in_ruleset(ruleset_filename)

        # check validation rules
        ruleset_errors = self.check_ruleset()

        if ruleset_errors:
            raise RulesetError(
                "Error with ruleset: %s" % "; ".join(ruleset_errors))

    def read_in_ruleset(self, ruleset_filename):
        """
        Read the ruleset and store it in ``self.ruleset``

        Args:
            ruleset_filename (str): the ruleset file to read

        Raises:
            OntologyCacheError: when image_validation raises a
                json.JSONDecodeError while reading the ruleset. The
                original exception is chained as the cause
        """

        try:
            self.ruleset = validation.read_in_ruleset(ruleset_filename)

        except json.JSONDecodeError as message:
            logger.error(
                "Error with 'https://www.ebi.ac.uk/ols/api/': %s", message)

            # keep track of the original error in the traceback
            raise OntologyCacheError(
                "Issue with 'https://www.ebi.ac.uk/ols/api/'") from message

    def check_usi_structure(self, record):
        """Check data against USI rules

        Args:
            record: data passed as is to
                image_validation.validation.check_usi_structure (the
                original author notes that a list is required)

        Returns:
            whatever validation.check_usi_structure returns
        """

        # this function need its input as a list
        return validation.check_usi_structure(record)

    def check_ruleset(self):
        """Check ruleset

        Returns:
            whatever validation.check_ruleset returns for ``self.ruleset``
            (__init__ treats it as a collection of error strings)
        """

        return validation.check_ruleset(self.ruleset)

    def check_duplicates(self, record):
        """Check duplicates in data

        Args:
            record: data passed as is to
                image_validation.validation.check_duplicates

        Returns:
            whatever validation.check_duplicates returns
        """

        return validation.check_duplicates(record)

    def check_biosample_id_target(
            self, biosample_id, record_id, record_result):

        """
        Check if a target biosample_id exists or not. If it is present, ok.
        Otherwise a ValidationResultColumn with a warning is added to
        record_result

        Args:
            biosample_id (str): the desired biosample id
            record_id (str): is the name of the object in the original data
                source
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            ValidationResult.ValidationResultRecord: an updated
            image_validation object

        Raises:
            requests.exceptions.RequestException: on network issues,
                including no answer within ``biosample_timeout`` seconds
        """

        url = f"{BIOSAMPLE_URL}/{biosample_id}"

        # never wait forever for a remote server
        response = requests.get(url, timeout=self.biosample_timeout)

        # any status different from 200 is reported as a warning
        if response.status_code != 200:
            record_result.add_validation_result_column(
                ValidationResult.ValidationResultColumn(
                    "Warning",
                    f"Fail to retrieve record {biosample_id} from "
                    f"BioSamples as required in the relationship",
                    record_id,
                    'sampleRelationships'))

        return record_result

    def check_relationship(self, record, record_result):
        """
        Check relationship for an Animal/Sample record and return a list
        of dictionaries (to_biosample() objects) of related object

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            list: a list of dictionaries of related objects
            ValidationResult.ValidationResultRecord: an updated
            image_validation object
        """

        # get relationship from a to_biosample() dictionary object
        relationships = record.get('sampleRelationships', [])

        # as described in image_validation.Submission.Submission
        # same as record["title"], is the original name of the object id DS
        record_id = record['attributes']["Data source ID"][0]['value']

        # related objects (from UID goes here)
        related = []

        for relationship in relationships:
            if 'accession' in relationship:
                target = relationship['accession']

                # check biosample target and update record_result if necessary
                record_result = self.check_biosample_id_target(
                    target, record_id, record_result)

                continue

            # HINT: should I check aliases? they came from PK and are related
            # in the same submission. I can't have a sample without an animal
            # since animal is a foreign key of sample (which doesn't tolerate
            # NULL). Even mother and father are related through keys. If
            # missing, no information about mother and father could be
            # determined

            # could be a parent relationship for an animal, or the animal
            # where this sample comes from
            target = relationship['alias']

            # test for object existence in db. Use biosample.helpers
            # method to derive a model object from database, then get
            # its related data
            try:
                material_obj = get_model_object(
                    *parse_image_alias(target))
                related.append(material_obj.to_biosample())

            except ObjectDoesNotExist:
                record_result.add_validation_result_column(
                    ValidationResult.ValidationResultColumn(
                        "Error",
                        f"Could not locate the referenced record {target}",
                        record_id, 'sampleRelationships'))

        return related, record_result

    def validate(self, record):
        """
        Check attributes for record by calling image_validation methods

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object

        Returns:
            ValidationResult.ValidationResultRecord: an image_validation
            object
        """

        # this validated in general way
        result = self.ruleset.validate(record)

        # as defined in image_validation.Submission, I will skip further
        # validation check
        if result.get_overall_status() == "Error":
            logger.warning(
                "record: %s has errors. Skipping context validation",
                record["title"])

        else:
            # context validation evaluate relationships. Get them
            related, result = self.check_relationship(record, result)

            # this validate context (attributes that depends on another one)
            result = validation.context_validation(record, result, related)

        return result
210
211
212
class ValidationSummary():
    """A class to deal with error messages and submission

    Validation messages are matched against three patterns and grouped in
    the ``report`` dictionary: each key is a tuple built from the matched
    groups, each value a dictionary with a ``count`` and the list of
    ``ids`` of the names in which the message was found.
    """

    # message patterns: compiled once for the class since they never change
    pattern1 = re.compile(
        r"<([^>]*)> of field (.*) \bis \b(.*) for Record")

    # "\bwhich" (word boundary + "which"): "\which" would read as "\w"
    # followed by "hich", matching any word character in place of the "w"
    pattern2 = re.compile(
        r"(.*) for the field (.*) \bwhich \b(.*) for Record")

    pattern3 = re.compile(
        r"Provided value (.*) (does not match to the provided ontology)")

    def __init__(self, submission_obj):
        """Instantiate a report object from Submission

        Args:
            submission_obj: the submission used to filter Name objects
        """

        # get all names belonging to this submission
        self.names = Name.objects.select_related(
                "validationresult",
                "animal",
                "sample").filter(
                    submission=submission_obj)

        # here I will have 5 queries, each one executed when calling count
        # or when iterating queryset

        # count animal and samples
        self.n_animals = self.names.filter(animal__isnull=False).count()
        self.n_samples = self.names.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples in total",
            self.n_animals, self.n_samples)

        # count animal and samples with unknown validation
        self.n_animal_unknown = self.names.filter(
            animal__isnull=False, validationresult__isnull=True).count()
        self.n_sample_unknown = self.names.filter(
            sample__isnull=False, validationresult__isnull=True).count()

        logger.debug(
            "Got %s animal and %s samples with unknown validation",
            self.n_animal_unknown, self.n_sample_unknown)

        # filter names which have errors
        self.errors = self.names.exclude(
            Q(validationresult__status="Pass") |
            Q(validationresult__isnull=True)
        )

        # count animal and samples with issues
        self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
        self.n_sample_issues = self.errors.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples with issues",
            self.n_animal_issues, self.n_sample_issues)

        # setting report dictionary
        self.report = {}

    def process_errors(self):
        """Process errors and gives hints

        Returns:
            dict: the report dictionary, rebuilt from scratch at every call
        """

        # resetting report dictionary
        self.report = {}

        # TODO: track passed objects in report
        for error in self.errors:
            if not hasattr(error, 'validationresult'):
                logger.debug("Ignoring %s", error)
                continue

            for message in error.validationresult.messages:
                # the first parser matching the message updates the report
                if (self.parse1(message, error.id) or
                        self.parse2(message, error.id) or
                        self.parse3(message, error.id)):
                    logger.debug("Processed message: %s", message)

                else:
                    logger.error("Cannot parse: '%s'", message)

                    # assign those values to report
                    key = ("unmanaged", Truncator(message).words(10))
                    self.__update_report(key, error.id)

        return self.report

    def parse1(self, message, error_id):
        """Search pattern1 in message and track (field, reason) in report

        Args:
            message (str): a validation message
            error_id: the id to record in the report

        Returns:
            bool: True if the message matched (report updated), else False
        """

        match = self.pattern1.search(message)

        if not match:
            return False

        value, field, reason = match.groups()
        logger.debug(
            "parse1: Got '%s','%s' and '%s'", value, field, reason)

        self.__update_report((field, reason), error_id)

        return True

    def parse2(self, message, error_id):
        """Search pattern2 in message and track (field, reason) in report

        Args:
            message (str): a validation message
            error_id: the id to record in the report

        Returns:
            bool: True if the message matched (report updated), else False
        """

        match = self.pattern2.search(message)

        if not match:
            return False

        reason, field, field_type = match.groups()
        logger.debug(
            "parse2: Got '%s','%s' and '%s'", reason, field, field_type)

        self.__update_report((field, reason), error_id)

        return True

    def parse3(self, message, error_id):
        """Search pattern3 in message and track (value, reason) in report

        Args:
            message (str): a validation message
            error_id: the id to record in the report

        Returns:
            bool: True if the message matched (report updated), else False
        """

        match = self.pattern3.search(message)

        if not match:
            return False

        value, reason = match.groups()
        logger.debug("parse3: Got '%s' and '%s'", value, reason)

        self.__update_report((value, reason), error_id)

        return True

    def __update_report(self, key, error_id):
        """Count one more occurrence of key and record error_id for it"""

        entry = self.report.setdefault(key, {'count': 0, 'ids': []})
        entry['count'] += 1
        entry['ids'].append(error_id)
353