Passed
Pull Request — master (#26)
by Paolo
01:48
created

validation.helpers   A

Complexity

Total Complexity 30

Size/Duplication

Total Lines 340
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 30
eloc 160
dl 0
loc 340
rs 10
c 0
b 0
f 0

14 Methods

Rating   Name   Duplication   Size   Complexity  
A MetaDataValidation.check_duplicates() 0 4 1
A ValidationSummary.parse3() 0 15 2
A ValidationSummary.__init__() 0 54 1
B ValidationSummary.process_errors() 0 27 7
A ValidationSummary.parse1() 0 15 2
A MetaDataValidation.check_usi_structure() 0 5 1
A MetaDataValidation.check_relationship() 0 61 4
A MetaDataValidation.__init__() 0 2 1
A MetaDataValidation.check_biosample_id_target() 0 32 2
A ValidationSummary.__update_report() 0 7 2
A MetaDataValidation.validate() 0 30 2
A ValidationSummary.parse2() 0 15 2
A MetaDataValidation.check_ruleset() 0 4 1
A MetaDataValidation.read_in_ruleset() 0 11 2
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Tue Feb 19 16:15:35 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import re
10
import json
11
import logging
12
import requests
13
14
from django.db.models import Q
15
from django.core.exceptions import ObjectDoesNotExist
16
from django.utils.text import Truncator
17
18
from image_validation import validation, ValidationResult
19
from image_validation.static_parameters import ruleset_filename as \
20
    IMAGE_RULESET
21
22
from common.constants import BIOSAMPLE_URL
23
from image_app.models import Name
24
from biosample.helpers import parse_image_alias, get_model_object
25
26
# Get an instance of a logger
27
# Module-level logger named after this module (``__name__``); it is the
# logger used by the classes defined below.
logger = logging.getLogger(__name__)
28
29
30
# a class to deal with temporary issues from EBI servers
31
class OntologyCacheError(Exception):
    """Signal a temporary problem with the EBI servers.

    Raised in place of lower-level errors coming from
    image_validation.use_ontology.OntologyCache objects, so callers can
    catch a single, dedicated exception type.
    """
34
35
36
class MetaDataValidation():
    """A class to deal with IMAGE-ValidationTool ruleset objects.

    The ruleset is read once when the object is created; the remaining
    methods are thin wrappers around ``image_validation.validation``
    functions plus the relationship checks needed for context validation.
    """

    # the object returned by validation.read_in_ruleset(); set by
    # read_in_ruleset(), which __init__ calls
    ruleset = None

    def __init__(self, ruleset_filename=IMAGE_RULESET):
        """Read the ruleset stored in ``ruleset_filename``.

        Args:
            ruleset_filename (str): path of the ruleset file, by default
                the one shipped with image_validation

        Raises:
            OntologyCacheError: see :py:meth:`read_in_ruleset`
        """

        self.read_in_ruleset(ruleset_filename)

    def read_in_ruleset(self, ruleset_filename):
        """Load the ruleset and store it in ``self.ruleset``.

        Args:
            ruleset_filename (str): path of the ruleset file

        Raises:
            OntologyCacheError: if reading the ruleset fails with a
                ``json.JSONDecodeError``. The original exception is kept
                as ``__cause__``.
        """

        try:
            self.ruleset = validation.read_in_ruleset(ruleset_filename)

        except json.JSONDecodeError as message:
            logger.error(
                "Error with 'https://www.ebi.ac.uk/ols/api/': %s", message)

            # chain the original error, so the traceback shows what was
            # wrong with the decoded document
            raise OntologyCacheError(
                "Issue with 'https://www.ebi.ac.uk/ols/api/'") from message

    def check_usi_structure(self, record):
        """Check data against USI rules.

        Args:
            record (list): the records to check (the wrapped function
                needs its input as a list)

        Returns:
            the value returned by ``validation.check_usi_structure``
        """

        return validation.check_usi_structure(record)

    def check_ruleset(self):
        """Check the loaded ruleset with ``validation.check_ruleset``."""

        return validation.check_ruleset(self.ruleset)

    def check_duplicates(self, record):
        """Check duplicates in data with ``validation.check_duplicates``."""

        return validation.check_duplicates(record)

    def check_biosample_id_target(
            self, biosample_id, record_id, record_result, timeout=30):
        """
        Check if a target biosample_id exists or not. If it is present, ok.
        Otherwise a ValidationResultColumn with a warning is added to
        record_result

        Args:
            biosample_id (str): the desired biosample id
            record_id (str): is the name of the object in the original data
                source
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object
            timeout (float): seconds to wait for BioSamples before giving
                up; passed to ``requests.get``

        Returns:
            ValidationResult.ValidationResultRecord: an updated
            image_validation object

        Raises:
            requests.exceptions.RequestException: network errors and
                timeouts are not handled here and reach the caller
        """

        url = f"{BIOSAMPLE_URL}/{biosample_id}"

        # without a timeout a stalled server would block validation forever
        response = requests.get(url, timeout=timeout)

        # any status different from 200 is reported as a warning
        if response.status_code != 200:
            record_result.add_validation_result_column(
                ValidationResult.ValidationResultColumn(
                    "Warning",
                    f"Fail to retrieve record {biosample_id} from "
                    f"BioSamples as required in the relationship",
                    record_id,
                    'sampleRelationships'))

        return record_result

    def check_relationship(self, record, record_result):
        """
        Check relationship for an Animal/Sample record and return a list
        of dictionaries (to_biosample() objects) of related object

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            list: a list of dictionaries of related objects
            ValidationResult.ValidationResultRecord: an updated
            image_validation object
        """

        # get relationship from a to_biosample() dictionary object
        relationships = record.get('sampleRelationships', [])

        # as described in image_validation.Submission.Submission
        # same as record["title"], is the original name of the object id DS
        record_id = record['attributes']["Data source ID"][0]['value']

        # related objects (from UID goes here)
        related = []

        for relationship in relationships:
            if 'accession' in relationship:
                target = relationship['accession']

                # check biosample target and update record_result if necessary
                record_result = self.check_biosample_id_target(
                    target, record_id, record_result)

            # HINT: should I check aliases? they came from PK and are related
            # in the same submission. I can't have a sample without an animal
            # since animal is a foreign key of sample (which doesn't tolerate
            # NULL). Even mother and father are related through keys. If
            # missing, no information about mother and father could be
            # determined
            else:
                # could be a parent relationship for an animal, or the animal
                # where this sample comes from
                target = relationship['alias']

                # test for object existence in db. Use biosample.helpers
                # method to derive a model object from database, then get
                # its related data
                try:
                    material_obj = get_model_object(
                        *parse_image_alias(target))
                    related.append(material_obj.to_biosample())

                except ObjectDoesNotExist:
                    record_result.add_validation_result_column(
                        ValidationResult.ValidationResultColumn(
                            "Error",
                            f"Could not locate the referenced record {target}",
                            record_id, 'sampleRelationships'))

        return related, record_result

    def validate(self, record):
        """
        Check attributes for record by calling image_validation methods

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object

        Returns:
            ValidationResult.ValidationResultRecord: an image_validation
            object
        """

        # this validated in general way
        result = self.ruleset.validate(record)

        # as defined in image_validation.Submission, I will skip further
        # validation check
        if result.get_overall_status() == "Error":
            logger.warning(
                "record: %s has errors. Skipping context validation",
                record["title"])

        else:
            # context validation evaluate relationships. Get them
            related, result = self.check_relationship(record, result)

            # this validate context (attributes that depends on another one)
            result = validation.context_validation(record, result, related)

        return result
197
198
199
class ValidationSummary():
    """A class to deal with error messages and submission.

    It counts the animals and samples of a submission (in total, with an
    unknown validation status and with issues) and groups the validation
    messages of the records with issues in the ``report`` dictionary.
    """

    # Patterns used to parse validation messages. They are compiled once
    # for the class and are still reachable as self.pattern1, etc.

    # "<value> of field <field> is <reason> for Record"
    pattern1 = re.compile(
        r"<([^>]*)> of field (.*) \bis \b(.*) for Record")

    # "<reason> for the field <field> which <field type> for Record"
    # (the word "which" is matched literally, like "is" in pattern1)
    pattern2 = re.compile(
        r"(.*) for the field (.*) \bwhich \b(.*) for Record")

    # "Provided value <value> does not match to the provided ontology"
    pattern3 = re.compile(
        r"Provided value (.*) (does not match to the provided ontology)")

    def __init__(self, submission_obj):
        """Instantiate a report object from Submission

        Args:
            submission_obj: the submission whose names are summarised
                (used as ``submission=`` filter on ``Name`` objects)
        """

        # get all names belonging to this submission
        self.names = Name.objects.select_related(
                "validationresult",
                "animal",
                "sample").filter(
                    submission=submission_obj)

        # querysets are lazy: a query is executed every time count() is
        # called or when a queryset is iterated

        # count animal and samples
        self.n_animals = self.names.filter(animal__isnull=False).count()
        self.n_samples = self.names.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples in total",
            self.n_animals, self.n_samples)

        # count animal and samples with unknown validation
        self.n_animal_unknown = self.names.filter(
            animal__isnull=False, validationresult__isnull=True).count()
        self.n_sample_unknown = self.names.filter(
            sample__isnull=False, validationresult__isnull=True).count()

        logger.debug(
            "Got %s animal and %s samples with unknown validation",
            self.n_animal_unknown, self.n_sample_unknown)

        # filter names which have errors: everything validated with a
        # status different from "Pass"
        self.errors = self.names.exclude(
            Q(validationresult__status="Pass") |
            Q(validationresult__isnull=True)
        )

        # count animal and samples with issues
        self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
        self.n_sample_issues = self.errors.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples with issues",
            self.n_animal_issues, self.n_sample_issues)

        # setting report dictionary
        self.report = {}

    def process_errors(self):
        """Process errors and gives hints

        Every message of every record in ``self.errors`` is tried against
        parse1, parse2 and parse3 (in this order); the first parser that
        matches updates the report. Messages that no parser recognises are
        collected under an ``("unmanaged", <first 10 words>)`` key.

        Returns:
            dict: the report, where each key is a tuple and each value is
            a dictionary with ``count`` and ``ids`` keys
        """

        # resetting report dictionary
        self.report = {}

        # TODO: track passed objects in report
        for error in self.errors:
            if not hasattr(error, 'validationresult'):
                logger.debug("Ignoring %s", error)
                continue

            for message in error.validationresult.messages:
                if (self.parse1(message, error.id) or
                        self.parse2(message, error.id) or
                        self.parse3(message, error.id)):
                    logger.debug("Processed message: %s", message)

                else:
                    logger.error("Cannot parse: '%s'", message)

                    # assign those values to report
                    key = ("unmanaged", Truncator(message).words(10))
                    self.__update_report(key, error.id)

        return self.report

    def parse1(self, message, error_id):
        """Parse message with pattern1 and track it as (field, reason).

        Args:
            message (str): a validation message
            error_id: the id of the record the message belongs to

        Returns:
            bool: True if the message was recognised, False otherwise
        """

        match = self.pattern1.search(message)

        if not match:
            return False

        value, field, reason = match.groups()
        logger.debug("parse1: Got '%s','%s' and '%s'", value, field, reason)

        self.__update_report((field, reason), error_id)

        return True

    def parse2(self, message, error_id):
        """Parse message with pattern2 and track it as (field, reason).

        Args:
            message (str): a validation message
            error_id: the id of the record the message belongs to

        Returns:
            bool: True if the message was recognised, False otherwise
        """

        match = self.pattern2.search(message)

        if not match:
            return False

        reason, field, field_type = match.groups()
        logger.debug(
            "parse2: Got '%s','%s' and '%s'", reason, field, field_type)

        self.__update_report((field, reason), error_id)

        return True

    def parse3(self, message, error_id):
        """Parse message with pattern3 and track it as (value, reason).

        Args:
            message (str): a validation message
            error_id: the id of the record the message belongs to

        Returns:
            bool: True if the message was recognised, False otherwise
        """

        match = self.pattern3.search(message)

        if not match:
            return False

        value, reason = match.groups()
        logger.debug("parse3: Got '%s' and '%s'", value, reason)

        self.__update_report((value, reason), error_id)

        return True

    def __update_report(self, key, error_id):
        """Count one more occurrence of key and remember error_id.

        Args:
            key (tuple): the report key
            error_id: the id of the record to add under ``ids``
        """

        if key in self.report:
            entry = self.report[key]
            entry['count'] += 1
            entry['ids'].append(error_id)

        else:
            self.report[key] = {'count': 1, 'ids': [error_id]}
340