validation.helpers - Code Metrics - Inspection of "modelling relationship between objects" - cnr-ibba/IMAGE-InjectTool - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#26)

by Paolo

created 2019-06-10 09:23 UTC

validation.helpers A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	340
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	30
eloc	160
dl	0
loc	340
rs	10
c	0
b	0
f	0

14 Methods

Rating	Name	Size	Complexity
A	MetaDataValidation.check_duplicates()	4	1
A	ValidationSummary.parse3()	15	2
A	ValidationSummary.__init__()	54	1
B	ValidationSummary.process_errors()	27	7
A	ValidationSummary.parse1()	15	2
A	MetaDataValidation.check_usi_structure()	5	1
A	MetaDataValidation.check_relationship()	61	4
A	MetaDataValidation.__init__()	2	1
A	MetaDataValidation.check_biosample_id_target()	32	2
A	ValidationSummary.__update_report()	7	2
A	MetaDataValidation.validate()	30	2
A	ValidationSummary.parse2()	15	2
A	MetaDataValidation.check_ruleset()	4	1
A	MetaDataValidation.read_in_ruleset()	11	2

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 19 16:15:35 2019

@author: Paolo Cozzi <[email protected]>
"""

import re
import json
import logging
import requests

from django.db.models import Q
from django.core.exceptions import ObjectDoesNotExist
from django.utils.text import Truncator

from image_validation import validation, ValidationResult
from image_validation.static_parameters import ruleset_filename as \
    IMAGE_RULESET

from common.constants import BIOSAMPLE_URL
from image_app.models import Name
from biosample.helpers import parse_image_alias, get_model_object

# Get an instance of a logger
logger = logging.getLogger(__name__)


# a class to deal with temporary issues from EBI servers
class OntologyCacheError(Exception):
    """Signal a temporary problem with the EBI servers.

    Raised when image_validation.use_ontology.OntologyCache objects can't
    be built because of such issues.
    """


class MetaDataValidation():
    """A class to deal with IMAGE-ValidationTool ruleset objects

    The ruleset is read when the object is created; the other methods
    delegate to ``image_validation.validation`` and, when checking
    relationships, look for the referenced objects in BioSamples or in the
    local database.
    """

    # ruleset object as returned by validation.read_in_ruleset()
    ruleset = None

    # seconds to wait for BioSamples: without a timeout requests.get() could
    # block forever on an unresponsive server
    biosample_timeout = 60

    def __init__(self, ruleset_filename=IMAGE_RULESET):
        """
        Read the ruleset from ``ruleset_filename``

        Args:
            ruleset_filename: the value passed to
                validation.read_in_ruleset(); defaults to the ruleset
                file name defined by image_validation

        Raises:
            OntologyCacheError: if the ruleset can't be read (see
                read_in_ruleset)
        """

        self.read_in_ruleset(ruleset_filename)

    def read_in_ruleset(self, ruleset_filename):
        """
        Read the ruleset and store it in ``self.ruleset``

        Args:
            ruleset_filename: the value passed to
                validation.read_in_ruleset()

        Raises:
            OntologyCacheError: if reading the ruleset fails with a
                json.JSONDecodeError, which is reported as an issue with
                the EBI OLS API
        """

        try:
            self.ruleset = validation.read_in_ruleset(ruleset_filename)

        except json.JSONDecodeError as exc:
            logger.error(
                "Error with 'https://www.ebi.ac.uk/ols/api/': %s", exc)

            # keep the decoding error as the explicit cause of this one
            raise OntologyCacheError(
                "Issue with 'https://www.ebi.ac.uk/ols/api/'") from exc

    def check_usi_structure(self, record):
        """Check data against USI rules"""

        # this function need its input as a list
        return validation.check_usi_structure(record)

    def check_ruleset(self):
        """Check ruleset"""

        return validation.check_ruleset(self.ruleset)

    def check_duplicates(self, record):
        """Check duplicates in data"""

        return validation.check_duplicates(record)

    def check_biosample_id_target(
            self, biosample_id, record_id, record_result):

        """
        Check if a target biosample_id exists or not. If it is present, ok.
        Otherwise a ValidationResultColumn with a warning is added to
        record_result

        Args:
            biosample_id (str): the desired biosample id
            record_id (str): is the name of the object in the original data
                source
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            ValidationResult.ValidationResultRecord: an updated
            image_validation object

        Raises:
            requests.exceptions.RequestException: errors raised by
                requests.get() (a timeout after ``biosample_timeout``
                seconds included) are not handled here
        """

        url = f"{BIOSAMPLE_URL}/{biosample_id}"
        response = requests.get(url, timeout=self.biosample_timeout)

        # every status different from 200 is treated as a missing record
        if response.status_code != 200:
            record_result.add_validation_result_column(
                ValidationResult.ValidationResultColumn(
                    "Warning",
                    f"Fail to retrieve record {biosample_id} from "
                    "BioSamples as required in the relationship",
                    record_id,
                    'sampleRelationships'))

        return record_result

    def check_relationship(self, record, record_result):
        """
        Check relationship for an Animal/Sample record and return a list
        of dictionaries (to_biosample() objects) of related object

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            tuple: a two elements tuple made of

            * list: a list of dictionaries of related objects
            * ValidationResult.ValidationResultRecord: an updated
              image_validation object
        """

        # as described in image_validation.Submission.Submission
        # same as record["title"], is the original name of the object id DS
        record_id = record['attributes']["Data source ID"][0]['value']

        # related objects (from UID goes here)
        related = []

        # get relationship from a to_biosample() dictionary object
        for relationship in record.get('sampleRelationships', []):
            if 'accession' in relationship:
                target = relationship['accession']

                # check biosample target and update record_result if necessary
                record_result = self.check_biosample_id_target(
                    target, record_id, record_result)

                continue

            # HINT: should I check aliases? they came from PK and are related
            # in the same submission. I can't have a sample without an animal
            # since animal is a foreign key of sample (which doesn't tolerate
            # NULL). Even mother and father are related through keys. If
            # missing, no information about mother and father could be
            # determined

            # could be a parent relationship for an animal, or the animal
            # where this sample comes from
            target = relationship['alias']

            # test for object existence in db. Use biosample.helpers
            # method to derive a model object from database, then get
            # its related data
            try:
                material_obj = get_model_object(
                    *parse_image_alias(target))
                related.append(material_obj.to_biosample())

            except ObjectDoesNotExist:
                record_result.add_validation_result_column(
                    ValidationResult.ValidationResultColumn(
                        "Error",
                        f"Could not locate the referenced record {target}",
                        record_id, 'sampleRelationships'))

        return related, record_result

    def validate(self, record):
        """
        Check attributes for record by calling image_validation methods

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object

        Returns:
            ValidationResult.ValidationResultRecord: an image_validation
            object
        """

        # this validated in general way
        result = self.ruleset.validate(record)

        # as defined in image_validation.Submission, I will skip further
        # validation check
        if result.get_overall_status() == "Error":
            logger.warning(
                "record: %s has errors. Skipping context validation",
                record["title"])

        else:
            # context validation evaluate relationships. Get them
            related, result = self.check_relationship(record, result)

            # this validate context (attributes that depends on another one)
            result = validation.context_validation(record, result, related)

        return result


class ValidationSummary():
    """A class to deal with error messages and submission

    It counts the animals and samples of a submission (total, with unknown
    validation, with issues) and groups the validation messages of the
    objects with issues into ``self.report``.
    """

    # message patterns: compiled once, shared by all instances

    # captures: value (between <>), field and the text after "is "
    pattern1 = re.compile(
        r"<([^>]*)> of field (.*) \bis \b(.*) for Record")

    # captures: reason, field and the text after "which ". The word is
    # anchored with \b: the former "\which" was read as \w followed by
    # "hich", matching "which" only by accident (and "xhich" too)
    pattern2 = re.compile(
        r"(.*) for the field (.*) \bwhich \b(.*) for Record")

    # captures: value and the fixed reason text
    pattern3 = re.compile(
        r"Provided value (.*) (does not match to the provided ontology)")

    def __init__(self, submission_obj):
        """
        Instantiate a report object from Submission

        Args:
            submission_obj: the submission used to filter Name objects
        """

        # get all names belonging to this submission
        self.names = Name.objects.select_related(
                "validationresult",
                "animal",
                "sample").filter(
                    submission=submission_obj)

        # querysets are lazy: a query is executed each time count is called
        # or when iterating a queryset

        # count animal and samples
        self.n_animals = self.names.filter(animal__isnull=False).count()
        self.n_samples = self.names.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples in total",
            self.n_animals, self.n_samples)

        # count animal and samples with unknown validation
        self.n_animal_unknown = self.names.filter(
            animal__isnull=False, validationresult__isnull=True).count()
        self.n_sample_unknown = self.names.filter(
            sample__isnull=False, validationresult__isnull=True).count()

        logger.debug(
            "Got %s animal and %s samples with unknown validation",
            self.n_animal_unknown, self.n_sample_unknown)

        # filter names which have errors: everything validated whose
        # status is not "Pass"
        self.errors = self.names.exclude(
            Q(validationresult__status="Pass") |
            Q(validationresult__isnull=True)
        )

        # count animal and samples with issues
        self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
        self.n_sample_issues = self.errors.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples with issues",
            self.n_animal_issues, self.n_sample_issues)

        # setting report dictionary
        self.report = {}

    def process_errors(self):
        """
        Process errors and gives hints

        Every message of each object in ``self.errors`` is tested against
        the three patterns, stopping at the first one which matches.
        Messages matching no pattern are tracked with an
        ``("unmanaged", <first 10 words of message>)`` key.

        Returns:
            dict: the report, where each key is a two elements tuple and
            each value is a ``{'count': int, 'ids': list}`` dictionary
            (ids are the ids of the objects having that message)
        """

        # resetting report dictionary
        self.report = {}

        parsers = (self.parse1, self.parse2, self.parse3)

        # TODO: track passed objects in report
        for error in self.errors:
            if not hasattr(error, 'validationresult'):
                logger.debug("Ignoring %s", error)
                continue

            for message in error.validationresult.messages:
                # any() stops at the first parser which returns True
                if any(parse(message, error.id) for parse in parsers):
                    logger.debug("Processed message: %s", message)
                    continue

                logger.error("Cannot parse: '%s'", message)

                # assign those values to report
                key = ("unmanaged", Truncator(message).words(10))
                self.__update_report(key, error.id)

        return self.report

    def parse1(self, message, error_id):
        """
        Test message against pattern1 and track it under (field, reason)

        Args:
            message (str): a validation message
            error_id: the id of the object the message belongs to

        Returns:
            bool: True if message matches (report updated), False otherwise
        """

        match = self.pattern1.search(message)

        if not match:
            return False

        value, field, reason = match.groups()
        logger.debug(
            "parse1: Got '%s','%s' and '%s'", value, field, reason)

        self.__update_report((field, reason), error_id)

        return True

    def parse2(self, message, error_id):
        """
        Test message against pattern2 and track it under (field, reason)

        Args:
            message (str): a validation message
            error_id: the id of the object the message belongs to

        Returns:
            bool: True if message matches (report updated), False otherwise
        """

        match = self.pattern2.search(message)

        if not match:
            return False

        reason, field, field_type = match.groups()
        logger.debug(
            "parse2: Got '%s','%s' and '%s'", reason, field, field_type)

        self.__update_report((field, reason), error_id)

        return True

    def parse3(self, message, error_id):
        """
        Test message against pattern3 and track it under (value, reason)

        Args:
            message (str): a validation message
            error_id: the id of the object the message belongs to

        Returns:
            bool: True if message matches (report updated), False otherwise
        """

        match = self.pattern3.search(message)

        if not match:
            return False

        value, reason = match.groups()
        logger.debug("parse3: Got '%s' and '%s'", value, reason)

        self.__update_report((value, reason), error_id)

        return True

    def __update_report(self, key, error_id):
        """Count one more occurrence of key and track error_id for it"""

        entry = self.report.setdefault(key, {'count': 0, 'ids': []})
        entry['count'] += 1
        entry['ids'].append(error_id)


1			#!/usr/bin/env python3
2			# -*- coding: utf-8 -*-
3			"""
4			Created on Tue Feb 19 16:15:35 2019
5
6			@author: Paolo Cozzi <[email protected]>
7			"""
8
9			import re
10			import json
11			import logging
12			import requests
13
14			from django.db.models import Q
15			from django.core.exceptions import ObjectDoesNotExist
16			from django.utils.text import Truncator
17
18			from image_validation import validation, ValidationResult
19			from image_validation.static_parameters import ruleset_filename as \
20			IMAGE_RULESET
21
22			from common.constants import BIOSAMPLE_URL
23			from image_app.models import Name
24			from biosample.helpers import parse_image_alias, get_model_object
25
26			# Get an instance of a logger
27			logger = logging.getLogger(__name__)
28
29
30			# a class to deal with temporary issues from EBI servers
31			class OntologyCacheError(Exception):
32			"""Identifies temporary issues with EBI servers and
33			image_validation.use_ontology.OntologyCache objects"""
34
35
36			class MetaDataValidation():
37			"""A class to deal with IMAGE-ValidationTool ruleset objects"""
38
39			ruleset = None
40
41			def __init__(self, ruleset_filename=IMAGE_RULESET):
42			self.read_in_ruleset(ruleset_filename)
43
44			def read_in_ruleset(self, ruleset_filename):
45			try:
46			self.ruleset = validation.read_in_ruleset(ruleset_filename)
47
48			except json.JSONDecodeError as message:
49			logger.error(
50			"Error with 'https://www.ebi.ac.uk/ols/api/': %s" % (
51			str(message)))
52
53			raise OntologyCacheError(
54			"Issue with 'https://www.ebi.ac.uk/ols/api/'")
55
56			def check_usi_structure(self, record):
57			"""Check data against USI rules"""
58
59			# this function need its input as a list
60			return validation.check_usi_structure(record)
61
62			def check_ruleset(self):
63			"""Check ruleset"""
64
65			return validation.check_ruleset(self.ruleset)
66
67			def check_duplicates(self, record):
68			"""Check duplicates in data"""
69
70			return validation.check_duplicates(record)
71
72			def check_biosample_id_target(
73			self, biosample_id, record_id, record_result):
74
75			"""
76			Check if a target biosample_id exists or not. If it is present, ok.
77			Otherwise a ValidationResultColumn with a warning
78
79			Args:
80			biosample_id (str): the desidered biosample id
81			record_id (str): is the name of the object in the original data
82			source
83			record_result (ValidationResult.ValidationResultRecord):
84			an image_validation result object
85
86			Returns:
87			ValidationResult.ValidationResultRecord: an updated
88			image_validation object
89			"""
90
91			url = f"{BIOSAMPLE_URL}/{biosample_id}"
92			response = requests.get(url)
93			status = response.status_code
94			if status != 200:
95			record_result.add_validation_result_column(
96			ValidationResult.ValidationResultColumn(
97			"Warning",
98			f"Fail to retrieve record {biosample_id} from "
99			f"BioSamples as required in the relationship",
100			record_id,
101			'sampleRelationships'))
102
103			return record_result
104
105			def check_relationship(self, record, record_result):
106			"""
107			Check relationship for an Animal/Sample record and return a list
108			of dictionaries (to_biosample() objects) of related object
109
110			Args:
111			record (dict): An Animal/Sample.to_biosample() dictionary object
112			record_result (ValidationResult.ValidationResultRecord):
113			an image_validation result object
114
115			Returns:
116			list: a list of dictionaries of relate objects
117			ValidationResult.ValidationResultRecord: an updated
118			image_validation object
119			"""
120
121			# get relationship from a to_biosample() dictionary object
122			relationships = record.get('sampleRelationships', [])
123
124			# as described in image_validation.Submission.Submission
125			# same as record["title"], is the original name of the object id DS
126			record_id = record['attributes']["Data source ID"][0]['value']
127
128			# related objects (from UID goes here)
129			related = []
130
131			for relationship in relationships:
132			if 'accession' in relationship:
133			target = relationship['accession']
134
135			# check biosample target and update record_result if necessary
136			record_result = self.check_biosample_id_target(
137			target, record_id, record_result)
138
139			# HINT: should I check aliases? they came from PK and are related
140			# in the same submission. I can't have a sample without an animal
141			# since animal is a foreign key of sample (which doesn't tolerate
142			# NULL). Even mother and father are related through keys. If
143			# missing, no information about mother and father could be
144			# determined
145			else:
146			# could be a parent relationship for an animal, or the animal
147			# where this sample comes from
148			target = relationship['alias']
149
150			# test for object existence in db. Use biosample.helpers
151			# method to derive a model object from database, then get
152			# its related data
153			try:
154			material_obj = get_model_object(
155			*parse_image_alias(target))
156			related.append(material_obj.to_biosample())
157
158			except ObjectDoesNotExist:
159			record_result.add_validation_result_column(
160			ValidationResult.ValidationResultColumn(
161			"Error",
162			f"Could not locate the referenced record {target}",
163			record_id, 'sampleRelationships'))
164
165			return related, record_result
166
167			def validate(self, record):
168			"""
169			Check attributes for record by calling image_validation methods
170
171			Args:
172			record (dict): An Animal/Sample.to_biosample() dictionary object
173
174			Returns:
175			ValidationResult.ValidationResultRecord: an image_validation
176			object
177			"""
178
179			# this validated in general way
180			result = self.ruleset.validate(record)
181
182			# as defined in image_valdiation.Submission, I will skip further
183			# validation check
184			if result.get_overall_status() == "Error":
185			logger.warning(
186			"record: %s has errors. Skipping context validation" % (
187			record["title"]))
188
189			else:
190			# context validation evaluate relationships. Get them
191			related, result = self.check_relationship(record, result)
192
193			# this validate context (attributes that depends on another one)
194			result = validation.context_validation(record, result, related)
195
196			return result
197
198
199			class ValidationSummary():
200			"""A class to deal with error messages and submission"""
201
202			def __init__(self, submission_obj):
203			"""Istantiate a report object from Submission"""
204
205			# get all names belonging to this submission
206			self.names = Name.objects.select_related(
207			"validationresult",
208			"animal",
209			"sample").filter(
210			submission=submission_obj)
211
212			# here I will have 5 queries, each one executed when calling count
213			# or when iterating queryset
214
215			# count animal and samples
216			self.n_animals = self.names.filter(animal__isnull=False).count()
217			self.n_samples = self.names.filter(sample__isnull=False).count()
218
219			logger.debug("Got %s animal and %s samples in total" % (
220			self.n_animals, self.n_samples))
221
222			# count animal and samples with unknown validation
223			self.n_animal_unknown = self.names.filter(
224			animal__isnull=False, validationresult__isnull=True).count()
225			self.n_sample_unknown = self.names.filter(
226			sample__isnull=False, validationresult__isnull=True).count()
227
228			logger.debug("Got %s animal and %s samples with unknown validation" % (
229			self.n_animal_unknown, self.n_sample_unknown))
230
231			# filter names which have errors
232			self.errors = self.names.exclude(
233			Q(validationresult__status="Pass") |
234			Q(validationresult__isnull=True)
235			)
236
237			# count animal and samples with issues
238			self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
239			self.n_sample_issues = self.errors.filter(sample__isnull=False).count()
240
241			logger.debug("Got %s animal and %s samples with issues" % (
242			self.n_animal_issues, self.n_sample_issues))
243
244			# setting patterns
245			self.pattern1 = re.compile(
246			r"<([^>]*)> of field (.*) \bis \b(.*) for Record")
247
248			self.pattern2 = re.compile(
249			r"(.*) for the field (.*) \which \b(.*) for Record")
250
251			self.pattern3 = re.compile(
252			r"Provided value (.*) (does not match to the provided ontology)")
253
254			# setting report dictionary
255			self.report = {}
256
257			def process_errors(self):
258			"""Process errors and gives hints"""
259
260			# resetting report dictionary
261			self.report = {}
262
263			# TODO: track passed objects in report
264			for error in self.errors:
265			if not hasattr(error, 'validationresult'):
266			logger.debug("Ignoring %s" % (error))
267			continue
268
269			for message in error.validationresult.messages:
270			if (self.parse1(message, error.id) or
271			self.parse2(message, error.id) or
272			self.parse3(message, error.id)):
273			logger.debug("Processed message: %s" % (message))
274			else:
275			logger.error("Cannot parse: '%s'" % message)
276
277			# assign those values to report
278			key = ("unmanaged", Truncator(message).words(10))
279			self.__update_report(key, error.id)
280
281			# block error message
282
283			return self.report
284
285			def parse1(self, message, error_id):
286			match = re.search(self.pattern1, message)
287
288			if match:
289			value, field, reason = match.groups()
290			logger.debug("parse1: Got '{}','{}' and '{}'".format(
291			value, field, reason))
292
293			key = (field, reason)
294			self.__update_report(key, error_id)
295
296			return True
297
298			else:
299			return False
300
301			def parse2(self, message, error_id):
302			match = re.search(self.pattern2, message)
303
304			if match:
305			reason, field, field_type = match.groups()
306			logger.debug("parse2: Got '{}','{}' and '{}'".format(
307			reason, field, field_type))
308
309			key = (field, reason)
310			self.__update_report(key, error_id)
311
312			return True
313
314			else:
315			return False
316
317			def parse3(self, message, error_id):
318			match = re.search(self.pattern3, message)
319
320			if match:
321			value, reason = match.groups()
322			logger.debug("parse3: Got '{}' and '{}'".format(
323			value, reason))
324
325			key = (value, reason)
326			self.__update_report(key, error_id)
327
328			return True
329
330			else:
331			return False
332
333			def __update_report(self, key, error_id):
334			if key in self.report:
335			self.report[key]['count'] += 1
336			self.report[key]['ids'] += [error_id]
337
338			else:
339			self.report[key] = {'count': 1, 'ids': [error_id]}
340

cnr-ibba / IMAGE-InjectTool

Pull Request — master (#26)

validation.helpers A

Complexity

Size/Duplication

Importance

14 Methods

Duplication Side-by-Side

Filter issues like