validation.helpers - Code Metrics - Inspection of "modelling relationship between objects" - cnr-ibba/IMAGE-InjectTool - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#26)

by Paolo

created 2019-06-10 16:21 UTC

validation.helpers A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	353
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	31
eloc	165
dl	0
loc	353
rs	9.92
c	0
b	0
f	0

14 Methods

Rating	Name	Size	Complexity
A	MetaDataValidation.check_duplicates()	4	1
A	ValidationSummary.parse3()	15	2
A	ValidationSummary.__init__()	54	1
B	ValidationSummary.process_errors()	27	7
A	ValidationSummary.parse1()	15	2
A	MetaDataValidation.check_usi_structure()	5	1
A	MetaDataValidation.check_relationship()	61	4
A	MetaDataValidation.__init__()	9	2
A	MetaDataValidation.check_biosample_id_target()	32	2
A	ValidationSummary.__update_report()	7	2
A	MetaDataValidation.validate()	30	2
A	ValidationSummary.parse2()	15	2
A	MetaDataValidation.check_ruleset()	4	1
A	MetaDataValidation.read_in_ruleset()	11	2

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 19 16:15:35 2019

@author: Paolo Cozzi <[email protected]>
"""

import re
import json
import logging
import requests

from django.db.models import Q
from django.core.exceptions import ObjectDoesNotExist
from django.utils.text import Truncator

from image_validation import validation, ValidationResult
from image_validation.static_parameters import ruleset_filename as \
    IMAGE_RULESET

from common.constants import BIOSAMPLE_URL
from image_app.models import Name
from biosample.helpers import parse_image_alias, get_model_object

# Get an instance of a logger
logger = logging.getLogger(__name__)


# a class to deal with temporary issues from EBI servers
class OntologyCacheError(Exception):
    """Signal a temporary problem with the EBI servers.

    Used for transient failures involving the EBI services and the
    image_validation.use_ontology.OntologyCache objects.
    """


# a class to deal with errors in ruleset (that are not user errors but
# errors within InjectTool and image_validation library)
class RulesetError(Exception):
    """Identify errors found in the validation ruleset.

    These are not user errors: they come from InjectTool itself or from the
    image_validation library.
    """


class MetaDataValidation():
    """A class to deal with IMAGE-ValidationTool ruleset objects"""

    ruleset = None

    def __init__(self, ruleset_filename=IMAGE_RULESET):
        """Load a ruleset and verify it before any record is validated.

        Args:
            ruleset_filename (str): path of the ruleset to read. Defaults to
                the ``ruleset_filename`` imported from
                image_validation.static_parameters

        Raises:
            OntologyCacheError: propagated from read_in_ruleset() when the
                ruleset can't be read because of a JSON decoding error
            RulesetError: if check_ruleset() reports at least one problem
        """
        self.read_in_ruleset(ruleset_filename)

        # check validation rules
        ruleset_errors = self.check_ruleset()

        # any reported message means the ruleset itself is broken: this is
        # not a user error, so stop here
        if ruleset_errors:
            raise RulesetError(
                "Error with ruleset: %s" % "; ".join(ruleset_errors))

    def read_in_ruleset(self, ruleset_filename):
        """Read a ruleset through image_validation and store it.

        The loaded object is kept in ``self.ruleset``.

        Args:
            ruleset_filename (str): path of the ruleset to read

        Raises:
            OntologyCacheError: if reading the ruleset fails with a
                json.JSONDecodeError. The code treats this as an issue with
                the EBI OLS API (presumably the ontology data are fetched
                while the ruleset is read -- confirm against
                image_validation)
        """
        try:
            self.ruleset = validation.read_in_ruleset(ruleset_filename)

        except json.JSONDecodeError as message:
            logger.error(
                "Error with 'https://www.ebi.ac.uk/ols/api/': %s", message)

            # chain explicitly, so the traceback reports the decoding failure
            # as the direct cause of this error
            raise OntologyCacheError(
                "Issue with 'https://www.ebi.ac.uk/ols/api/'") from message

    def check_usi_structure(self, record):
        """Run the image_validation USI structure check on the given data.

        Args:
            record: the data to check. The underlying function needs its
                input as a list

        Returns:
            the value returned by validation.check_usi_structure()
        """

        usi_result = validation.check_usi_structure(record)
        return usi_result

    def check_ruleset(self):
        """Run the image_validation ruleset check on ``self.ruleset``.

        Returns:
            the value returned by validation.check_ruleset(); __init__ joins
            it with "; " to build the RulesetError message
        """

        ruleset_check = validation.check_ruleset(self.ruleset)
        return ruleset_check

    def check_duplicates(self, record):
        """Run the image_validation duplicates check on the given data.

        Args:
            record: the data to inspect for duplicates

        Returns:
            the value returned by validation.check_duplicates()
        """

        duplicates_check = validation.check_duplicates(record)
        return duplicates_check

    def check_biosample_id_target(
            self, biosample_id, record_id, record_result, timeout=30):

        """
        Check if a target biosample_id exists or not. If the request answers
        with HTTP 200 nothing is changed, otherwise a ValidationResultColumn
        with a warning is added to record_result

        Args:
            biosample_id (str): the desired biosample id
            record_id (str): is the name of the object in the original data
                source
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object
            timeout (float): seconds to wait for the BioSamples server
                before giving up (passed to requests.get)

        Returns:
            ValidationResult.ValidationResultRecord: an updated
            image_validation object

        Raises:
            requests.exceptions.RequestException: network problems
                (timeouts included) are not handled here and reach the caller
        """

        url = f"{BIOSAMPLE_URL}/{biosample_id}"

        # without a timeout requests waits indefinitely for an unresponsive
        # server, blocking the whole validation
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            record_result.add_validation_result_column(
                ValidationResult.ValidationResultColumn(
                    "Warning",
                    f"Fail to retrieve record {biosample_id} from "
                    f"BioSamples as required in the relationship",
                    record_id,
                    'sampleRelationships'))

        return record_result

    def check_relationship(self, record, record_result):
        """
        Resolve the relationships of an Animal/Sample record.

        A relationship with an 'accession' key is checked against BioSamples;
        any other relationship is resolved through its 'alias' key against
        the database, and the to_biosample() dictionary of the referenced
        object is collected.

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object
            record_result (ValidationResult.ValidationResultRecord):
                an image_validation result object

        Returns:
            tuple: the list of to_biosample() dictionaries of the related
            objects and the (possibly updated)
            ValidationResult.ValidationResultRecord
        """

        # relationships are optional in a to_biosample() dictionary
        links = record.get('sampleRelationships', [])

        # as described in image_validation.Submission.Submission: same as
        # record["title"], the original name of the object in the data source
        record_id = record['attributes']["Data source ID"][0]['value']

        # to_biosample() data of objects found in the database
        related = []

        for link in links:
            if 'accession' in link:
                # the target lives in BioSamples: check it and update
                # record_result if necessary
                record_result = self.check_biosample_id_target(
                    link['accession'], record_id, record_result)
                continue

            # HINT: should aliases be checked? They come from primary keys and
            # are related in the same submission. A sample can't exist without
            # its animal, since animal is a foreign key of sample (which
            # doesn't tolerate NULL). Mother and father are related through
            # keys too: if missing, nothing could be determined about them.

            # either a parent relationship for an animal, or the animal this
            # sample comes from
            alias = link['alias']

            # derive the model object from the alias with biosample.helpers
            # functions, then get its related data
            try:
                model_args = parse_image_alias(alias)
                material_obj = get_model_object(*model_args)
                biosample_data = material_obj.to_biosample()

            except ObjectDoesNotExist:
                record_result.add_validation_result_column(
                    ValidationResult.ValidationResultColumn(
                        "Error",
                        f"Could not locate the referenced record {alias}",
                        record_id, 'sampleRelationships'))

            else:
                related.append(biosample_data)

        return related, record_result

    def validate(self, record):
        """
        Validate a record against the ruleset, then against its context.

        The context validation (relationships and attributes depending on
        other ones) is run only when the general validation has no errors.

        Args:
            record (dict): An Animal/Sample.to_biosample() dictionary object

        Returns:
            ValidationResult.ValidationResultRecord: an image_validation
            object
        """

        # general validation of the record attributes
        result = self.ruleset.validate(record)

        if result.get_overall_status() != "Error":
            # context validation evaluates relationships: collect them first
            related, result = self.check_relationship(record, result)

            # validate attributes that depend on another one
            return validation.context_validation(record, result, related)

        # as defined in image_validation.Submission, no further check is done
        # on a record which already has errors
        logger.warning(
            "record: %s has errors. Skipping context validation" % (
                    record["title"]))

        return result


class ValidationSummary():
    """A class to deal with error messages and submission"""

    def __init__(self, submission_obj):
        """Instantiate a report object from a Submission.

        Counts the animals and samples of the submission (total, without a
        validation result, with issues), prepares the queryset of names with
        issues and compiles the patterns used to parse validation messages.

        Args:
            submission_obj: the Submission whose Name objects are summarized
        """

        # get all names belonging to this submission
        self.names = Name.objects.select_related(
                "validationresult",
                "animal",
                "sample").filter(
                    submission=submission_obj)

        # querysets are lazy: a query is executed every time count() is
        # called, or when a queryset is iterated

        # count animal and samples
        self.n_animals = self.names.filter(animal__isnull=False).count()
        self.n_samples = self.names.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples in total",
            self.n_animals, self.n_samples)

        # count animal and samples with unknown validation
        self.n_animal_unknown = self.names.filter(
            animal__isnull=False, validationresult__isnull=True).count()
        self.n_sample_unknown = self.names.filter(
            sample__isnull=False, validationresult__isnull=True).count()

        logger.debug(
            "Got %s animal and %s samples with unknown validation",
            self.n_animal_unknown, self.n_sample_unknown)

        # names with issues: validated, and with a status different from Pass
        self.errors = self.names.exclude(
            Q(validationresult__status="Pass") |
            Q(validationresult__isnull=True)
        )

        # count animal and samples with issues
        self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
        self.n_sample_issues = self.errors.filter(sample__isnull=False).count()

        logger.debug(
            "Got %s animal and %s samples with issues",
            self.n_animal_issues, self.n_sample_issues)

        # patterns used by parse1(), parse2() and parse3() respectively
        self.pattern1 = re.compile(
            r"<([^>]*)> of field (.*) \bis \b(.*) for Record")

        # "\bwhich": the previous "\which" was read by the regex engine as
        # "\w" followed by "hich", matching any word character before "hich"
        self.pattern2 = re.compile(
            r"(.*) for the field (.*) \bwhich \b(.*) for Record")

        self.pattern3 = re.compile(
            r"Provided value (.*) (does not match to the provided ontology)")

        # report dictionary, filled by process_errors()
        self.report = {}

    def process_errors(self):
        """Process errors and gives hints"""

        # resetting report dictionary
        self.report = {}

        # TODO: track passed objects in report
        for error in self.errors:
            if not hasattr(error, 'validationresult'):
                logger.debug("Ignoring %s" % (error))
                continue

            for message in error.validationresult.messages:
                if (self.parse1(message, error.id) or
                        self.parse2(message, error.id) or
                        self.parse3(message, error.id)):
                    logger.debug("Processed message: %s" % (message))
                else:
                    logger.error("Cannot parse: '%s'" % message)

                    # assign those values to report
                    key = ("unmanaged", Truncator(message).words(10))
                    self.__update_report(key, error.id)

            # block error message

        return self.report

    def parse1(self, message, error_id):
        match = re.search(self.pattern1, message)

        if match:
            value, field, reason = match.groups()
            logger.debug("parse1: Got '{}','{}' and '{}'".format(
                    value, field, reason))

            key = (field, reason)
            self.__update_report(key, error_id)

            return True

        else:
            return False

    def parse2(self, message, error_id):
        match = re.search(self.pattern2, message)

        if match:
            reason, field, field_type = match.groups()
            logger.debug("parse2: Got '{}','{}' and '{}'".format(
                    reason, field, field_type))

            key = (field, reason)
            self.__update_report(key, error_id)

            return True

        else:
            return False

    def parse3(self, message, error_id):
        match = re.search(self.pattern3, message)

        if match:
            value, reason = match.groups()
            logger.debug("parse3: Got '{}' and '{}'".format(
                    value, reason))

            key = (value, reason)
            self.__update_report(key, error_id)

            return True

        else:
            return False

    def __update_report(self, key, error_id):
        if key in self.report:
            self.report[key]['count'] += 1
            self.report[key]['ids'] += [error_id]

        else:
            self.report[key] = {'count': 1, 'ids': [error_id]}


1			#!/usr/bin/env python3
2			# -*- coding: utf-8 -*-
3			"""
4			Created on Tue Feb 19 16:15:35 2019
5
6			@author: Paolo Cozzi <[email protected]>
7			"""
8
9			import re
10			import json
11			import logging
12			import requests
13
14			from django.db.models import Q
15			from django.core.exceptions import ObjectDoesNotExist
16			from django.utils.text import Truncator
17
18			from image_validation import validation, ValidationResult
19			from image_validation.static_parameters import ruleset_filename as \
20			IMAGE_RULESET
21
22			from common.constants import BIOSAMPLE_URL
23			from image_app.models import Name
24			from biosample.helpers import parse_image_alias, get_model_object
25
26			# Get an instance of a logger
27			logger = logging.getLogger(__name__)
28
29
30			# a class to deal with temporary issues from EBI servers
31			class OntologyCacheError(Exception):
32			"""Identifies temporary issues with EBI servers and
33			image_validation.use_ontology.OntologyCache objects"""
34
35
36			# a class to deal with errors in ruleset (that are not user errors but
37			# errors within InjectTool and image_validation library)
38			class RulesetError(Exception):
39			"""Indentifies errors in ruleset"""
40
41
42			class MetaDataValidation():
43			"""A class to deal with IMAGE-ValidationTool ruleset objects"""
44
45			ruleset = None
46
47			def __init__(self, ruleset_filename=IMAGE_RULESET):
48			self.read_in_ruleset(ruleset_filename)
49
50			# check validation rules
51			ruleset_errors = self.check_ruleset()
52
53			if ruleset_errors != []:
54			raise RulesetError(
55			"Error with ruleset: %s" % "; ".join(ruleset_errors))
56
57			def read_in_ruleset(self, ruleset_filename):
58			try:
59			self.ruleset = validation.read_in_ruleset(ruleset_filename)
60
61			except json.JSONDecodeError as message:
62			logger.error(
63			"Error with 'https://www.ebi.ac.uk/ols/api/': %s" % (
64			str(message)))
65
66			raise OntologyCacheError(
67			"Issue with 'https://www.ebi.ac.uk/ols/api/'")
68
69			def check_usi_structure(self, record):
70			"""Check data against USI rules"""
71
72			# this function need its input as a list
73			return validation.check_usi_structure(record)
74
75			def check_ruleset(self):
76			"""Check ruleset"""
77
78			return validation.check_ruleset(self.ruleset)
79
80			def check_duplicates(self, record):
81			"""Check duplicates in data"""
82
83			return validation.check_duplicates(record)
84
85			def check_biosample_id_target(
86			self, biosample_id, record_id, record_result):
87
88			"""
89			Check if a target biosample_id exists or not. If it is present, ok.
90			Otherwise a ValidationResultColumn with a warning
91
92			Args:
93			biosample_id (str): the desidered biosample id
94			record_id (str): is the name of the object in the original data
95			source
96			record_result (ValidationResult.ValidationResultRecord):
97			an image_validation result object
98
99			Returns:
100			ValidationResult.ValidationResultRecord: an updated
101			image_validation object
102			"""
103
104			url = f"{BIOSAMPLE_URL}/{biosample_id}"
105			response = requests.get(url)
106			status = response.status_code
107			if status != 200:
108			record_result.add_validation_result_column(
109			ValidationResult.ValidationResultColumn(
110			"Warning",
111			f"Fail to retrieve record {biosample_id} from "
112			f"BioSamples as required in the relationship",
113			record_id,
114			'sampleRelationships'))
115
116			return record_result
117
118			def check_relationship(self, record, record_result):
119			"""
120			Check relationship for an Animal/Sample record and return a list
121			of dictionaries (to_biosample() objects) of related object
122
123			Args:
124			record (dict): An Animal/Sample.to_biosample() dictionary object
125			record_result (ValidationResult.ValidationResultRecord):
126			an image_validation result object
127
128			Returns:
129			list: a list of dictionaries of relate objects
130			ValidationResult.ValidationResultRecord: an updated
131			image_validation object
132			"""
133
134			# get relationship from a to_biosample() dictionary object
135			relationships = record.get('sampleRelationships', [])
136
137			# as described in image_validation.Submission.Submission
138			# same as record["title"], is the original name of the object id DS
139			record_id = record['attributes']["Data source ID"][0]['value']
140
141			# related objects (from UID goes here)
142			related = []
143
144			for relationship in relationships:
145			if 'accession' in relationship:
146			target = relationship['accession']
147
148			# check biosample target and update record_result if necessary
149			record_result = self.check_biosample_id_target(
150			target, record_id, record_result)
151
152			# HINT: should I check aliases? they came from PK and are related
153			# in the same submission. I can't have a sample without an animal
154			# since animal is a foreign key of sample (which doesn't tolerate
155			# NULL). Even mother and father are related through keys. If
156			# missing, no information about mother and father could be
157			# determined
158			else:
159			# could be a parent relationship for an animal, or the animal
160			# where this sample comes from
161			target = relationship['alias']
162
163			# test for object existence in db. Use biosample.helpers
164			# method to derive a model object from database, then get
165			# its related data
166			try:
167			material_obj = get_model_object(
168			*parse_image_alias(target))
169			related.append(material_obj.to_biosample())
170
171			except ObjectDoesNotExist:
172			record_result.add_validation_result_column(
173			ValidationResult.ValidationResultColumn(
174			"Error",
175			f"Could not locate the referenced record {target}",
176			record_id, 'sampleRelationships'))
177
178			return related, record_result
179
180			def validate(self, record):
181			"""
182			Check attributes for record by calling image_validation methods
183
184			Args:
185			record (dict): An Animal/Sample.to_biosample() dictionary object
186
187			Returns:
188			ValidationResult.ValidationResultRecord: an image_validation
189			object
190			"""
191
192			# this validated in general way
193			result = self.ruleset.validate(record)
194
195			# as defined in image_valdiation.Submission, I will skip further
196			# validation check
197			if result.get_overall_status() == "Error":
198			logger.warning(
199			"record: %s has errors. Skipping context validation" % (
200			record["title"]))
201
202			else:
203			# context validation evaluate relationships. Get them
204			related, result = self.check_relationship(record, result)
205
206			# this validate context (attributes that depends on another one)
207			result = validation.context_validation(record, result, related)
208
209			return result
210
211
212			class ValidationSummary():
213			"""A class to deal with error messages and submission"""
214
215			def __init__(self, submission_obj):
216			"""Istantiate a report object from Submission"""
217
218			# get all names belonging to this submission
219			self.names = Name.objects.select_related(
220			"validationresult",
221			"animal",
222			"sample").filter(
223			submission=submission_obj)
224
225			# here I will have 5 queries, each one executed when calling count
226			# or when iterating queryset
227
228			# count animal and samples
229			self.n_animals = self.names.filter(animal__isnull=False).count()
230			self.n_samples = self.names.filter(sample__isnull=False).count()
231
232			logger.debug("Got %s animal and %s samples in total" % (
233			self.n_animals, self.n_samples))
234
235			# count animal and samples with unknown validation
236			self.n_animal_unknown = self.names.filter(
237			animal__isnull=False, validationresult__isnull=True).count()
238			self.n_sample_unknown = self.names.filter(
239			sample__isnull=False, validationresult__isnull=True).count()
240
241			logger.debug("Got %s animal and %s samples with unknown validation" % (
242			self.n_animal_unknown, self.n_sample_unknown))
243
244			# filter names which have errors
245			self.errors = self.names.exclude(
246			Q(validationresult__status="Pass") |
247			Q(validationresult__isnull=True)
248			)
249
250			# count animal and samples with issues
251			self.n_animal_issues = self.errors.filter(animal__isnull=False).count()
252			self.n_sample_issues = self.errors.filter(sample__isnull=False).count()
253
254			logger.debug("Got %s animal and %s samples with issues" % (
255			self.n_animal_issues, self.n_sample_issues))
256
257			# setting patterns
258			self.pattern1 = re.compile(
259			r"<([^>]*)> of field (.*) \bis \b(.*) for Record")
260
261			self.pattern2 = re.compile(
262			r"(.*) for the field (.*) \which \b(.*) for Record")
263
264			self.pattern3 = re.compile(
265			r"Provided value (.*) (does not match to the provided ontology)")
266
267			# setting report dictionary
268			self.report = {}
269
270			def process_errors(self):
271			"""Process errors and gives hints"""
272
273			# resetting report dictionary
274			self.report = {}
275
276			# TODO: track passed objects in report
277			for error in self.errors:
278			if not hasattr(error, 'validationresult'):
279			logger.debug("Ignoring %s" % (error))
280			continue
281
282			for message in error.validationresult.messages:
283			if (self.parse1(message, error.id) or
284			self.parse2(message, error.id) or
285			self.parse3(message, error.id)):
286			logger.debug("Processed message: %s" % (message))
287			else:
288			logger.error("Cannot parse: '%s'" % message)
289
290			# assign those values to report
291			key = ("unmanaged", Truncator(message).words(10))
292			self.__update_report(key, error.id)
293
294			# block error message
295
296			return self.report
297
298			def parse1(self, message, error_id):
299			match = re.search(self.pattern1, message)
300
301			if match:
302			value, field, reason = match.groups()
303			logger.debug("parse1: Got '{}','{}' and '{}'".format(
304			value, field, reason))
305
306			key = (field, reason)
307			self.__update_report(key, error_id)
308
309			return True
310
311			else:
312			return False
313
314			def parse2(self, message, error_id):
315			match = re.search(self.pattern2, message)
316
317			if match:
318			reason, field, field_type = match.groups()
319			logger.debug("parse2: Got '{}','{}' and '{}'".format(
320			reason, field, field_type))
321
322			key = (field, reason)
323			self.__update_report(key, error_id)
324
325			return True
326
327			else:
328			return False
329
330			def parse3(self, message, error_id):
331			match = re.search(self.pattern3, message)
332
333			if match:
334			value, reason = match.groups()
335			logger.debug("parse3: Got '{}' and '{}'".format(
336			value, reason))
337
338			key = (value, reason)
339			self.__update_report(key, error_id)
340
341			return True
342
343			else:
344			return False
345
346			def __update_report(self, key, error_id):
347			if key in self.report:
348			self.report[key]['count'] += 1
349			self.report[key]['ids'] += [error_id]
350
351			else:
352			self.report[key] = {'count': 1, 'ids': [error_id]}
353

cnr-ibba / IMAGE-InjectTool

Pull Request — master (#26)

validation.helpers A

Complexity

Size/Duplication

Importance

14 Methods

Duplication Side-by-Side

Filter issues like