Issues (70)

django-data/image/validation/tasks.py (1 issue)

Severity
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Fri Oct  5 11:22:33 2018
5
6
@author: Paolo Cozzi <[email protected]>
7
8
Useful stuff to deal with the validation process
9
10
"""
11
12
import json
13
import traceback
14
15
from collections import Counter, defaultdict
16
from celery.utils.log import get_task_logger
17
18
from common.constants import (
19
    READY, ERROR, LOADED, NEED_REVISION, COMPLETED, SUBMITTED, STATUSES,
20
    KNOWN_STATUSES)
21
from common.helpers import send_mail_to_admins
22
from common.tasks import BaseTask, NotifyAdminTaskMixin
23
from image.celery import app as celery_app
24
from uid.models import Sample, Animal
25
from submissions.tasks import SubmissionTaskMixin
26
from validation.models import ValidationSummary
27
28
from .models import ValidationResult as ValidationResultModel
29
from .helpers import MetaDataValidation, OntologyCacheError, RulesetError
30
31
# module level logger, obtained through celery helper
logger = get_task_logger(__name__)

# translate a status value into its name (ie {0: 'Waiting'})
key2status = {
    value: label for value, label in (item.value for item in STATUSES)}
36
37
38
# A class to deal with validation errors
39
class ValidationError(Exception):
    """Exception raised when the validation process itself fails"""
41
42
43
class ValidateSubmission(object):
    """
    Helper class for the validation task: it keeps the submission, the
    ruleset and the results collected record by record, so that they can
    be shared between the validation steps.

    Attributes:
        submission_obj: the submission under validation
        ruleset: the object used to check records (``check_usi_structure``
            and ``validate`` are called on it)
        animals_messages, samples_messages (defaultdict): comparable
            message -> list of primary keys of the records raising it
        animals_offending_columns, samples_offending_columns (dict):
            comparable message -> offending field name
        animals_statuses, samples_statuses (Counter): how many records
            were tracked for each validation status
    """

    def __init__(self, submission_obj, ruleset):
        # track submission object
        self.submission_obj = submission_obj

        # track ruleset
        self.ruleset = ruleset

        # collect all unique messages for samples and animals
        self.animals_messages = defaultdict(list)
        self.samples_messages = defaultdict(list)

        self.animals_offending_columns = dict()
        self.samples_offending_columns = dict()

        # track global statuses for animals and samples
        # Don't set keys: reading a missing key from a Counter returns 0
        # instead of raising KeyError
        self.animals_statuses = Counter()
        self.samples_statuses = Counter()

    def check_valid_statuses(self):
        """Check that validation returned only supported statuses

        Returns:
            bool: False (after logging the offending key) as soon as a
            status which is not in KNOWN_STATUSES is found in animals or
            samples counters, True otherwise
        """

        # animals are inspected before samples
        for statuses in (self.animals_statuses, self.samples_statuses):
            for key in statuses:
                if key not in KNOWN_STATUSES:
                    logger.error(
                        "Unsupported status '%s' from validation", key)
                    return False

        # if I arrive here, all validation statuses are handled
        return True

    def __has_key_in_rules(self, key):
        """Return True if at least one animal or sample was tracked with
        the ``key`` status"""

        return (
            self.animals_statuses[key] > 0 or
            self.samples_statuses[key] > 0)

    def has_errors_in_rules(self):
        """Return True if there is any errors in validation rules"""

        return self.__has_key_in_rules('Error')

    def has_warnings_in_rules(self):
        """Return True if there is any warnings in validation rules"""

        return self.__has_key_in_rules('Warning')

    def validate_model(self, model):
        """Validate a single record and track its result

        The record is converted with ``to_biosample()``, checked with
        ``ruleset.check_usi_structure`` and, only if that check passes,
        with ``ruleset.validate``. The result is then handled by
        :py:meth:`update_statuses`.

        Args:
            model (Sample/Animal): the record to validate

        Raises:
            TypeError: if model is neither a Sample nor an Animal
        """

        logger.debug("Validating %s", model)

        # choose the counter to update. Unsupported objects are refused
        # immediately: without the else branch model_statuses would be
        # unbound and the method would fail later with UnboundLocalError
        if isinstance(model, Sample):
            model_statuses = self.samples_statuses

        elif isinstance(model, Animal):
            model_statuses = self.animals_statuses

        else:
            raise TypeError(
                "Can't validate %r: a Sample or Animal instance is "
                "required" % (model,))

        # get data in biosample format
        data = model.to_biosample()

        # TODO: remove this when IMAGE-metadata rules will support
        # IMAGE submission id
        del data['attributes']['IMAGE submission id']

        # input is a list object
        usi_result = self.ruleset.check_usi_structure([data])

        # if I have errors here, JSON isn't valid: this is not an error
        # on user's data but on InjectTool itself
        if usi_result.get_overall_status() != 'Pass':
            # update counters, mark model and stop: it makes no sense to
            # continue validation since JSON is wrong
            self.update_statuses(model_statuses, model, usi_result)
            return

        # no check_duplicates: it checks against alias (that is a pk)
        # HINT: improve check_duplicates or implement database constraints

        # check against image metadata
        ruleset_result = self.ruleset.validate(data)

        # update status and track data in the overall counters
        self.update_statuses(model_statuses, model, ruleset_result)

    # inspired from validation.deal_with_validation_results
    def update_statuses(self, model_statuses, model, result):
        """
        Update validation summary counter and then mark model with an
        appropriate status (READY for Pass and Warning, NEED_REVISION for
        the remaining statuses)

        Args:
            model_statuses (Counter): a counter object for animal or sample
                validation statuses
            model (Sample/Animal): a Sample or Animal object
            result (ValidationResultRecord): a validation result for a
                record
        """

        # get overall status (ie Pass, Error)
        overall = result.get_overall_status()

        # set model as valid even if has some warnings
        if overall in ["Pass", "Warning"]:
            self.mark_model(model, result, READY)

        else:
            # every status different from Pass/Warning counts as an issue
            model_statuses['Issues'] += 1
            self.mark_model(model, result, NEED_REVISION)

        # count the record by its overall status and as a validated one
        model_statuses[overall] += 1
        model_statuses['Known'] += 1

    def mark_model(self, model, result, status):
        """Track validation messages, store the validation result and set
        the status of model

        Args:
            model (Sample/Animal): the validated record
            result (ValidationResultRecord): a validation result for a
                record
            status (int): the status to set on model
        """

        messages = result.get_messages()

        # get comparable messages for batch update
        comparable_messages = [
            {
                'message': result_set.get_comparable_str(),
                'offending_column': result_set.get_field_name()
            }
            for result_set in result.result_set
        ]

        overall_status = result.get_overall_status()

        # choose where messages need to be tracked for validation summary
        if isinstance(model, Sample):
            tracked_messages = self.samples_messages
            offending_columns = self.samples_offending_columns

        elif isinstance(model, Animal):
            tracked_messages = self.animals_messages
            offending_columns = self.animals_offending_columns

        else:
            # nothing is tracked for any other object
            tracked_messages = offending_columns = None

        # an empty defaultdict is falsy: test explicitly against None
        if tracked_messages is not None:
            for item in comparable_messages:
                tracked_messages[item['message']].append(model.pk)
                offending_columns[item['message']] = \
                    item['offending_column']

        # get a validation result model or create a new one
        if model.validationresult:
            validationresult = model.validationresult

        else:
            validationresult = ValidationResultModel()
            model.validationresult = validationresult

        # setting validation tool results and save
        validationresult.messages = messages
        validationresult.status = overall_status
        validationresult.save()

        # ok, don't update statuses for submitted objects which
        # already are in biosamples and pass validation
        if model.status in [COMPLETED, SUBMITTED] and status == READY:
            logger.debug(
                "Ignoring %s: status was '%s' and validation is OK",
                model, key2status[model.status])

        else:
            logger.debug(
                "Marking %s with '%s' status (%s)",
                model, key2status[status], messages)

            # update model status and save
            model.status = status
            model.save()

    def create_validation_summary(self):
        """
        This function will create (or update) the ValidationSummary
        objects, one for animals and one for samples, that will be used
        on validation_summary view
        """

        # data collected for each model type, animals first
        collected = (
            ('animal', self.animals_messages, self.animals_statuses,
             self.animals_offending_columns),
            ('sample', self.samples_messages, self.samples_statuses,
             self.samples_offending_columns),
        )

        for model_type, messages, model_statuses, offending_column in \
                collected:
            summary_obj, created = ValidationSummary.objects.get_or_create(
                submission=self.submission_obj, type=model_type)

            if created:
                logger.debug(
                    "Created %s validationSummary for %s",
                    model_type, self.submission_obj)

            # reset all_count
            summary_obj.reset_all_count()

            summary_obj.submission = self.submission_obj

            # they are counter object, so no KeyError and returns 0
            summary_obj.pass_count = model_statuses['Pass']
            summary_obj.warning_count = model_statuses['Warning']
            summary_obj.error_count = model_statuses['Error']
            summary_obj.issues_count = model_statuses['Issues']
            summary_obj.validation_known_count = model_statuses['Known']

            # one entry for each unique message
            summary_obj.messages = [
                {
                    'message': message,
                    'count': len(ids),
                    'ids': ids,
                    'offending_column': offending_column[message]
                }
                for message, ids in messages.items()
            ]

            summary_obj.type = model_type
            summary_obj.save()

        logger.debug(
            "Results for submission %s: animals - %s, samples - %s",
            self.submission_obj,
            dict(self.animals_statuses),
            dict(self.samples_statuses))
290
291
292
class ValidateTask(SubmissionTaskMixin, NotifyAdminTaskMixin, BaseTask):
    """Task which validates animals and samples of a submission against
    IMAGE rules"""

    name = "Validate Submission"
    description = """Validate submission data against IMAGE rules"""
    action = "validation"

    # http://docs.celeryproject.org/en/latest/userguide/tasks.html#instantiation
    # Celery doesn't create a task instance for every request: the task is
    # registered once as a global instance, so __init__ is called once per
    # process and attributes set on self (like self.ruleset in run) are
    # kept between requests served by the same process

    # override SubmissionTaskMixin update_submission_status
    def update_submission_status(
            self, submission_obj, status, message, construct_message=True):
        """Set status on submission and send message, delegating to the
        parent implementation

        Args:
            submission_obj (uid.models.Submission): an UID submission
                object
            status (int): a :py:class:`common.constants.STATUSES` value
            message (str): the message to send
            construct_message (bool): construct validation message or not
                (True by default in this override)
        """

        super().update_submission_status(
            submission_obj, status, message, construct_message)

    def __generic_error_report(
            self, submission_obj, status, message, notify_admins=False):
        """
        Update submission status and send an email with the current
        traceback to the submission owner (and to admins, if required).
        Intended to be called while an exception is being handled

        Args:
            submission_obj (uid.models.Submission): an UID submission
                object
            status (int): a :py:class:`common.constants.STATUSES` value
            message (str): a text object
            notify_admins (bool): send mail to the admins or not
        """

        # mark submission with its status
        self.update_submission_status(submission_obj, status, message)

        # text of the exception currently handled
        stacktrace = traceback.format_exc()

        # compose the mail for the user, stacktrace included
        subject = "Error in IMAGE Validation: %s" % (message)
        body = (
            "Something goes wrong with validation. Please report "
            "this to InjectTool team\n\n %s" % str(stacktrace))

        self.mail_to_owner(submission_obj, subject, body)

        # common.helpers function, used only when requested
        if notify_admins:
            send_mail_to_admins(subject, body)

    # TODO: define a method to inform user for error in validation (Task run
    # with success but errors in data)

    def temporary_error_report(self, exc, submission_obj):
        """
        Manage known temporary issues of the validation task: log the
        error, set submission status to LOADED and notify the owner by
        email

        Args:
            exc (Exception): an py:exc`Exception` object
            submission_obj (uid.models.Submission): an UID submission
                object

        Return
            str: "success" since this task is correctly managed
        """

        logger.error("Error in validation: %s" % exc)

        message = "Errors in EBI API endpoints. Please try again later"
        logger.error(message)

        # update submission and send email
        self.__generic_error_report(submission_obj, LOADED, message)

        return "success"

    def ruleset_error_report(self, exc, submission_obj):
        """
        Manage ruleset issues of the validation task: log the error, set
        submission status to ERROR and notify by email both the owner and
        the admins

        Args:
            exc (Exception): an py:exc`Exception` object
            submission_obj (uid.models.Submission): an UID submission
                object

        Return
            str: "success" since this task is correctly managed
        """

        logger.error("Error ruleset: %s" % exc)

        message = (
            "Error in IMAGE-metadata ruleset. Please inform InjectTool team")
        logger.error(message)

        # update submission and send email, admins included
        self.__generic_error_report(
            submission_obj, ERROR, message, notify_admins=True)

        return "success"

    def run(self, submission_id):
        """Validate all animals and samples of a submission

        Args:
            submission_id: the value passed to ``get_uid_submission`` to
                retrieve the submission

        Return
            str: "success" when validation is completed or when a known
            error is managed through the error report methods

        Raises:
            ValidationError: if validation returns a status which is not
                supported
        """

        logger.info("Validate Submission started")

        # get submission object
        submission_obj = self.get_uid_submission(submission_id)

        # rules are read when the task starts: this could fail
        try:
            self.ruleset = MetaDataValidation()

        except OntologyCacheError as exc:
            return self.temporary_error_report(exc, submission_obj)

        except RulesetError as exc:
            return self.ruleset_error_report(exc, submission_obj)

        # helper object which collects validation results
        validator = ValidateSubmission(submission_obj, self.ruleset)

        try:
            # animals are validated before samples, both ordered by id
            for model_class in (Animal, Sample):
                records = model_class.objects.filter(
                    submission=submission_obj).order_by('id')

                for record in records:
                    validator.validate_model(record)

        # TODO: errors in validation should raise custom exception
        except json.decoder.JSONDecodeError as exc:
            return self.temporary_error_report(exc, submission_obj)

        except Exception as exc:
            raise self.retry(exc=exc)

        # if statuses returned by IMAGE-ValidationTool change, results
        # can't be interpreted: mark submission and throw an exception
        if not validator.check_valid_statuses():
            message = (
                "Unsupported validation status for submission %s" % (
                    submission_obj))

            # debug: print error in log
            logger.error(message)

            validator.create_validation_summary()

            # ERROR status: this is not related to user data
            self.submission_fail(submission_obj, message, status=ERROR)

            # raise an exception since is an InjectTool issue
            raise ValidationError(message)

        # all statuses are supported: the summary is needed in every case
        validator.create_validation_summary()

        if validator.has_errors_in_rules():
            message = "Error in metadata. Need revisions before submit"

            # mark submission with NEED_REVISION
            self.submission_fail(submission_obj, message)

            logger.warning(
                "Error in metadata for submission %s" % (submission_obj))

        elif validator.has_warnings_in_rules():
            message = "Submission validated with some warnings"

            # warnings don't prevent the READY status
            self.submission_ready(submission_obj, message)

            logger.info(
                "Submission %s validated with some warning" % (submission_obj))

        else:
            message = "Submission validated with success"

            self.submission_ready(submission_obj, message)

            logger.info(
                "Submission %s validated with success" % (submission_obj))

        logger.info("Validate Submission completed")

        return "success"

    def submission_fail(self, submission_obj, message, status=NEED_REVISION):
        """Mark a submission with status (NEED_REVISION by default),
        prefixing message with "Validation got errors: " """

        failure_message = "Validation got errors: %s" % (message)
        self.update_submission_status(
            submission_obj, status, failure_message)

    def submission_ready(self, submission_obj, message):
        """Mark a submission with READY status"""

        self.update_submission_status(submission_obj, READY, message)
528
529
530
# register explicitly tasks
531
# https://github.com/celery/celery/issues/3744#issuecomment-271366923
532
# NOTE(review): presumably required because class based tasks are not added
# to the task registry automatically by this Celery version - confirm
celery_app.tasks.register(ValidateTask)
533