Passed
Pull Request — master (#35)
by Paolo
02:58
created

crbanim.helpers.CRBAnimReader.__init__()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 6
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Thu Feb 21 15:37:16 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import io
10
import csv
11
import logging
12
import pycountry
13
import asyncio
14
15
from collections import defaultdict, namedtuple
16
17
from django.utils.dateparse import parse_date
18
19
from common.constants import LOADED, ERROR, MISSING, STATUSES
20
from common.helpers import image_timedelta, send_message_to_websocket
21
from image_app.models import (
22
    DictSpecie, DictSex, DictCountry, DictBreed, Name, Animal, Sample,
23
    DictUberon, Publication)
24
from language.helpers import check_species_synonyms
25
from validation.helpers import construct_validation_message
26
from validation.models import ValidationSummary
27
28
# Get an instance of a logger
29
logger = logging.getLogger(__name__)
30
31
32
# A class to deal with CRBAnim import errors
33
class CRBAnimImportError(Exception):
    """Raised when CRBAnim data can't be imported into the UID database."""
35
36
37 View Code Duplication
def send_message(submission_obj, send_validation=False):
    """
    Notify the submission status and message through django channels

    Args:
        submission_obj (image_app.models.Submission): an UID submission
            object; its ``status``, ``message`` and ``pk`` are read
        send_validation (bool): when True, the message built by
            ``construct_validation_message`` is added to the payload
    """

    # the payload always carries the readable status and the message
    message = {
        'message': STATUSES.get_value_display(submission_obj.status),
        'notification_message': submission_obj.message,
    }

    # if validation message is needed, add to the final message
    if send_validation:
        message['validation_message'] = construct_validation_message(
            submission_obj)

    # send_message_to_websocket is a coroutine: run it to completion here.
    # NOTE(review): asyncio.get_event_loop() is deprecated on recent Python
    # versions when no loop is running - confirm the supported versions
    asyncio.get_event_loop().run_until_complete(
        send_message_to_websocket(
            message,
            submission_obj.pk
        )
    )
66
67
68
class CRBAnimReader():
    """Read a CRBAnim CSV file and check its terms against UID tables

    Instance attributes (all None until :meth:`read_file` is called):
    """

    # columns that need to be in the header for is_valid() to succeed
    mandatory_columns = [
        'sex',
        'species_latin_name',
        'country_of_origin',
        'breed_name',
        'animal_ID',
        'sample_bibliographic_references',
        'sample_identifier',
        'animal_birth_date',
        'sample_storage_temperature',
        'sample_type_name',
        'body_part_name',
        'sampling_date',
        'sampling_protocol_url',
        'sample_availability',
        'EBI_Biosample_identifier',
    ]

    def __init__(self):
        # list of namedtuple records, one for each CSV row
        self.data = None
        # list of column names (first CSV row)
        self.header = None
        # csv dialect sniffed from the beginning of the file
        self.dialect = None
        # dictionary column name -> set of values found in that column
        self.items = None
        # path of the file read
        self.filename = None

    @classmethod
    def get_dialect(cls, chunk):
        """Determine dialect of a CSV from a chunk"""

        return csv.Sniffer().sniff(chunk)

    @classmethod
    def is_valid(cls, chunk):
        """Try to determine if CRBanim has at least the required columns
        or not

        Args:
            chunk (str): the beginning of a CSV file, header included

        Returns:
            tuple: (True, []) if all mandatory columns are in the header,
            (False, list of missing columns) otherwise
        """

        dialect = cls.get_dialect(chunk)

        # get a handle from a string
        handle = io.StringIO(chunk)

        # only the header is needed
        reader = csv.reader(handle, dialect)
        header = next(reader)

        not_found = [
            column for column in cls.mandatory_columns
            if column not in header]

        if not not_found:
            logger.debug("This seems to be a valid CRBanim file")
            return True, []

        logger.error("Couldn't find mandatory CRBanim columns %s" % (
            not_found))
        return False, not_found

    def read_file(self, filename):
        """Read a CRBAnim file and store its content in instance attributes

        Empty and ``\\N`` values are stored as None, and an 'unknown' sex
        is replaced with 'record of unknown sex'.

        Args:
            filename (str): the path of the CSV file to read
        """

        with open(filename, newline='') as handle:
            # initialize data
            self.filename = filename
            self.data = []

            # get dialect
            chunk = handle.read(2048)
            self.dialect = self.get_dialect(chunk)

            # restart file from the beginning
            handle.seek(0)

            # read csv file
            reader = csv.reader(handle, self.dialect)
            self.header = next(reader)

            # find sex index column
            sex_idx = self.header.index('sex')

            # create a namedtuple object
            Data = namedtuple("Data", self.header)

            # add records to data
            for record in reader:
                # csv.reader returns an empty list for a blank line
                if not record:
                    continue

                # replace all "\\N" and empty values with None
                record = [None if col in ["\\N", ""]
                          else col for col in record]

                # 'unknown' sex should be replaced with 'record of unknown
                # sex'. A missing sex is None now and is left as it is
                sex = record[sex_idx]

                if sex is not None and sex.lower() == 'unknown':
                    logger.debug(
                        "Changing '%s' with '%s'" % (
                            sex, 'record of unknown sex'))
                    record[sex_idx] = 'record of unknown sex'

                self.data.append(Data._make(record))

        self.items = self.eval_columns()

    def eval_columns(self):
        """Define a set of values for each column

        Returns:
            collections.defaultdict: column name -> set of the values found
            in that column (None included, when values are missing)
        """

        items = defaultdict(list)

        # columns follow header order, so position identifies the column
        for idx, column in enumerate(self.header):
            items[column] = {line[idx] for line in self.data}

        return items

    def print_line(self, num):
        """Log (debug level) a record with its column names"""

        for column, value in zip(self.header, self.data[num]):
            logger.debug("%s: %s" % (column, value))

    def filter_by_column_values(self, column, values, ignorecase=False):
        """Yield the records having one of the values in a column

        Args:
            column (str): the name of the column to inspect
            values (list): the accepted values
            ignorecase (bool): compare values without considering case

        Yields:
            Data: the records matching values. Discarded records are
            logged at debug level
        """

        if ignorecase is True:
            # lower values
            values = [value.lower() for value in values]

        for line in self.data:
            candidate = getattr(line, column)

            # missing values are None and can't be lowered
            if ignorecase is True and candidate is not None:
                candidate = candidate.lower()

            if candidate in values:
                yield line

            else:
                logger.debug("Filtering: %s" % (str(line)))

    def __check_items(self, item_set, model, column):
        """General check of CRBanim items into database

        Args:
            item_set (iterable): the terms to search
            model: a model class queried with ``objects.filter(label=...)``
            column (str): the column name, used only in log messages

        Returns:
            tuple: (True, []) if every term was found, otherwise
            (False, list of the terms not found)
        """

        not_found = [
            item for item in item_set
            if not model.objects.filter(label=item).exists()]

        if not_found:
            logger.warning(
                "Those %s are not present in UID database:" % (column))
            logger.warning(not_found)

        return not not_found, not_found

    # a function to detect if crbanim species are in UID database or not
    def check_species(self, country):
        """Check if all species are defined in UID DictSpecies

        Args:
            country: passed as it is to ``check_species_synonyms``

        Returns:
            tuple: (True, []) if all species are found as labels or as
            synonyms, otherwise (False, list of the species not found)
        """

        # CRBAnim usually have species in the form required for UID
        # However sometimes there could be a common name, not a DictSpecie one
        column = 'species_latin_name'

        check, not_found = self.__check_items(
            self.items[column], DictSpecie, column)

        # nothing more to do (or to complain about) if everything was found
        if check:
            return check, not_found

        # try to check in dictionary table
        logger.info("Searching for %s in dictionary tables" % (not_found))

        # if this function return True, I found all synonyms
        if check_species_synonyms(not_found, country) is True:
            logger.info("Found %s in dictionary tables" % not_found)

            # return True and an empty list for check and not found items
            return True, []

        # if I arrive here, there are species that I couldn't find
        logger.error("Couldnt' find those species in dictionary tables:")
        logger.error(not_found)

        return check, not_found

    # check that dict sex table contains data
    def check_sex(self):
        """Check that dict sex table contains the sex terms of this file

        Returns:
            tuple: (True, []) if all terms are found, otherwise
            (False, list of the terms not found)
        """

        # terms are compared in lowercase. A missing sex is None: keep it,
        # in order to have it reported among the not found terms
        column = 'sex'
        item_set = [
            item.lower() if item is not None else None
            for item in self.items[column]]

        return self.__check_items(item_set, DictSex, column)
280
281
282
def fill_uid_breed(record, language):
    """Fill DictBreed from a crbanim record

    Args:
        record: a CRBAnim record; ``species_latin_name``,
            ``country_of_origin`` and ``breed_name`` are read
        language: passed to ``DictSpecie.get_by_synonym`` when the species
            is not found by label

    Returns:
        DictBreed: the breed found or created

    Raises:
        CRBAnimImportError: if ``country_of_origin`` is not a country code
            known to pycountry
    """

    # get a DictSpecie object. Species are in latin names, but I can
    # find also a common name in translation tables
    try:
        specie = DictSpecie.objects.get(label=record.species_latin_name)

    except DictSpecie.DoesNotExist:
        logger.info("Search %s in synonyms" % (record.species_latin_name))
        # search for language synonym (if I arrived here a synonym should
        # exists)
        specie = DictSpecie.get_by_synonym(
            synonym=record.species_latin_name,
            language=language)

    # get country name using pycountry. An unknown code gives None, so fail
    # with a clear message instead of an AttributeError on '.name'
    pyc_country = pycountry.countries.get(
        alpha_2=record.country_of_origin)

    if pyc_country is None:
        raise CRBAnimImportError(
            "Unknown country code '%s' for breed '%s'" % (
                record.country_of_origin, record.breed_name))

    country_name = pyc_country.name

    # get country for breeds. Ideally will be the same of submission,
    # however, it could be possible to store data from other countries
    country, created = DictCountry.objects.get_or_create(
        label=country_name)

    # I could create a country from a v_breed_specie instance. That's
    # ok, maybe I could have a lot of breed from different countries and
    # a few organizations submitting them
    if created:
        logger.info("Created %s" % country)

    else:
        logger.debug("Found %s" % country)

    breed, created = DictBreed.objects.get_or_create(
        supplied_breed=record.breed_name,
        specie=specie,
        country=country)

    if created:
        logger.info("Created %s" % breed)

    else:
        logger.debug("Found %s" % breed)

    # return a DictBreed object
    return breed
329
330
331
def fill_uid_names(record, submission):
    """Fill Name table from a crbanim record

    Args:
        record: a CRBAnim record; ``animal_ID``, ``sample_identifier`` and
            ``sample_bibliographic_references`` are read
        submission: the submission the names belong to; its ``owner`` is
            set as names owner

    Returns:
        tuple: the (animal_name, sample_name) Name instances
    """

    # in the same record I have the sample identifier and animal identifier
    # a name record for animal
    animal_name, created = Name.objects.get_or_create(
        name=record.animal_ID,
        submission=submission,
        owner=submission.owner)

    if created:
        logger.debug("Created animal name %s" % animal_name)

    else:
        logger.debug("Found animal name %s" % animal_name)

    # get a publication (if present)
    publication = None

    # HINT: mind this misspelling
    if record.sample_bibliographic_references:
        # the reference is stored as the publication doi
        publication, created = Publication.objects.get_or_create(
            doi=record.sample_bibliographic_references)

        if created:
            logger.debug("Created publication %s" % publication)

    # name record for sample
    sample_name, created = Name.objects.get_or_create(
        name=record.sample_identifier,
        submission=submission,
        owner=submission.owner,
        publication=publication)

    if created:
        logger.debug("Created sample name %s" % sample_name)

    else:
        logger.debug("Found sample name %s" % sample_name)

    # returning 2 Name instances
    return animal_name, sample_name
373
374
375
def fill_uid_animal(record, animal_name, breed, submission, animals):
    """Helper function to fill animal data in UID animal table

    Args:
        record: a CRBAnim record; ``sex`` and ``animal_birth_date`` are read
        animal_name: the Name instance of this animal
        breed: the DictBreed instance of this animal
        submission: the submission; its ``owner`` is set as animal owner
        animals (dict): animal name -> Animal already processed during this
            import. It is updated in place with the new animals

    Returns:
        Animal: the animal created, updated or already tracked in animals
    """

    # HINT: does CRBAnim models mother and father?

    # check if such animal has already been updated
    if animal_name.name in animals:
        logger.debug(
            "Ignoring %s: already created or updated" % (animal_name))

        # return an animal object
        animal = animals[animal_name.name]

    else:
        # determine sex. Check for values
        sex = DictSex.objects.get(label__iexact=record.sex)

        # there's no birth_location for animal in CRBAnim
        accuracy = MISSING

        # create a new object. Using defaults to avoid collisions when
        # updating data
        # HINT: CRBanim has less attribute than cryoweb
        defaults = {
            # HINT: is a duplication of name. Can this be non-mandatory?
            'alternative_id': animal_name.name,
            'breed': breed,
            'sex': sex,
            'birth_date': record.animal_birth_date,
            'birth_location_accuracy': accuracy,
            'owner': submission.owner
        }

        # HINT: I could have the same animal again and again. Should I update
        # every times?
        animal, created = Animal.objects.update_or_create(
            name=animal_name,
            defaults=defaults)

        if created:
            logger.debug("Created animal %s" % animal)

        else:
            logger.debug("Updating animal %s" % animal)

        # track this animal in dictionary
        animals[animal_name.name] = animal

    # I need to track animal to relate the sample
    return animal
425
426
427
def find_storage_type(record):
    """Determine a sample storage relying on a dictionary

    Args:
        record: an object with a ``sample_storage_temperature`` attribute

    Returns:
        str: the storage type for the temperature, or None (after logging
        a warning) if the temperature is not in the mapping
    """

    # NOTE(review): '-30°C' maps to the -20 degrees freezer term; presumably
    # because there is no dedicated term - confirm
    mapping = {
        '-196°C': 'frozen, liquid nitrogen',
        '-20°C': 'frozen, -20 degrees Celsius freezer',
        '-30°C': 'frozen, -20 degrees Celsius freezer',
        '-80°C': 'frozen, -80 degrees Celsius freezer'}

    storage = mapping.get(record.sample_storage_temperature)

    if storage is None:
        # use the module logger, like the rest of this module
        logger.warning("Couldn't find %s in storage types mapping" % (
            record.sample_storage_temperature))

    return storage
444
445
446
def fill_uid_sample(record, sample_name, animal, submission):
    """Helper function to fill sample data in UID sample table

    Args:
        record: a CRBAnim record
        sample_name: the Name instance of this sample
        animal: the Animal instance this sample comes from
        submission: the submission; its ``owner`` is set as sample owner

    Returns:
        Sample: the sample created or updated

    Raises:
        CRBAnimImportError: if the record has neither a usable body part
            nor a sample type
    """

    # missing values are None in records: they can't be lowered
    sample_type_name = None
    body_part_name = None

    if record.sample_type_name is not None:
        sample_type_name = record.sample_type_name.lower()

    if record.body_part_name is not None:
        body_part_name = record.body_part_name.lower()

    # sylvain has proposed to apply the following decision rule: use the
    # body part unless it tells nothing (a missing body part is handled
    # like an unknown one), otherwise use the sample type
    if body_part_name not in (None, "unknown", "not relevant"):
        organism_part_label = body_part_name

    else:
        organism_part_label = sample_type_name

    if organism_part_label is None:
        raise CRBAnimImportError(
            "No body part or sample type defined for sample '%s'" % (
                sample_name.name))

    # get a organism part. Organism parts need to be in lowercases
    organism_part, created = DictUberon.objects.get_or_create(
        label=organism_part_label
    )

    if created:
        logger.info("Created uberon %s" % organism_part)

    else:
        logger.debug("Found uberon %s" % organism_part)

    # calculate animal age at collection
    # NOTE(review): dates could be None in records too; how parse_date and
    # image_timedelta deal with None can't be told from here - confirm
    animal_birth_date = parse_date(record.animal_birth_date)
    sampling_date = parse_date(record.sampling_date)
    animal_age_at_collection, time_units = image_timedelta(
        sampling_date, animal_birth_date)

    # create a new object. Using defaults to avoid collisions when
    # updating data
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': sample_name.name,
        'collection_date': record.sampling_date,
        'protocol': record.sampling_protocol_url,
        'organism_part': organism_part,
        'animal': animal,
        # 'description': v_vessel.comment,
        'owner': submission.owner,
        'storage': find_storage_type(record),
        'availability': record.sample_availability,
        'animal_age_at_collection': animal_age_at_collection,
        'animal_age_at_collection_units': time_units
    }

    sample, created = Sample.objects.update_or_create(
        name=sample_name,
        defaults=defaults)

    if created:
        logger.debug("Created sample %s" % sample)

    else:
        logger.debug("Updating sample %s" % sample)

    return sample
506
507
508
def process_record(record, submission, animals, language):
    """Load a single CRBAnim record (breed, names, animal, sample) in UID

    Records having an ``EBI_Biosample_identifier`` are ignored.

    Args:
        record: a CRBAnim record
        submission: the submission the record belongs to
        animals (dict): the animals already processed, passed to
            ``fill_uid_animal``
        language: passed to ``fill_uid_breed``
    """

    # Peter mail 26/02/19 18:30: I agree that it sounds like we will
    # need to create sameAs BioSamples for the IMAGE project, and it makes
    # sense that the inject tool is able to do this.  It may be that we
    # tackle these cases after getting the main part of the inject tool
    # functioning and hold or ignore these existing BioSamples for now.
    # HINT: record with a biosample id should be ignored, for the moment
    if record.EBI_Biosample_identifier is not None:
        logger.warning("Ignoring %s: already in biosample!" % str(record))
        return

    # filling breeds
    breed = fill_uid_breed(record, language)

    # filling name tables
    animal_name, sample_name = fill_uid_names(record, submission)

    # fill animal
    animal = fill_uid_animal(record, animal_name, breed, submission, animals)

    # fill sample
    fill_uid_sample(record, sample_name, animal, submission)
530
531
532
def upload_crbanim(submission):
    """Import the CRBAnim file uploaded with a submission into UID

    The submission status and message are updated (LOADED on success,
    ERROR on failure) and a message is sent with ``send_message``.

    Args:
        submission: the submission whose uploaded file has to be imported

    Returns:
        bool: True if data were imported, False if any error occurred
    """

    # debug
    logger.info("Importing from CRB-Anim file")

    # this is the full path in docker container
    fullpath = submission.get_uploaded_file_path()

    # start data loading
    try:
        # read submission data. Reading is done here, so that a file which
        # can't be read marks the submission with ERROR like any other
        # import issue
        reader = CRBAnimReader()
        reader.read_file(fullpath)

        # check for species and sex in a similar way as cryoweb does
        check, not_found = reader.check_sex()

        if not check:
            message = (
                "Not all Sex terms are loaded into database: "
                "check for %s in your dataset" % (not_found))

            raise CRBAnimImportError(message)

        check, not_found = reader.check_species(submission.gene_bank_country)

        if not check:
            raise CRBAnimImportError(
                "Some species are not loaded in UID database: "
                "%s" % (not_found))

        # ok get languages from submission (useful for translation)
        # HINT: no translations implemented, at the moment
        language = submission.gene_bank_country.label

        # a dictionary in which store animal data
        animals = {}

        for record in reader.data:
            process_record(record, submission, animals, language)

        # after processing records, initialize validationsummary objects:
        # get (or create) one for each type and reset its counts
        for summary_type in ("animal", "sample"):
            summary, created = ValidationSummary.objects.get_or_create(
                submission=submission, type=summary_type)

            if created:
                logger.debug(
                    "ValidationSummary %s created for "
                    "submission %s" % (summary_type, submission))

            # reset counts
            summary.reset_all_count()

    except Exception as exc:
        # set message:
        message = "Error in importing data: %s" % (str(exc))

        # save a message in database
        submission.status = ERROR
        submission.message = message
        submission.save()

        # send async message
        send_message(submission)

        # debug
        logger.error("error in importing from crbanim: %s" % (exc))
        logger.exception(exc)

        return False

    else:
        message = "CRBAnim import completed for submission: %s" % (
            submission.id)

        submission.message = message
        submission.status = LOADED
        submission.save()

        # send async message
        send_message(submission, send_validation=True)

    logger.info("Import from CRBAnim is complete")

    return True
628