Passed
Pull Request — master (#40)
by Paolo
01:28
created

crbanim.helpers.fill_uid_sample()   B

Complexity

Conditions 5

Size

Total Lines 60
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 34
dl 0
loc 60
rs 8.5973
c 0
b 0
f 0
cc 5
nop 4

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

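Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are present, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.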

1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Thu Feb 21 15:37:16 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import io
10
import csv
11
import logging
12
import pycountry
13
14
from collections import defaultdict, namedtuple
15
16
from django.utils.dateparse import parse_date
17
18
from common.constants import LOADED, ERROR, MISSING, SAMPLE_STORAGE
19
from common.helpers import image_timedelta
20
from image_app.helpers import FileDataSourceMixin
21
from image_app.models import (
22
    DictSpecie, DictSex, DictCountry, DictBreed, Name, Animal, Sample,
23
    DictUberon, Publication)
24
from submissions.helpers import send_message
25
from validation.helpers import construct_validation_message
26
from validation.models import ValidationSummary
27
28
# Get an instance of a logger
29
logger = logging.getLogger(__name__)
30
31
32
# A class to deal with CRBAnim import errors
33
class CRBAnimImportError(Exception):
    """Error raised when CRBAnim data can't be imported.

    It is raised by ``upload_crbanim`` when the sex or species checks on
    the uploaded file fail.
    """
35
36
37
class CRBAnimReader(FileDataSourceMixin):
    """Read a CRBAnim CSV file and check its content against UID tables.

    After :meth:`read_file` the instance exposes:

    * ``filename``: the path that was read
    * ``dialect``: the CSV dialect sniffed from the first 2048 characters
    * ``header``: the list of column names (first row of the file)
    * ``data``: a list of namedtuples, one per row, with fields named
      after ``header``
    * ``items``: for each column, the set of distinct values found

    ``check_species`` and ``check_items`` are provided by
    ``FileDataSourceMixin`` (defined outside this module).
    """

    # columns which need to be in a file to consider it a CRBAnim file
    mandatory_columns = [
        'sex',
        'species_latin_name',
        'country_of_origin',
        'breed_name',
        'animal_ID',
        'sample_bibliographic_references',
        'sample_identifier',
        'animal_birth_date',
        'sample_storage_temperature',
        'sample_type_name',
        'body_part_name',
        'sampling_date',
        'sampling_protocol_url',
        'sample_availability',
        'EBI_Biosample_identifier',
    ]

    def __init__(self):
        # all attributes are filled by read_file()
        self.data = None
        self.header = None
        self.dialect = None
        self.items = None
        self.filename = None

    @classmethod
    def get_dialect(cls, chunk):
        """Determine dialect of a CSV from a chunk"""

        return csv.Sniffer().sniff(chunk)

    @classmethod
    def is_valid(cls, chunk):
        """Try to determine if CRBanim has at least the required columns
        or not

        Args:
            chunk (str): the first part of a file, header line included

        Returns:
            tuple: ``(True, [])`` if every mandatory column is in the
            header, ``(False, not_found)`` otherwise, where ``not_found``
            is the list of the missing column names
        """

        dialect = cls.get_dialect(chunk)

        # get a handle from a string
        handle = io.StringIO(chunk)

        # read chunk: the header is the first row
        reader = csv.reader(handle, dialect)
        header = next(reader)

        not_found = [
            column for column in cls.mandatory_columns
            if column not in header]

        if not not_found:
            logger.debug("This seems to be a valid CRBanim file")
            return True, []

        logger.error(
            "Couldn't find mandatory CRBanim columns %s", not_found)
        return False, not_found

    def read_file(self, filename):
        """Read a CRBAnim file and set its content to instance attributes

        Cells which are empty or equal to ``\\N`` are stored as ``None``.
        A sex equal to 'unknown' (case insensitive) is replaced with
        'record of unknown sex'. Blank rows are skipped.

        Args:
            filename (str): the path of the file to read
        """

        with open(filename, newline='') as handle:
            # initialize data
            self.filename = filename
            self.data = []

            # get dialect
            chunk = handle.read(2048)
            self.dialect = self.get_dialect(chunk)

            # restart file from the beginning
            handle.seek(0)

            # read csv file
            reader = csv.reader(handle, self.dialect)
            self.header = next(reader)

            # find sex index column
            sex_idx = self.header.index('sex')

            # create a namedtuple object
            Data = namedtuple("Data", self.header)

            # add records to data
            for record in reader:
                # csv.reader returns an empty list for a blank line
                if not record:
                    continue

                # replace all "\\N" and empty values with None
                record = [None if col in ("\\N", "") else col
                          for col in record]

                # 'unknown' sex should be replaced with 'record of unknown
                # sex'. sex could be None after the replacement done above
                sex = record[sex_idx]

                if sex is not None and sex.lower() == 'unknown':
                    logger.debug(
                        "Changing '%s' with '%s'",
                        sex, 'record of unknown sex')
                    record[sex_idx] = 'record of unknown sex'

                self.data.append(Data._make(record))

        self.items = self.eval_columns()

    def eval_columns(self):
        """Define a set of distinct values for each column

        Returns:
            collections.defaultdict: column name -> set of the values
            found in ``self.data`` for that column
        """

        items = defaultdict(list)

        # enumerate gives the column position without searching the header
        # for every cell
        for idx, column in enumerate(self.header):
            items[column] = {line[idx] for line in self.data}

        return items

    def print_line(self, num):
        """Log (debug level) a record with its column names

        Args:
            num (int): the index of the record in ``self.data``
        """

        for column, value in zip(self.header, self.data[num]):
            logger.debug("%s: %s", column, value)

    def filter_by_column_values(self, column, values, ignorecase=False):
        """Yield the records whose ``column`` value is in ``values``

        Records which don't match are logged at debug level.

        Args:
            column (str): the name of the column to test
            values (iterable): the accepted values
            ignorecase (bool): if True, compare values case insensitively
        """

        def normalize(value):
            # None (a missing value) has no lower case form
            return value if value is None else value.lower()

        # case insensitive search is enabled only by a real True
        insensitive = ignorecase is True

        if insensitive:
            values = [normalize(value) for value in values]

        for line in self.data:
            value = getattr(line, column)

            if insensitive:
                value = normalize(value)

            if value in values:
                yield line

            else:
                logger.debug("Filtering: %s", str(line))

    # a function to detect if crbanim species are in UID database or not
    def check_species(self, country):
        """Check if all species are defined in UID DictSpecies"""

        # CRBAnim usually have species in the form required for UID
        # However sometimes there could be a common name, not a DictSpecie one
        column = 'species_latin_name'
        item_set = self.items[column]

        # call FileDataSourceMixin.check_species
        return super().check_species(column, item_set, country)

    # check that dict sex table contains data
    def check_sex(self):
        """Check that dict sex table contains data"""

        column = 'sex'

        # compare sex in lower case. A missing sex (None) is passed as it
        # is, since it can't be lowered.
        # NOTE(review): how check_items deals with None is not visible
        # from this module -- confirm
        item_set = [
            item if item is None else item.lower()
            for item in self.items[column]]

        # call FileDataSourceMixin.check_items
        return self.check_items(item_set, DictSex, column)
213
214
215
def fill_uid_breed(record, language):
    """Fill DictBreed from a crbanim record

    Args:
        record: a row read by CRBAnimReader (``species_latin_name``,
            ``country_of_origin``, ``breed_name`` and ``animal_ID`` are
            read)
        language: the language used to search a species by synonym

    Returns:
        DictBreed: the breed found or created for this record

    Raises:
        CRBAnimImportError: if ``country_of_origin`` is not a known
            ISO 3166 alpha-2 code
    """

    # get a DictSpecie object. Species are in latin names, but I can
    # find also a common name in translation tables
    try:
        specie = DictSpecie.objects.get(label=record.species_latin_name)

    except DictSpecie.DoesNotExist:
        logger.info("Search %s in synonyms", record.species_latin_name)
        # search for language synonym (if I arrived here a synonym should
        # exists)
        specie = DictSpecie.get_by_synonym(
            synonym=record.species_latin_name,
            language=language)

    # get country name using pycountry. Depending on pycountry version, an
    # unknown code returns None or raises a LookupError (KeyError included)
    try:
        country_data = pycountry.countries.get(
            alpha_2=record.country_of_origin)

    except LookupError:
        country_data = None

    if country_data is None:
        raise CRBAnimImportError(
            "Unknown country of origin '%s' for animal '%s'" % (
                record.country_of_origin, record.animal_ID))

    # get country for breeds. Ideally will be the same of submission,
    # however, it could be possible to store data from other countries
    country, created = DictCountry.objects.get_or_create(
        label=country_data.name)

    # I could create a country from a record. That's ok, maybe I could
    # have a lot of breeds from different countries and a few
    # organizations submitting them
    if created:
        logger.info("Created %s", country)

    else:
        logger.debug("Found %s", country)

    breed, created = DictBreed.objects.get_or_create(
        supplied_breed=record.breed_name,
        specie=specie,
        country=country)

    if created:
        logger.info("Created %s", breed)

    else:
        logger.debug("Found %s", breed)

    # return a DictBreed object
    return breed
262
263
264
def fill_uid_names(record, submission):
    """Get or create the Name objects of the animal and of the sample
    described in a crbanim record

    Args:
        record: a row read by CRBAnimReader (``animal_ID``,
            ``sample_identifier`` and ``sample_bibliographic_references``
            are read)
        submission: the submission which names belong to

    Returns:
        tuple: ``(animal_name, sample_name)``, two Name instances
    """

    owner = submission.owner

    # the same record holds both animal and sample identifiers: start
    # with the animal one
    animal_name, is_new = Name.objects.get_or_create(
        name=record.animal_ID,
        submission=submission,
        owner=owner)

    template = (
        "Created animal name %s" if is_new else "Found animal name %s")
    logger.debug(template % animal_name)

    # a publication is tied to sample name only if a reference is provided
    publication = None
    doi = record.sample_bibliographic_references

    if doi:
        publication, is_new = Publication.objects.get_or_create(doi=doi)

        if is_new:
            logger.debug("Created publication %s" % publication)

    # now the sample name
    sample_name, is_new = Name.objects.get_or_create(
        name=record.sample_identifier,
        submission=submission,
        owner=owner,
        publication=publication)

    template = (
        "Created sample name %s" if is_new else "Found sample name %s")
    logger.debug(template % sample_name)

    return animal_name, sample_name
305
306
307
def fill_uid_animal(record, animal_name, breed, submission, animals):
    """Create or update an Animal from a crbanim record

    Args:
        record: a row read by CRBAnimReader (``sex`` and
            ``animal_birth_date`` are read)
        animal_name: the Name instance of this animal
        breed: the DictBreed instance of this animal
        submission: the submission which owns the data
        animals (dict): animals already processed, by name. The animal
            created or updated here is added to it

    Returns:
        Animal: the animal related to ``animal_name``
    """

    # HINT: does CRBAnim models mother and father?

    key = animal_name.name

    # an animal already processed is returned as it is
    if key in animals:
        logger.debug(
            "Ignoring %s: already created or updated" % (animal_name))

        return animals[key]

    # determine sex (case insensitive match)
    sex = DictSex.objects.get(label__iexact=record.sex)

    # values go in defaults to avoid collisions when updating data.
    # HINT: CRBanim has less attribute than cryoweb
    # HINT: I could have the same animal again and again. Should I update
    # every times?
    animal, is_new = Animal.objects.update_or_create(
        name=animal_name,
        defaults={
            # HINT: is a duplication of name. Can this be non-mandatory?
            'alternative_id': key,
            'breed': breed,
            'sex': sex,
            'birth_date': record.animal_birth_date,
            # there's no birth_location for animal in CRBAnim
            'birth_location_accuracy': MISSING,
            'owner': submission.owner,
        })

    template = "Created animal %s" if is_new else "Updating animal %s"
    logger.debug(template % animal)

    # track this animal: it's needed to relate samples to it
    animals[key] = animal

    return animal
357
358
359
def find_storage_type(record):
    """Determine a sample storage relying on a dictionary

    Args:
        record: a row read by CRBAnimReader (``sample_storage_temperature``
            is read)

    Returns:
        the SAMPLE_STORAGE value matching the temperature, or None if the
        temperature is not in the mapping (a warning is logged)
    """

    # NOTE(review): '-30°C' is mapped on the -20 degrees freezer
    # description -- presumably the closest term available, confirm
    mapping = {
        '-196°C': 'frozen, liquid nitrogen',
        '-20°C': 'frozen, -20 degrees Celsius freezer',
        '-30°C': 'frozen, -20 degrees Celsius freezer',
        '-80°C': 'frozen, -80 degrees Celsius freezer'}

    temperature = record.sample_storage_temperature
    description = mapping.get(temperature)

    if description is None:
        # use the module logger, like the rest of this module, not the
        # root logger
        logger.warning(
            "Couldn't find %s in storage types mapping", temperature)

        return None

    # get ENUM conversion
    return SAMPLE_STORAGE.get_value_by_desc(description)
380
381
382
def _get_organism_part_label(record):
    """Return the (lowercase) label to use as organism part for a record"""

    # NOTE(review): CRBAnimReader.read_file stores empty cells as None, and
    # a None here raises AttributeError -- confirm these columns are
    # always filled
    body_part_name = record.body_part_name.lower()

    # sylvain has proposed to apply the following decision rule: rely on
    # body part, unless it is not informative
    if body_part_name not in ("unknown", "not relevant"):
        return body_part_name

    return record.sample_type_name.lower()


def _get_age_at_collection(record):
    """Return animal age at collection as a (value, time units) tuple"""

    animal_birth_date = parse_date(record.animal_birth_date)
    sampling_date = parse_date(record.sampling_date)

    return image_timedelta(sampling_date, animal_birth_date)


def fill_uid_sample(record, sample_name, animal, submission):
    """Helper function to fill sample data in UID sample table

    Args:
        record: a row read by CRBAnimReader
        sample_name: the Name instance of this sample
        animal: the Animal instance this sample derives from
        submission: the submission which owns the data

    Returns:
        Sample: the sample created or updated
    """

    # get a organism part. Organism parts need to be in lowercases
    organism_part, created = DictUberon.objects.get_or_create(
        label=_get_organism_part_label(record)
    )

    if created:
        logger.info("Created uberon %s", organism_part)

    else:
        logger.debug("Found uberon %s", organism_part)

    # calculate animal age at collection
    animal_age_at_collection, time_units = _get_age_at_collection(record)

    # create a new object. Using defaults to avoid collisions when
    # updating data
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': sample_name.name,
        'collection_date': record.sampling_date,
        'protocol': record.sampling_protocol_url,
        'organism_part': organism_part,
        'animal': animal,
        'owner': submission.owner,
        'storage': find_storage_type(record),
        'availability': record.sample_availability,
        'animal_age_at_collection': animal_age_at_collection,
        'animal_age_at_collection_units': time_units
    }

    sample, created = Sample.objects.update_or_create(
        name=sample_name,
        defaults=defaults)

    if created:
        logger.debug("Created sample %s", sample)

    else:
        logger.debug("Updating sample %s", sample)

    return sample
442
443
444
def process_record(record, submission, animals, language):
    """Load a single crbanim record into UID tables

    Records with an ``EBI_Biosample_identifier`` are skipped with a
    warning. For the others, breed, names, animal and sample are filled,
    in this order.

    Args:
        record: a row read by CRBAnimReader
        submission: the submission which owns the data
        animals (dict): animals already processed, by name
        language: the language used to search a species by synonym
    """

    # Peter mail 26/02/19 18:30: I agree that it sounds like we will
    # need to create sameAs BioSamples for the IMAGE project, and it makes
    # sense that the inject tool is able to do this.  It may be that we
    # tackle these cases after getting the main part of the inject tool
    # functioning and hold or ignore these existing BioSamples for now.
    # HINT: record with a biosample id should be ignored, for the moment
    already_in_biosample = record.EBI_Biosample_identifier is not None

    if already_in_biosample:
        logger.warning("Ignoring %s: already in biosample!" % str(record))
        return

    # breed comes first, then names: both are required by animal
    uid_breed = fill_uid_breed(record, language)
    uid_animal_name, uid_sample_name = fill_uid_names(record, submission)

    uid_animal = fill_uid_animal(
        record, uid_animal_name, uid_breed, submission, animals)

    # a sample needs to be related to its animal
    fill_uid_sample(record, uid_sample_name, uid_animal, submission)
466
467
468
def upload_crbanim(submission):
    """Import the CRBAnim file uploaded with a submission into UID

    The file is read, sex and species are checked, then each record is
    processed and the validation summaries of the submission are reset.
    The submission status is set to LOADED on success and to ERROR on
    failure, and a message is sent in both cases.

    Args:
        submission: the submission to import data for

    Returns:
        bool: True if import succeeded, False if an error occurred while
        checking or loading data
    """

    # debug
    logger.info("Importing from CRB-Anim file")

    # this is the full path in docker container
    fullpath = submission.get_uploaded_file_path()

    # read submission data. Errors raised here are not handled by this
    # function
    reader = CRBAnimReader()
    reader.read_file(fullpath)

    # a validation summary type with the message to log on creation
    summaries = (
        ("animal", "ValidationSummary animal created for submission %s"),
        ("sample", "ValidationSummary sample created for submission %s"),
    )

    # start data loading
    try:
        # check for species and sex in a similar way as cryoweb does
        sex_ok, missing = reader.check_sex()

        if not sex_ok:
            raise CRBAnimImportError(
                "Not all Sex terms are loaded into database: "
                "check for %s in your dataset" % (missing))

        species_ok, missing = reader.check_species(
            submission.gene_bank_country)

        if not species_ok:
            raise CRBAnimImportError(
                "Some species are not loaded in UID database: "
                "%s" % (missing))

        # ok get languages from submission (useful for translation)
        # HINT: no translations implemented, at the moment
        language = submission.gene_bank_country.label

        # a dictionary in which store animal data
        animals = {}

        for record in reader.data:
            process_record(record, submission, animals, language)

        # after processing records, get (or create) validation summary
        # objects and reset their counts
        for summary_type, created_message in summaries:
            summary, created = ValidationSummary.objects.get_or_create(
                submission=submission, type=summary_type)

            if created:
                logger.debug(created_message % submission)

            summary.reset_all_count()

    except Exception as exc:
        # save a message in database
        submission.status = ERROR
        submission.message = "Error in importing data: %s" % (str(exc))
        submission.save()

        # send async message
        send_message(submission)

        # debug
        logger.error("error in importing from crbanim: %s" % (exc))
        logger.exception(exc)

        return False

    # no errors: data are loaded
    submission.message = "CRBAnim import completed for submission: %s" % (
        submission.id)
    submission.status = LOADED
    submission.save()

    # send async message
    send_message(
        submission,
        validation_message=construct_validation_message(submission))

    logger.info("Import from CRBAnim is complete")

    return True
566