Completed
Push — master ( eecf08...313cfe )
by Paolo
15s queued 12s
created

crbanim.helpers.check_UID()   A

Complexity

Conditions 4

Size

Total Lines 27
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 18
dl 0
loc 27
rs 9.5
c 0
b 0
f 0
cc 4
nop 2
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Thu Feb 21 15:37:16 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import io
10
import csv
11
import urllib
12
import logging
13
import pycountry
14
15
from collections import defaultdict, namedtuple
16
17
from django.utils.dateparse import parse_date
18
19
from common.constants import LOADED, ERROR, MISSING, SAMPLE_STORAGE
20
from common.helpers import image_timedelta
21
from image_app.helpers import (
22
    FileDataSourceMixin, get_or_create_obj, update_or_create_obj)
23
from image_app.models import (
24
    DictSpecie, DictSex, DictCountry, DictBreed, Name, Animal, Sample,
25
    DictUberon, Publication)
26
from submissions.helpers import send_message
27
from validation.helpers import construct_validation_message
28
from validation.models import ValidationSummary
29
30
# Get an instance of a logger
31
logger = logging.getLogger(__name__)
32
33
34
# A class to deal with CRBAnim import errors
class CRBAnimImportError(Exception):
    """Raised when CRBAnim data can't be imported into UID."""
37
38
39
class CRBAnimReader(FileDataSourceMixin):
    """Read a CRBAnim CSV file and check its values against UID tables.

    After calling :meth:`read_file` the instance exposes:

    * ``filename``: the path which was read
    * ``dialect``: the CSV dialect sniffed from the first 2048 characters
    * ``header``: the column names (first row of the file)
    * ``data``: a list of ``Data`` namedtuples, one for each record
    * ``items``: for each column, the set of values found in ``data``
    """

    # columns required to consider a file as a CRBAnim one (see is_valid)
    mandatory_columns = [
            'sex',
            'species_latin_name',
            'country_of_origin',
            'breed_name',
            'animal_ID',
            'sample_bibliographic_references',
            'sample_identifier',
            'animal_birth_date',
            'sample_storage_temperature',
            'sample_type_name',
            'body_part_name',
            'sampling_date',
            'sampling_protocol_url',
            'sample_availability',
            'EBI_Biosample_identifier',
        ]

    def __init__(self):
        """Initialize an empty reader: everything is set by read_file"""

        self.data = None
        self.header = None
        self.dialect = None
        self.items = None
        self.filename = None

    @classmethod
    def get_dialect(cls, chunk):
        """Determine dialect of a CSV from a chunk"""

        return csv.Sniffer().sniff(chunk)

    @classmethod
    def is_valid(cls, chunk):
        """Try to determine if CRBanim has at least the required columns
        or not

        Returns a ``(bool, list)`` tuple: the flag tells if every mandatory
        column was found in the first row of ``chunk``, the list collects
        the missing column names (empty when valid)."""

        dialect = cls.get_dialect(chunk)

        # read the header from the string chunk; a chunk without rows is
        # treated as an empty header (all columns missing)
        reader = csv.reader(io.StringIO(chunk), dialect)
        header = next(reader, [])

        not_found = [
            column for column in cls.mandatory_columns
            if column not in header]

        if not not_found:
            logger.debug("This seems to be a valid CRBanim file")
            return True, []

        logger.error(
            "Couldn't not find mandatory CRBanim columns %s", not_found)
        return False, not_found

    def read_file(self, filename):
        """Read a CRBAnim file and store its content in instance attributes

        Cells equal to ``\\N`` or to the empty string are stored as ``None``
        and an ``unknown`` sex (case insensitive) is replaced with
        ``record of unknown sex``."""

        with open(filename, newline='') as handle:
            # initialize data
            self.filename = filename
            self.data = []

            # sniff dialect from the first characters, then rewind
            self.dialect = self.get_dialect(handle.read(2048))
            handle.seek(0)

            # read csv file
            reader = csv.reader(handle, self.dialect)
            self.header = next(reader)

            # find sex index column
            sex_idx = self.header.index('sex')

            # create a namedtuple object
            Data = namedtuple("Data", self.header)

            for row in reader:
                # csv.reader returns an empty list for blank lines
                if not row:
                    continue

                # replace all "\\N" and empty occurrences with None
                row = [None if col in ("\\N", "") else col for col in row]

                # 'unknown' sex should be replaced with 'record of unknown
                # sex'. A missing sex is None here and is left untouched
                sex = row[sex_idx]

                if sex is not None and sex.lower() == 'unknown':
                    logger.debug(
                        "Changing '%s' with '%s'",
                        sex, 'record of unknown sex')
                    row[sex_idx] = 'record of unknown sex'

                self.data.append(Data._make(row))

        self.items = self.eval_columns()

    def eval_columns(self):
        """Return, for each header column, the set of values read in data"""

        # a defaultdict as before: unknown columns evaluate to an empty list
        items = defaultdict(list)

        for idx, column in enumerate(self.header):
            items[column] = {line[idx] for line in self.data}

        return items

    def print_line(self, num):
        """Log (debug level) the ``num``-th record with its column names"""

        for column, value in zip(self.header, self.data[num]):
            logger.debug("%s: %s", column, value)

    def filter_by_column_values(self, column, values, ignorecase=False):
        """Yield the records whose ``column`` attribute is in ``values``

        With ``ignorecase=True`` both ``values`` and the record attribute are
        lowered before comparing. Discarded records are logged at debug
        level. A ``None`` attribute never matches when ignoring case."""

        if ignorecase is True:
            # lower values
            values = [value.lower() for value in values]

        for line in self.data:
            attribute = getattr(line, column)

            # missing values (None) can't be lowered
            if ignorecase is True and attribute is not None:
                attribute = attribute.lower()

            if attribute in values:
                yield line

            else:
                logger.debug("Filtering: %s", str(line))

    # a function to detect if crbanim species are in UID database or not
    def check_species(self, country):
        """Check if all species are defined in UID DictSpecies"""

        # CRBAnim usually have species in the form required for UID
        # However sometimes there could be a common name, not a DictSpecie one
        column = 'species_latin_name'

        # call FileDataSourceMixin.check_species
        return super().check_species(column, self.items[column], country)

    # check that dict sex table contains data
    def check_sex(self):
        """check that dict sex table contains data"""

        column = 'sex'

        # sex labels are compared in lowercase. A missing sex (None) is
        # passed as it is to check_items
        # NOTE(review): confirm how check_items deals with None values
        item_set = [
            item.lower() if item is not None else item
            for item in self.items[column]]

        # call FileDataSourceMixin.check_items
        return self.check_items(item_set, DictSex, column)

    def check_countries(self):
        """Check that all CRBAnim countries are present in database"""

        def get_label(country_of_origin):
            # depending on version, pycountry returns None or raises a
            # LookupError for unknown codes
            try:
                country = pycountry.countries.get(alpha_2=country_of_origin)

            except LookupError:
                country = None

            # keep the supplied value when no country is found, so that it
            # is passed to check_items instead of raising AttributeError
            if country is None:
                return country_of_origin

            return country.name

        column = "country_of_origin"
        item_set = [get_label(item) for item in self.items[column]]

        # call FileDataSourceMixin.check_items
        return self.check_items(item_set, DictCountry, column)
228
229
230
def fill_uid_breed(record, language):
    """Fill DictBreed from a crbanim record

    Reads ``species_latin_name``, ``country_of_origin`` and ``breed_name``
    from ``record`` and returns the DictBreed object provided by
    ``get_or_create_obj``."""

    # Species are in latin names, but a common name could be found using
    # translation tables
    specie = DictSpecie.get_specie_check_synonyms(
        species_label=record.species_latin_name,
        language=language)

    # country_of_origin is used as alpha_2 code to get a country name
    pycountry_obj = pycountry.countries.get(alpha_2=record.country_of_origin)

    # get country for breeds. Ideally will be the same of submission,
    # however, it could be possible to store data from other countries
    country = DictCountry.objects.get(label=pycountry_obj.name)

    # return a DictBreed object
    return get_or_create_obj(
        DictBreed,
        supplied_breed=record.breed_name,
        specie=specie,
        country=country)
255
256
257
def fill_uid_names(record, submission):
    """Fill Name table from a crbanim record

    The same record holds both the animal and the sample identifiers: a
    Name is requested for each of them (through ``get_or_create_obj``) and
    the ``(animal_name, sample_name)`` tuple is returned."""

    owner = submission.owner

    # a name record for animal
    animal_name = get_or_create_obj(
        Name,
        name=record.animal_ID,
        submission=submission,
        owner=owner)

    # a publication is tracked only when a reference is provided
    publication = None
    doi = record.sample_bibliographic_references

    if doi:
        publication = get_or_create_obj(Publication, doi=doi)

    # name record for sample (with its publication, if any)
    sample_name = get_or_create_obj(
        Name,
        name=record.sample_identifier,
        submission=submission,
        owner=owner,
        publication=publication)

    # returning 2 Name instances
    return animal_name, sample_name
286
287
288
def fill_uid_animal(record, animal_name, breed, submission, animals):
    """Helper function to fill animal data in UID animal table

    ``animals`` is a dictionary (animal name -> Animal) shared between
    calls: an animal already tracked is returned as it is, otherwise it is
    created or updated, stored in ``animals`` and returned."""

    # HINT: does CRBAnim models mother and father?

    key = animal_name.name

    # check if such animal has already been updated
    if key in animals:
        logger.debug("Ignoring %s: already created or updated", animal_name)
        return animals[key]

    # Using defaults to avoid collisions when updating data
    # HINT: CRBanim has less attribute than cryoweb
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': key,
        'breed': breed,
        # sex labels are matched ignoring case
        'sex': DictSex.objects.get(label__iexact=record.sex),
        'birth_date': record.animal_birth_date,
        # there's no birth_location for animal in CRBAnim
        'birth_location_accuracy': MISSING,
        'owner': submission.owner
    }

    # HINT: I could have the same animal again and again. Should I update
    # every times?
    animal = update_or_create_obj(
        Animal,
        name=animal_name,
        defaults=defaults)

    # track this animal: it is needed to relate samples
    animals[key] = animal

    return animal
333
334
335
def find_storage_type(record):
    """Determine a sample storage relying on a dictionary

    ``record.sample_storage_temperature`` is translated into a storage
    description, which is then converted using
    ``SAMPLE_STORAGE.get_value_by_desc``. ``None`` is returned (and a
    warning is logged) for temperatures not defined in the mapping."""

    # note that -30°C shares the same description of -20°C
    mapping = {
        '-196°C': 'frozen, liquid nitrogen',
        '-20°C': 'frozen, -20 degrees Celsius freezer',
        '-30°C': 'frozen, -20 degrees Celsius freezer',
        '-80°C': 'frozen, -80 degrees Celsius freezer'}

    temperature = record.sample_storage_temperature

    if temperature not in mapping:
        # use the module logger (not the root one) like the rest of the file
        logger.warning(
            "Couldn't find %s in storage types mapping", temperature)

        return None

    # get ENUM conversion
    return SAMPLE_STORAGE.get_value_by_desc(mapping[temperature])
356
357
358
def sanitize_url(url):
    """Quote URLs for accession

    Percent-encode ``url`` leaving the ``:/#?=`` characters as they are.
    A missing url (``None``) is returned unchanged instead of raising
    ``TypeError``."""

    # imported here since a bare ``import urllib`` doesn't ensure that the
    # ``urllib.parse`` submodule is loaded
    from urllib.parse import quote

    if url is None:
        return None

    return quote(url, ':/#?=')
362
363
364
def fill_uid_sample(record, sample_name, animal, submission):
    """Helper function to fill sample data in UID sample table

    The Sample related to ``sample_name`` is created or updated (through
    ``update_or_create_obj``) with values read from ``record`` and then
    returned."""

    # labels are compared (and stored) in lowercase
    sample_type_name = record.sample_type_name.lower()
    body_part_name = record.body_part_name.lower()

    # sylvain has proposed to apply the following decision rule: rely on
    # sample type when body part gives no information
    if body_part_name in ("unknown", "not relevant"):
        organism_part_label = sample_type_name

    else:
        organism_part_label = body_part_name

    # get a organism part. Organism parts need to be in lowercases
    organism_part = get_or_create_obj(DictUberon, label=organism_part_label)

    # calculate animal age at collection from the two record dates
    birth_date = parse_date(record.animal_birth_date)
    collection_date = parse_date(record.sampling_date)
    age, units = image_timedelta(collection_date, birth_date)

    # create a new object. Using defaults to avoid collisions when
    # updating data
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': sample_name.name,
        'collection_date': record.sampling_date,
        'protocol': record.sampling_protocol_url,
        'organism_part': organism_part,
        'animal': animal,
        'owner': submission.owner,
        'storage': find_storage_type(record),
        'availability': sanitize_url(record.sample_availability),
        'animal_age_at_collection': age,
        'animal_age_at_collection_units': units
    }

    return update_or_create_obj(
        Sample,
        name=sample_name,
        defaults=defaults)
414
415
416
def process_record(record, submission, animals, language):
    """Load a single crbanim record into UID

    Breed, names, animal and sample are filled in this order. Records with
    an ``EBI_Biosample_identifier`` are skipped with a warning."""

    # Peter mail 26/02/19 18:30: I agree that it sounds like we will
    # need to create sameAs BioSamples for the IMAGE project, and it makes
    # sense that the inject tool is able to do this.  It may be that we
    # tackle these cases after getting the main part of the inject tool
    # functioning and hold or ignore these existing BioSamples for now.
    # HINT: record with a biosample id should be ignored, for the moment
    if record.EBI_Biosample_identifier is not None:
        logger.warning("Ignoring %s: already in biosample!", str(record))
        return

    # breed first, then the two names
    breed = fill_uid_breed(record, language)
    animal_name, sample_name = fill_uid_names(record, submission)

    # the animal is required to relate its sample
    animal = fill_uid_animal(record, animal_name, breed, submission, animals)
    fill_uid_sample(record, sample_name, animal, submission)
438
439
440
def check_UID(submission, reader):
    """Check that sex, countries and species of reader are defined in UID

    Each ``reader`` check returns a ``(check, not_found)`` tuple: the first
    failing check raises ``CRBAnimImportError`` with the missing terms in
    its message. Nothing is returned when all checks pass."""

    # check for species and sex in a similar way as cryoweb does
    check, not_found = reader.check_sex()

    if not check:
        # not_found is wrapped in a tuple to be formatted as a single value
        # whatever its type is
        raise CRBAnimImportError(
            "Not all Sex terms are loaded into database: "
            "check for '%s' in your dataset" % (not_found,))

    # check for countries
    check, not_found = reader.check_countries()

    if not check:
        raise CRBAnimImportError(
            "Not all countries are loaded into database: "
            "check for '%s' in your dataset" % (not_found,))

    # species are checked relying on the submission country
    check, not_found = reader.check_species(submission.gene_bank_country)

    if not check:
        raise CRBAnimImportError(
            "Some species are not loaded in UID database: "
            "check for '%s' in your dataset" % (not_found,))
467
468
469
def upload_crbanim(submission):
    """Import the CRBAnim file uploaded with a submission into UID

    On success the submission is set to ``LOADED`` and ``True`` is
    returned. Any exception raised while reading or loading data is logged,
    its message is saved in the submission (set to ``ERROR``) and ``False``
    is returned. In both cases a message is sent with ``send_message``."""

    # debug
    logger.info("Importing from CRB-Anim file")

    # this is the full path in docker container
    fullpath = submission.get_uploaded_file_path()

    # start data loading
    try:
        # reading is done inside the try block: a file which can't be parsed
        # needs to mark the submission with ERROR like any other failure
        reader = CRBAnimReader()
        reader.read_file(fullpath)

        # check UID data like cryoweb does
        check_UID(submission, reader)

        # ok get languages from submission (useful for translation)
        # HINT: no translations implemented, at the moment
        language = submission.gene_bank_country.label

        # a dictionary in which store animal data
        animals = {}

        for record in reader.data:
            process_record(record, submission, animals, language)

        # after processing records, get a validation summary object for
        # animals and samples and reset their counts
        for summary_type in ("animal", "sample"):
            summary = get_or_create_obj(
                ValidationSummary,
                submission=submission,
                type=summary_type)

            summary.reset_all_count()

    except Exception as exc:
        # save a message in database
        submission.status = ERROR
        submission.message = "Error in importing data: %s" % (str(exc))
        submission.save()

        # send async message
        send_message(submission)

        # debug
        logger.error("error in importing from crbanim: %s", exc)
        logger.exception(exc)

        return False

    else:
        submission.message = (
            "CRBAnim import completed for submission: %s" % (
                submission.id))
        submission.status = LOADED
        submission.save()

        # send async message
        send_message(
            submission,
            validation_message=construct_validation_message(submission))

    logger.info("Import from CRBAnim is complete")

    return True
547