Passed
Pull Request — master (#40)
by Paolo
01:44
created

excel.helpers.fill_uid_names()   B

Complexity

Conditions 5

Size

Total Lines 36
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 19
dl 0
loc 36
rs 8.9833
c 0
b 0
f 0
cc 5
nop 2
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Tue Jul  2 10:44:54 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import xlrd
10
import logging
11
import datetime
12
13
from collections import defaultdict, namedtuple
14
15
from common.constants import (
16
    ERROR, LOADED, ACCURACIES, SAMPLE_STORAGE, SAMPLE_STORAGE_PROCESSING)
17
from common.helpers import image_timedelta
18
from image_app.models import (
19
    DictBreed, DictCountry, DictSpecie, DictSex, DictUberon, Name, Animal,
20
    Sample)
21
from language.helpers import check_species_synonyms
22
from submissions.helpers import send_message
23
from validation.helpers import construct_validation_message
24
from validation.models import ValidationSummary
25
26
# Get an instance of a logger
27
logger = logging.getLogger(__name__)
28
29
# defining the template columns in need for data import
30
TEMPLATE_COLUMNS = {
31
    'breed': [
32
        'Supplied breed',
33
        # 'Mapped breed',
34
        # 'Mapped breed ontology library',
35
        # 'Mapped breed ontology accession',
36
        'EFABIS Breed country',
37
        'Species',
38
        # 'Species ontology library',
39
        # 'Species ontology accession'
40
    ],
41
    'animal': [
42
        'Animal id in data source',
43
        'Animal description',
44
        'Alternative animal ID',
45
        'Father id in data source',
46
        'Mother id in data source',
47
        'Breed',
48
        'Species',
49
        'Sex',
50
        'Birth date',
51
        'Birth location',
52
        'Birth location longitude',
53
        'Birth location latitude',
54
        'Birth location accuracy'
55
    ],
56
    'sample': [
57
        'Sample id in data source',
58
        'Alternative sample ID',
59
        'Sample description',
60
        'Animal id in data source',
61
        'Specimen collection protocol',
62
        'availability',
63
        'Collection date',
64
        'Collection place latitude',
65
        'Collection place longitude',
66
        'Collection place',
67
        'Collection place accuracy',
68
        'Organism part',
69
        'Developmental stage',
70
        'Physiological stage',
71
        'Animal age at collection',
72
        'Sample storage',
73
        'Sample storage processing',
74
        'Sampling to preparation interval'
75
    ]
76
}
77
78
79
# A class to deal with template import errors
80
class ExcelImportError(Exception):
81
    pass
82
83
84
class ExcelTemplate():
85
    """A class to read template excel files"""
86
87
    def __init__(self):
88
        # read xls file and track it
89
        self.book = None
90
        self.sheet_names = []
91
92
    def read_file(self, filename):
93
        # read xls file and track it
94
        self.book = xlrd.open_workbook(filename)
95
        self.sheet_names = self.book.sheet_names()
96
97
    def check_sheets(self):
98
        """Test for the minimal sheets required to upload data"""
99
100
        not_found = []
101
102
        for sheet_name in TEMPLATE_COLUMNS.keys():
103
            if sheet_name not in self.sheet_names:
104
                not_found.append(sheet_name)
105
                logger.error(
106
                    "required sheet {name} not found in template".format(
107
                        name=sheet_name)
108
                )
109
110
        if len(not_found) > 0:
111
            return False, not_found
112
113
        else:
114
            logger.debug("This seems to be a valid Template file")
115
            return True, not_found
116
117
    def check_columns(self):
118
        """Test for minimal column required for template load"""
119
120
        not_found = defaultdict(list)
121
122
        for sheet_name in TEMPLATE_COLUMNS.keys():
123
            # get a sheet from xls workbook
124
            sheet = self.book.sheet_by_name(sheet_name)
125
126
            # get header from sheet
127
            header = sheet.row_values(0)
128
129
            for column in TEMPLATE_COLUMNS[sheet_name]:
130
                if column not in header:
131
                    not_found[sheet_name].append(column)
132
                    logger.error(
133
                        "required column {column} not found in sheet "
134
                        "{sheet_name}".format(
135
                            sheet_name=sheet_name,
136
                            column=column)
137
                    )
138
139
        if len(not_found) > 0:
140
            return False, not_found
141
142
        else:
143
            logger.debug("This seems to be a valid Template file")
144
            return True, not_found
145
146
    def __get_sheet_records(self, sheet_name):
147
        """Generic functions to iterate on excel records"""
148
149
        # this is the sheet I need
150
        sheet = self.book.sheet_by_name(sheet_name)
151
152
        # now get columns to create a collection objects
153
        header = sheet.row_values(0)
154
155
        column_idxs = {}
156
157
        # get the column index I need
158
        for column in TEMPLATE_COLUMNS[sheet_name]:
159
            idx = header.index(column)
160
            column_idxs[column.lower().replace(" ", "_")] = idx
161
162
        # get new column names
163
        columns = column_idxs.keys()
164
165
        # create a namedtuple object
166
        Record = namedtuple(sheet_name.capitalize(), columns)
167
168
        # iterate over record, mind the header column
169
        for i in range(1, sheet.nrows):
170
            # get a row from excel file
171
            row = sheet.row_values(i)
172
173
            # get the data I need
174
            data = [row[column_idxs[column]] for column in columns]
175
176
            # replace all empty  occurences in a list
177
            data = [None if col in [""]
178
                    else col for col in data]
179
180
            # fix date fields. Search for 'date' in column names
181
            date_idxs = [column_idxs[column] for column in columns if
182
                         'date' in column]
183
184
            # fix date objects using datetime, as described here:
185
            # https://stackoverflow.com/a/13962976/4385116
186
            for idx in date_idxs:
187
                if not data[idx]:
188
                    continue
189
190
                data[idx] = datetime.datetime(
191
                    *xlrd.xldate_as_tuple(
192
                        data[idx],
193
                        self.book.datemode
194
                    )
195
                )
196
197
            # get a new object
198
            record = Record._make(data)
199
200
            yield record
201
202
    def get_breed_records(self):
203
        """Iterate among breeds record"""
204
205
        # this is the sheet I need
206
        sheet_name = "breed"
207
        return self.__get_sheet_records(sheet_name)
208
209
    def get_animal_records(self):
210
        """Iterate among animal records"""
211
212
        # this is the sheet I need
213
        sheet_name = "animal"
214
        return self.__get_sheet_records(sheet_name)
215
216
    def get_sample_records(self):
217
        """Iterate among sample records"""
218
219
        # this is the sheet I need
220
        sheet_name = "sample"
221
        return self.__get_sheet_records(sheet_name)
222
223
    # TODO: identical to CRBanim move in a common mixin
224 View Code Duplication
    def __check_items(self, item_set, model, column):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
225
        """General check of Template items into database"""
226
227
        # a list of not found terms and a status to see if something is missing
228
        # or not
229
        not_found = []
230
        result = True
231
232
        for item in item_set:
233
            # check for species in database
234
            if not model.objects.filter(label=item).exists():
235
                not_found.append(item)
236
237
        if len(not_found) != 0:
238
            result = False
239
            logger.warning(
240
                "Those %s are not present in UID database:" % (column))
241
            logger.warning(not_found)
242
243
        return result, not_found
244
245
    # TODO: nearly identical to CRBanim move in a common mixin
246 View Code Duplication
    def check_species(self, country):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
247
        """Check if all species are defined in UID DictSpecies"""
248
249
        column = 'species'
250
        item_set = set([breed.species for breed in self.get_breed_records()])
251
252
        check, not_found = self.__check_items(
253
            item_set, DictSpecie, column)
254
255
        if check is False:
256
            # try to check in dictionary table
257
            logger.info("Searching for %s in dictionary tables" % (not_found))
258
259
            # if this function return True, I found all synonyms
260
            if check_species_synonyms(not_found, country) is True:
261
                logger.info("Found %s in dictionary tables" % not_found)
262
263
                # return True and an empty list for check and not found items
264
                return True, []
265
266
            else:
267
                # if I arrive here, there are species that I couldn't find
268
                logger.error(
269
                    "Couldnt' find those species in dictionary tables:")
270
                logger.error(not_found)
271
272
        return check, not_found
273
274
    # TODO: nearly identical to CRBanim move in a common mixin
275
    def check_sex(self):
276
        """Check that all sex records are present in database"""
277
278
        column = 'sex'
279
        item_set = set([animal.sex for animal in self.get_animal_records()])
280
281
        return self.__check_items(item_set, DictSex, column)
282
283
    def __check_accuracy(self, item_set):
284
        """A generic method to test for accuracies"""
285
286
        # a list of not found terms and a status to see if something is missing
287
        # or not
288
        not_found = []
289
        result = True
290
291
        for item in item_set:
292
            try:
293
                ACCURACIES.get_value_by_desc(item)
294
295
            except KeyError:
296
                logger.warning("accuracy level '%s' not found" % (item))
297
                not_found.append(item)
298
299
        if len(not_found) != 0:
300
            result = False
301
302
        return result, not_found
303
304
    def check_accuracies(self):
305
        """Check accuracy specified in table"""
306
307
        item_set = set([animal.birth_location_accuracy
308
                        for animal in self.get_animal_records()])
309
310
        # test for accuracy in animal table
311
        result_animal, not_found_animal = self.__check_accuracy(item_set)
312
313
        item_set = set([sample.collection_place_accuracy
314
                        for sample in self.get_sample_records()])
315
316
        # test for accuracy in sample table
317
        result_sample, not_found_sample = self.__check_accuracy(item_set)
318
319
        # merge two results
320
        check = result_animal and result_sample
321
        not_found = set(not_found_animal + not_found_sample)
322
323
        if check is False:
324
            logger.error(
325
                    "Couldnt' find those accuracies in constants:")
326
            logger.error(not_found)
327
328
        return check, not_found
329
330
331
def fill_uid_breeds(submission_obj, template):
332
    """Fill DictBreed from a excel record"""
333
334
    logger.info("fill_uid_breeds() started")
335
336
    # ok get languages from submission (useful for translation)
337
    language = submission_obj.gene_bank_country.label
338
339
    # iterate among excel template
340
    for record in template.get_breed_records():
341
        # TODO: move this in a helper module (image_app.helpers?)
342
        # get a DictSpecie object. Species are in latin names, but I can
343
        # find also a common name in translation tables
344
        try:
345
            specie = DictSpecie.objects.get(label=record.species)
346
347
        except DictSpecie.DoesNotExist:
348
            logger.info("Search %s in synonyms" % (record.species))
349
            # search for language synonym (if I arrived here a synonym should
350
            # exists)
351
            specie = DictSpecie.get_by_synonym(
352
                synonym=record.species,
353
                language=language)
354
355
        # get country for breeds. Ideally will be the same of submission,
356
        # however, it could be possible to store data from other contries
357
        country, created = DictCountry.objects.get_or_create(
358
            label=record.efabis_breed_country)
359
360
        # I could create a country from a v_breed_specie instance. That's
361
        # ok, maybe I could have a lot of breed from different countries and
362
        # a few organizations submitting them
363
        if created:
364
            logger.info("Created %s" % country)
365
366
        else:
367
            logger.debug("Found %s" % country)
368
369
        breed, created = DictBreed.objects.get_or_create(
370
            supplied_breed=record.supplied_breed,
371
            specie=specie,
372
            country=country)
373
374
        if created:
375
            logger.info("Created %s" % breed)
376
377
        else:
378
            logger.debug("Found %s" % breed)
379
380
    logger.info("fill_uid_breeds() completed")
381
382
383
def fill_uid_names(submission_obj, template):
384
    """fill Names table from crbanim record"""
385
386
    # debug
387
    logger.info("called fill_uid_names()")
388
389
    # iterate among excel template
390
    for record in template.get_animal_records():
391
        # in the same record I have the sample identifier and animal identifier
392
        # a name record for animal
393
        animal_name, created = Name.objects.get_or_create(
394
            name=record.animal_id_in_data_source,
395
            submission=submission_obj,
396
            owner=submission_obj.owner)
397
398
        if created:
399
            logger.debug("Created animal name %s" % animal_name)
400
401
        else:
402
            logger.debug("Found animal name %s" % animal_name)
403
404
    # iterate among excel template
405
    for record in template.get_sample_records():
406
        # name record for sample
407
        sample_name, created = Name.objects.get_or_create(
408
            name=record.sample_id_in_data_source,
409
            submission=submission_obj,
410
            owner=submission_obj.owner)
411
412
        if created:
413
            logger.debug("Created sample name %s" % sample_name)
414
415
        else:
416
            logger.debug("Found sample name %s" % sample_name)
417
418
    logger.info("fill_uid_names() completed")
419
420
421
def fill_uid_animals(submission_obj, template):
422
    # debug
423
    logger.info("called fill_uid_animals()")
424
425
    # get submission language
426
    language = submission_obj.gene_bank_country.label
427
428
    # iterate among excel template
429
    for record in template.get_animal_records():
430
        # determine sex. Check for values
431
        sex = DictSex.objects.get(label__iexact=record.sex)
432
433
        # get specie
434
        specie = DictSpecie.objects.get(label=record.species)
435
436
        # how I can get breed from my data?
437
        breeds = [breed for breed in template.get_breed_records()
438
                  if breed.supplied_breed == record.breed and
439
                  breed.species == record.species]
440
441
        # breed is supposed to be unique, from UID constraints. However
442
        # I could place the same breed name for two countries. In that case,
443
        # I cant derive a unique breed from users data
444
        if len(breeds) != 1:
445
            raise ExcelImportError(
446
                "Can't determine a unique breed for '%s:%s' from user data" %
447
                (record.breed, record.species))
448
449
        # get a country for this breed
450
        country = DictCountry.objects.get(
451
            label=breeds[0].efabis_breed_country)
452
453
        # ok get a real dictbreed object
454
        breed = DictBreed.objects.get(
455
            supplied_breed=record.breed,
456
            specie=specie,
457
            country=country)
458
459
        logger.debug("Selected breed is %s" % (breed))
460
461
        # define names
462
        name, mother, father = None, None, None
463
464
        # get name for this animal and for mother and father
465
        logger.debug("Getting %s as my name" % (
466
            record.animal_id_in_data_source))
467
468
        name = Name.objects.get(
469
            name=record.animal_id_in_data_source,
470
            submission=submission_obj)
471
472
        if record.father_id_in_data_source:
473
            logger.debug("Getting %s as father" % (
474
                record.father_id_in_data_source))
475
476
            father = Name.objects.get(
477
                name=record.father_id_in_data_source,
478
                submission=submission_obj)
479
480
        if record.mother_id_in_data_source:
481
            logger.debug("Getting %s as mother" % (
482
                record.mother_id_in_data_source))
483
484
            mother = Name.objects.get(
485
                name=record.mother_id_in_data_source,
486
                submission=submission_obj)
487
488
        # now get accuracy
489
        accuracy = ACCURACIES.get_value_by_desc(
490
            record.birth_location_accuracy)
491
492
        # create a new object. Using defaults to avoid collisions when
493
        # updating data
494
        defaults = {
495
            'alternative_id': record.alternative_animal_id,
496
            'description': record.animal_description,
497
            'breed': breed,
498
            'sex': sex,
499
            'father': father,
500
            'mother': mother,
501
            'birth_date': record.birth_date,
502
            'birth_location': record.birth_location,
503
            'birth_location_latitude': record.birth_location_latitude,
504
            'birth_location_longitude': record.birth_location_longitude,
505
            'birth_location_accuracy': accuracy,
506
            'owner': submission_obj.owner
507
        }
508
509
        animal, created = Animal.objects.update_or_create(
510
            name=name,
511
            defaults=defaults)
512
513
        if created:
514
            logger.debug("Created %s" % animal)
515
516
        else:
517
            logger.debug("Updating %s" % animal)
518
519
    # create a validation summary object and set all_count
520
    validation_summary, created = ValidationSummary.objects.get_or_create(
521
        submission=submission_obj, type="animal")
522
523
    if created:
524
        logger.debug(
525
            "ValidationSummary animal created for submission %s" %
526
            submission_obj)
527
528
    # reset counts
529
    validation_summary.reset_all_count()
530
531
    # debug
532
    logger.info("fill_uid_animals() completed")
533
534
535
def fill_uid_samples(submission_obj, template):
536
    # debug
537
    logger.info("called fill_uid_samples()")
538
539
    # iterate among excel template
540
    for record in template.get_sample_records():
541
        # get name for this sample
542
        name = Name.objects.get(
543
            name=record.sample_id_in_data_source,
544
            submission=submission_obj,
545
            owner=submission_obj.owner)
546
547
        # get animal by reading record
548
        animal = Animal.objects.get(
549
            name__name=record.animal_id_in_data_source,
550
            name__submission=submission_obj)
551
552
        # get a organism part. Organism parts need to be in lowercases
553
        organism_part, created = DictUberon.objects.get_or_create(
554
            label=record.organism_part
555
        )
556
557
        if created:
558
            logger.info("Created %s" % organism_part)
559
560
        else:
561
            logger.debug("Found %s" % organism_part)
562
563
        # TODO: get developmental_stage and physiological_stage terms
564
565
        # derive animal age at collection. THis function deals with NULL valies
566
        animal_age_at_collection, time_units = image_timedelta(
567
            record.collection_date, animal.birth_date)
568
569
        # now get accuracy
570
        accuracy = ACCURACIES.get_value_by_desc(
571
            record.collection_place_accuracy)
572
573
        # now get storage and storage processing
574
        # TODO; check those values in excel columns
575
        storage = SAMPLE_STORAGE.get_value_by_desc(
576
            record.sample_storage)
577
578
        storage_processing = SAMPLE_STORAGE_PROCESSING.get_value_by_desc(
579
            record.sample_storage_processing)
580
581
        # create a new object. Using defaults to avoid collisions when
582
        # updating data
583
        defaults = {
584
            'alternative_id': record.alternative_sample_id,
585
            'description': record.sample_description,
586
            'animal': animal,
587
            'protocol': record.specimen_collection_protocol,
588
            'collection_date': record.collection_date,
589
            'collection_place_latitude': record.collection_place_latitude,
590
            'collection_place_longitude': record.collection_place_longitude,
591
            'collection_place': record.collection_place,
592
            'collection_place_accuracy': accuracy,
593
            'organism_part': organism_part,
594
            # 'developmental_stage': None,
595
            # 'physiological_stage': None,
596
            'animal_age_at_collection': animal_age_at_collection,
597
            'animal_age_at_collection_units': time_units,
598
            'availability': record.availability,
599
            'storage': storage,
600
            'storage_processing': storage_processing,
601
            # TODO: this is a time unit column
602
            'preparation_interval': record.sampling_to_preparation_interval,
603
            'owner': submission_obj.owner,
604
        }
605
606
        sample, created = Sample.objects.update_or_create(
607
            name=name,
608
            defaults=defaults)
609
610
        if created:
611
            logger.debug("Created %s" % sample)
612
613
        else:
614
            logger.debug("Updating %s" % sample)
615
616
    # create a validation summary object and set all_count
617
    validation_summary, created = ValidationSummary.objects.get_or_create(
618
        submission=submission_obj, type="sample")
619
620
    if created:
621
        logger.debug(
622
            "ValidationSummary animal created for submission %s" %
623
            submission_obj)
624
625
    # reset counts
626
    validation_summary.reset_all_count()
627
628
    # debug
629
    logger.info("fill_uid_samples() completed")
630
631
632
def upload_template(submission_obj):
633
    # debug
634
    logger.info("Importing from Excel template file")
635
636
    # this is the full path in docker container
637
    fullpath = submission_obj.get_uploaded_file_path()
638
639
    # read submission data
640
    reader = ExcelTemplate()
641
    reader.read_file(fullpath)
642
643
    # start data loading
644
    try:
645
        # check for species and sex in a similar way as cryoweb does
646
        # TODO: identical to CRBanim. Move to a mixin
647
        check, not_found = reader.check_sex()
648
649
        if not check:
650
            message = (
651
                "Not all Sex terms are loaded into database: "
652
                "check for %s in your dataset" % (not_found))
653
654
            raise ExcelImportError(message)
655
656
        check, not_found = reader.check_species(
657
            submission_obj.gene_bank_country)
658
659
        if not check:
660
            raise ExcelImportError(
661
                "Some species are not loaded in UID database: "
662
                "%s" % (not_found))
663
664
        check, not_found = reader.check_accuracies()
665
666
        if not check:
667
            message = (
668
                "Not all accuracy levels are defined in database: "
669
                "check for %s in your dataset" % (not_found))
670
671
            raise ExcelImportError(message)
672
673
        # BREEDS
674
        fill_uid_breeds(submission_obj, reader)
675
676
        # NAME
677
        fill_uid_names(submission_obj, reader)
678
679
        # ANIMALS
680
        fill_uid_animals(submission_obj, reader)
681
682
        # SAMPLES
683
        fill_uid_samples(submission_obj, reader)
684
685
    except Exception as exc:
686
        # set message:
687
        message = "Error in importing data: %s" % (str(exc))
688
689
        # save a message in database
690
        submission_obj.status = ERROR
691
        submission_obj.message = message
692
        submission_obj.save()
693
694
        # send async message
695
        send_message(submission_obj)
696
697
        # debug
698
        logger.error("Error in importing from Template: %s" % (exc))
699
        logger.exception(exc)
700
701
        return False
702
703
    else:
704
        message = "Template import completed for submission: %s" % (
705
            submission_obj.id)
706
707
        submission_obj.message = message
708
        submission_obj.status = LOADED
709
        submission_obj.save()
710
711
        # send async message
712
        send_message(
713
            submission_obj,
714
            validation_message=construct_validation_message(submission_obj))
715
716
    logger.info("Import from Template is complete")
717
718
    return True
719