Passed
Pull Request — master (#40)
by Paolo
01:16
created

crbanim.helpers   B

Complexity

Total Complexity 43

Size/Duplication

Total Lines 514
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 43
eloc 271
dl 0
loc 514
rs 8.96
c 0
b 0
f 0

7 Functions

Rating   Name   Duplication   Size   Complexity  
A find_storage_type() 0 21 2
A fill_uid_breed() 0 27 1
A fill_uid_names() 0 29 2
A fill_uid_animal() 0 45 2
A process_record() 0 22 2
B upload_crbanim() 0 92 6
A fill_uid_sample() 0 50 3

9 Methods

Rating   Name   Duplication   Size   Complexity  
A CRBAnimReader.print_line() 0 5 2
A CRBAnimReader.is_valid() 0 28 4
A CRBAnimReader.get_dialect() 0 5 1
A CRBAnimReader.check_sex() 0 9 1
B CRBAnimReader.read_file() 0 42 5
A CRBAnimReader.check_species() 0 10 1
A CRBAnimReader.__init__() 0 6 1
A CRBAnimReader.eval_columns() 0 18 4
B CRBAnimReader.filter_by_column_values() 0 20 6

How to fix   Complexity   

Complexity

Complex classes like crbanim.helpers often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Thu Feb 21 15:37:16 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import io
10
import csv
11
import logging
12
import pycountry
13
14
from collections import defaultdict, namedtuple
15
16
from django.utils.dateparse import parse_date
17
18
from common.constants import LOADED, ERROR, MISSING, SAMPLE_STORAGE
19
from common.helpers import image_timedelta
20
from image_app.helpers import (
21
    FileDataSourceMixin, get_or_create_obj, update_or_create_obj)
22
from image_app.models import (
23
    DictSpecie, DictSex, DictCountry, DictBreed, Name, Animal, Sample,
24
    DictUberon, Publication)
25
from submissions.helpers import send_message
26
from validation.helpers import construct_validation_message
27
from validation.models import ValidationSummary
28
29
# Get an instance of a logger
30
logger = logging.getLogger(__name__)
31
32
33
# A class to deal with CRBAnim import errors
34
class CRBAnimImportError(Exception):
    """Signal that a CRBAnim file can't be imported into the UID database."""
36
37
38
class CRBAnimReader(FileDataSourceMixin):
    """Read a CRBAnim CSV file and expose its content for the UID import.

    After :meth:`read_file` has been called:

    * ``header`` is the list of column names read from the first row;
    * ``data`` is a list with one namedtuple per row (fields are the
      header columns, empty cells and ``\\N`` are stored as ``None``);
    * ``items`` maps each column name to the set of values found in it.
    """

    # columns that need to be in the file header (see is_valid)
    mandatory_columns = [
            'sex',
            'species_latin_name',
            'country_of_origin',
            'breed_name',
            'animal_ID',
            'sample_bibliographic_references',
            'sample_identifier',
            'animal_birth_date',
            'sample_storage_temperature',
            'sample_type_name',
            'body_part_name',
            'sampling_date',
            'sampling_protocol_url',
            'sample_availability',
            'EBI_Biosample_identifier',
        ]

    def __init__(self):
        # everything is filled by read_file
        self.data = None
        self.header = None
        self.dialect = None
        self.items = None
        self.filename = None

    @classmethod
    def get_dialect(cls, chunk):
        """Determine dialect of a CSV from a chunk"""

        return csv.Sniffer().sniff(chunk)

    @classmethod
    def is_valid(cls, chunk):
        """Try to determine if CRBanim has at least the required columns
        or not

        Args:
            chunk (str): the first part of the file, header row included

        Returns:
            tuple: ``(True, [])`` when every mandatory column is in the
            header, ``(False, not_found)`` otherwise, where ``not_found``
            is the list of the missing column names
        """

        dialect = cls.get_dialect(chunk)

        # get a handle from a string
        handle = io.StringIO(chunk)

        # the header is the first row of the chunk
        reader = csv.reader(handle, dialect)
        header = next(reader)

        not_found = [
            column for column in cls.mandatory_columns
            if column not in header]

        if not not_found:
            logger.debug("This seems to be a valid CRBanim file")
            return True, []

        logger.error(
            "Couldn't find mandatory CRBanim columns %s", not_found)
        return False, not_found

    def read_file(self, filename):
        """Read a CRBAnim file and set its content to instance attributes

        Fills ``filename``, ``dialect``, ``header``, ``data`` and ``items``.

        Args:
            filename (str): the path of the CSV file to read
        """

        with open(filename, newline='') as handle:
            # initialize data
            self.filename = filename
            self.data = []

            # get dialect
            chunk = handle.read(2048)
            self.dialect = self.get_dialect(chunk)

            # restart file from the beginning
            handle.seek(0)

            # read csv file
            reader = csv.reader(handle, self.dialect)
            self.header = next(reader)

            # find sex index column
            sex_idx = self.header.index('sex')

            # create a namedtuple object
            Data = namedtuple("Data", self.header)

            # add records to data
            for record in reader:
                # csv.reader returns an empty list for a blank line: there
                # is nothing to load from it
                if not record:
                    continue

                # replace all "\\N" and empty occurences with None
                record = [None if col in ["\\N", ""]
                          else col for col in record]

                # 'unknown' sex should be replaced with 'record of unknown
                # sex'. The sex cell could be None after the replacement
                # above, so check it before calling a str method
                sex = record[sex_idx]

                if sex is not None and sex.lower() == 'unknown':
                    logger.debug(
                        "Changing '%s' with '%s'" % (
                            sex, 'record of unknown sex'))
                    record[sex_idx] = 'record of unknown sex'

                self.data.append(Data._make(record))

        self.items = self.eval_columns()

    def eval_columns(self):
        """define a set from column data

        Returns:
            collections.defaultdict: for each column in header, the set of
            the values read in such column
        """

        items = defaultdict(list)

        # enumerate gives the column position directly, with no need to
        # search for it in header for every cell
        for idx, column in enumerate(self.header):
            items[column] = {line[idx] for line in self.data}

        return items

    def print_line(self, num):
        """print a record with its column names"""

        for i, column in enumerate(self.header):
            logger.debug("%s: %s" % (column, self.data[num][i]))

    def filter_by_column_values(self, column, values, ignorecase=False):
        """Yield the records having one of values in column

        Discarded records are logged at debug level.

        Args:
            column (str): the column name (a field of data records)
            values (list): the accepted values
            ignorecase (bool): when True, compare lowercase values

        Yields:
            the records of ``data`` matching the filter
        """

        if ignorecase is True:
            # lower values (None can't be lowered)
            values = [
                value.lower() if value is not None else value
                for value in values]

        for line in self.data:
            attribute = getattr(line, column)

            # search for case insensitive value (lower attrib in lower
            # values). Empty cells are None and are compared as they are
            if ignorecase is True and attribute is not None:
                attribute = attribute.lower()

            if attribute in values:
                yield line

            else:
                logger.debug("Filtering: %s" % (str(line)))

    # a function to detect if crbanim species are in UID database or not
    def check_species(self, country):
        """Check if all species are defined in UID DictSpecies

        Returns what FileDataSourceMixin.check_species returns for the
        values of the 'species_latin_name' column"""

        # CRBAnim usually have species in the form required for UID
        # However sometimes there could be a common name, not a DictSpecie one
        column = 'species_latin_name'
        item_set = self.items[column]

        # call FileDataSourceMixin.check_species
        return super().check_species(column, item_set, country)

    # check that dict sex table contains data
    def check_sex(self):
        """check that dict sex table contains data

        Returns what FileDataSourceMixin.check_items returns for the
        lowercase values of the 'sex' column"""

        # item.sex are in uppercase
        column = 'sex'

        # NOTE(review): an empty sex cell is None and is passed as it is,
        # presumably check_items will report it as not found -- confirm
        item_set = [
            item.lower() if item is not None else item
            for item in self.items[column]]

        # call FileDataSourceMixin.check_items
        return self.check_items(item_set, DictSex, column)
214
215
216
def fill_uid_breed(record, language):
    """Fill DictBreed from a crbanim record

    Args:
        record: a CRBAnim record, read for ``species_latin_name``,
            ``country_of_origin`` (a two letters country code) and
            ``breed_name``
        language: the language passed to
            ``DictSpecie.get_specie_check_synonyms``

    Returns:
        the DictBreed object got (or created) for this record

    Raises:
        CRBAnimImportError: if ``country_of_origin`` is not a country code
            known by pycountry
    """

    # get a DictSpecie object. Species are in latin names, but I can
    # find also a common name in translation tables
    specie = DictSpecie.get_specie_check_synonyms(
            species_label=record.species_latin_name,
            language=language)

    # get country using pycountries. No match means None, so check the
    # result before asking for its name
    country_data = pycountry.countries.get(
        alpha_2=record.country_of_origin)

    if country_data is None:
        raise CRBAnimImportError(
            "Unknown country code '%s' in country_of_origin "
            "(breed '%s')" % (record.country_of_origin, record.breed_name))

    # get country for breeds. Ideally will be the same of submission,
    # however, it could be possible to store data from other contries
    country = get_or_create_obj(
        DictCountry,
        label=country_data.name)

    breed = get_or_create_obj(
        DictBreed,
        supplied_breed=record.breed_name,
        specie=specie,
        country=country)

    # return a DictBreed object
    return breed
243
244
245
def fill_uid_names(record, submission):
    """Get or create the two Name objects described by a crbanim record

    A single record carries both an animal identifier and a sample
    identifier: each one gets its own Name, tied to the submission and
    to the submission owner.

    Returns:
        tuple: ``(animal_name, sample_name)``
    """

    # first the animal: its Name has no publication
    animal_name = get_or_create_obj(
        Name,
        name=record.animal_ID,
        submission=submission,
        owner=submission.owner)

    # bibliographic references are optional: without them the sample
    # Name is stored with no publication
    doi = record.sample_bibliographic_references
    publication = get_or_create_obj(Publication, doi=doi) if doi else None

    # then the sample
    sample_name = get_or_create_obj(
        Name,
        name=record.sample_identifier,
        submission=submission,
        owner=submission.owner,
        publication=publication)

    return animal_name, sample_name
274
275
276
def fill_uid_animal(record, animal_name, breed, submission, animals):
    """Create or update the Animal of a crbanim record

    Args:
        record: a CRBAnim record, read for ``sex`` and
            ``animal_birth_date``
        animal_name: the Name object of this animal
        breed: the DictBreed object of this animal
        submission: the submission, read for its ``owner``
        animals (dict): the animals already processed, by name. A new
            animal is added to it

    Returns:
        the Animal object, needed to relate a sample to it
    """

    # HINT: does CRBAnim models mother and father?

    key = animal_name.name

    # the same animal could be found in many records: take it from the
    # cache instead of updating it again
    if key in animals:
        logger.debug(
            "Ignoring %s: already created or updated" % (animal_name))

        return animals[key]

    # sex labels are searched ignoring case
    sex = DictSex.objects.get(label__iexact=record.sex)

    # Values go in defaults to avoid collisions when updating data
    # HINT: CRBanim has less attribute than cryoweb
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': key,
        'breed': breed,
        'sex': sex,
        'birth_date': record.animal_birth_date,
        # there's no birth_location for animal in CRBAnim
        'birth_location_accuracy': MISSING,
        'owner': submission.owner
    }

    # HINT: I could have the same animal again and again. Should I update
    # every times?
    animal = update_or_create_obj(
        Animal,
        name=animal_name,
        defaults=defaults)

    # remember this animal for the next records
    animals[key] = animal

    return animal
321
322
323
def find_storage_type(record):
    """Determine a sample storage relying on a dictionary

    Args:
        record: a CRBAnim record, read for ``sample_storage_temperature``

    Returns:
        the SAMPLE_STORAGE value for the record temperature, or None (after
        logging a warning) if the temperature is not in the mapping
    """

    # from CRBAnim temperatures to SAMPLE_STORAGE descriptions
    mapping = {
        '-196°C': 'frozen, liquid nitrogen',
        '-20°C': 'frozen, -20 degrees Celsius freezer',
        '-30°C': 'frozen, -20 degrees Celsius freezer',
        '-80°C': 'frozen, -80 degrees Celsius freezer'}

    temperature = record.sample_storage_temperature

    if temperature not in mapping:
        # use the module logger like the rest of this module, not the
        # root logger
        logger.warning(
            "Couldn't find %s in storage types mapping", temperature)

        return None

    # get ENUM conversion
    return SAMPLE_STORAGE.get_value_by_desc(mapping[temperature])
344
345
346
def fill_uid_sample(record, sample_name, animal, submission):
    """Create or update the Sample of a crbanim record

    Args:
        record: a CRBAnim record
        sample_name: the Name object of this sample
        animal: the Animal object this sample comes from
        submission: the submission, read for its ``owner``

    Returns:
        the Sample object created or updated
    """

    # NOTE(review): the values read here look like they could be None for
    # empty cells; the str methods below would fail on them -- confirm
    # that these columns are always filled
    sample_type_name = record.sample_type_name.lower()
    body_part_name = record.body_part_name.lower()

    # decision rule proposed by sylvain: the body part is the organism
    # part, unless it says nothing. In such case rely on sample type
    if body_part_name in ("unknown", "not relevant"):
        organism_part_label = sample_type_name

    else:
        organism_part_label = body_part_name

    # Organism parts need to be in lowercases
    organism_part = get_or_create_obj(DictUberon, label=organism_part_label)

    # the animal age at collection comes from the two dates
    birth_date = parse_date(record.animal_birth_date)
    collection_date = parse_date(record.sampling_date)
    age_at_collection, age_units = image_timedelta(
        collection_date, birth_date)

    # Values go in defaults to avoid collisions when updating data
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': sample_name.name,
        'collection_date': record.sampling_date,
        'protocol': record.sampling_protocol_url,
        'organism_part': organism_part,
        'animal': animal,
        # 'description': v_vessel.comment,
        'owner': submission.owner,
        'storage': find_storage_type(record),
        'availability': record.sample_availability,
        'animal_age_at_collection': age_at_collection,
        'animal_age_at_collection_units': age_units
    }

    return update_or_create_obj(
        Sample,
        name=sample_name,
        defaults=defaults)
396
397
398
def process_record(record, submission, animals, language):
    """Load a single crbanim record into UID tables

    Breed, names, animal and sample are filled, in this order. Records
    with a BioSample id are skipped after logging a warning.

    Args:
        record: a CRBAnim record
        submission: the submission this record belongs to
        animals (dict): the animals already processed, by name
        language: the language used while searching for species
    """

    # Peter mail 26/02/19 18:30: I agree that it sounds like we will
    # need to create sameAs BioSamples for the IMAGE project, and it makes
    # sense that the inject tool is able to do this.  It may be that we
    # tackle these cases after getting the main part of the inject tool
    # functioning and hold or ignore these existing BioSamples for now.
    # HINT: record with a biosample id should be ignored, for the moment
    biosample_id = record.EBI_Biosample_identifier

    if biosample_id is not None:
        logger.warning("Ignoring %s: already in biosample!" % str(record))
        return

    # the breed is required by the animal
    breed = fill_uid_breed(record, language)

    # one Name for the animal, one for the sample
    names = fill_uid_names(record, submission)
    animal_name, sample_name = names

    # the animal is required by the sample
    animal = fill_uid_animal(record, animal_name, breed, submission, animals)

    fill_uid_sample(record, sample_name, animal, submission)
420
421
422
def upload_crbanim(submission):
    """Import the CRBAnim file of a submission into UID tables

    The uploaded file is read, sex and species terms are checked against
    UID dictionaries, then every record is processed and the validation
    summaries of the submission are reset.

    Whatever goes wrong (reading the file included), the submission
    status is set to ERROR with a message describing the problem. On
    success the status is set to LOADED. In both cases the submission is
    saved and a message is sent with ``send_message``.

    Args:
        submission: the submission to import data for

    Returns:
        bool: True if data were imported, False otherwise
    """

    # debug
    logger.info("Importing from CRB-Anim file")

    # start data loading
    try:
        # this is the full path in docker container
        fullpath = submission.get_uploaded_file_path()

        # read submission data. This is done inside the try block since a
        # file which can't be read or parsed has to be reported like any
        # other import error
        reader = CRBAnimReader()
        reader.read_file(fullpath)

        # check for species and sex in a similar way as cryoweb does
        check, not_found = reader.check_sex()

        if not check:
            message = (
                "Not all Sex terms are loaded into database: "
                "check for %s in your dataset" % (not_found))

            raise CRBAnimImportError(message)

        check, not_found = reader.check_species(submission.gene_bank_country)

        if not check:
            raise CRBAnimImportError(
                "Some species are not loaded in UID database: "
                "%s" % (not_found))

        # ok get languages from submission (useful for translation)
        # HINT: no traslations implemented, at the moment
        language = submission.gene_bank_country.label

        # a dictionary in which store animal data
        animals = {}

        for record in reader.data:
            process_record(record, submission, animals, language)

        # after processing records, initialize validationsummary objects:
        # get (or create) one for animals and one for samples, and reset
        # their counts
        for summary_type in ("animal", "sample"):
            summary = get_or_create_obj(
                ValidationSummary,
                submission=submission,
                type=summary_type)

            summary.reset_all_count()

    except Exception as exc:
        # set message:
        message = "Error in importing data: %s" % (str(exc))

        # save a message in database
        submission.status = ERROR
        submission.message = message
        submission.save()

        # send async message
        send_message(submission)

        # debug
        logger.error("error in importing from crbanim: %s" % (exc))
        logger.exception(exc)

        return False

    else:
        message = "CRBAnim import completed for submission: %s" % (
            submission.id)

        submission.message = message
        submission.status = LOADED
        submission.save()

        # send async message
        send_message(
            submission,
            validation_message=construct_validation_message(submission))

    logger.info("Import from CRBAnim is complete")

    return True
514