| Total Complexity | 62 |
| Total Lines | 719 |
| Duplicated Lines | 6.54 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places. The usual remedy is to consolidate the repeated logic into a single shared helper, base class, or mixin that every call site reuses.
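In this file the flagged duplication sits in the `__check_items`/`check_species` style lookups, which the inline TODO comments in the listing already propose moving into a common mixin shared with the CRBanim importer. Below is a minimal sketch of that consolidation, assuming the Django models involved expose the same `objects.filter(label=...)` manager interface used in the listing; the `ChecksMixin` name and the importer class names in the trailing comment are illustrative, not part of the project:

```python
import logging

logger = logging.getLogger(__name__)


class ChecksMixin():
    """Hypothetical mixin collecting checks duplicated across importers"""

    def check_items(self, item_set, model, column):
        """Return (status, not_found) for items missing from the database"""

        # keep every item that has no corresponding dictionary entry
        not_found = [
            item for item in item_set
            if not model.objects.filter(label=item).exists()]

        if not_found:
            logger.warning(
                "Those %s are not present in UID database:" % (column))
            logger.warning(not_found)

        return len(not_found) == 0, not_found


# each reader would then inherit the shared behaviour, for example:
# class ExcelTemplate(ChecksMixin): ...
# class CRBAnimReader(ChecksMixin): ...
```

Each duplicated block then shrinks to a one-line call such as `self.check_items(item_set, DictSex, 'sex')`.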
Complex classes like excel.helpers often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
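In `ExcelTemplate` below, the `check_*` methods share a prefix and form such a cohesive component: they validate a workbook but never iterate its records. Here is a minimal Extract Class sketch along those lines, assuming it lives in the same module as `TEMPLATE_COLUMNS`; the `TemplateChecker` name, its constructor, and the trailing usage lines are illustrative only:

```python
class TemplateChecker():
    """Hypothetical collaborator holding only the validation logic"""

    def __init__(self, template):
        # keep a reference to the ExcelTemplate instance being validated
        self.template = template

    def check_sheets(self):
        """Test for the minimal sheets required to upload data"""

        # same logic as ExcelTemplate.check_sheets, now isolated here
        not_found = [
            sheet_name for sheet_name in TEMPLATE_COLUMNS
            if sheet_name not in self.template.sheet_names]

        return len(not_found) == 0, not_found


# ExcelTemplate could then delegate while keeping its public interface:
# reader = ExcelTemplate()
# reader.read_file(fullpath)
# check, not_found = TemplateChecker(reader).check_sheets()
```

The full listing of the analyzed excel.helpers module follows.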
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 2 10:44:54 2019

@author: Paolo Cozzi <[email protected]>
"""

import xlrd
import logging
import datetime

from collections import defaultdict, namedtuple

from common.constants import (
    ERROR, LOADED, ACCURACIES, SAMPLE_STORAGE, SAMPLE_STORAGE_PROCESSING)
from common.helpers import image_timedelta
from image_app.models import (
    DictBreed, DictCountry, DictSpecie, DictSex, DictUberon, Name, Animal,
    Sample)
from language.helpers import check_species_synonyms
from submissions.helpers import send_message
from validation.helpers import construct_validation_message
from validation.models import ValidationSummary

# Get an instance of a logger
logger = logging.getLogger(__name__)

# define the template columns required for data import
TEMPLATE_COLUMNS = {
    'breed': [
        'Supplied breed',
        # 'Mapped breed',
        # 'Mapped breed ontology library',
        # 'Mapped breed ontology accession',
        'EFABIS Breed country',
        'Species',
        # 'Species ontology library',
        # 'Species ontology accession'
    ],
    'animal': [
        'Animal id in data source',
        'Animal description',
        'Alternative animal ID',
        'Father id in data source',
        'Mother id in data source',
        'Breed',
        'Species',
        'Sex',
        'Birth date',
        'Birth location',
        'Birth location longitude',
        'Birth location latitude',
        'Birth location accuracy'
    ],
    'sample': [
        'Sample id in data source',
        'Alternative sample ID',
        'Sample description',
        'Animal id in data source',
        'Specimen collection protocol',
        'availability',
        'Collection date',
        'Collection place latitude',
        'Collection place longitude',
        'Collection place',
        'Collection place accuracy',
        'Organism part',
        'Developmental stage',
        'Physiological stage',
        'Animal age at collection',
        'Sample storage',
        'Sample storage processing',
        'Sampling to preparation interval'
    ]
}


# A class to deal with template import errors
class ExcelImportError(Exception):
    pass


class ExcelTemplate():
    """A class to read template excel files"""

    def __init__(self):
        # attributes tracking the xls file once read
        self.book = None
        self.sheet_names = []

    def read_file(self, filename):
        # read xls file and track it
        self.book = xlrd.open_workbook(filename)
        self.sheet_names = self.book.sheet_names()

    def check_sheets(self):
        """Test for the minimal sheets required to upload data"""

        not_found = []

        for sheet_name in TEMPLATE_COLUMNS.keys():
            if sheet_name not in self.sheet_names:
                not_found.append(sheet_name)
                logger.error(
                    "required sheet {name} not found in template".format(
                        name=sheet_name)
                )

        if len(not_found) > 0:
            return False, not_found

        else:
            logger.debug("This seems to be a valid Template file")
            return True, not_found

    def check_columns(self):
        """Test for the minimal columns required for template load"""

        not_found = defaultdict(list)

        for sheet_name in TEMPLATE_COLUMNS.keys():
            # get a sheet from xls workbook
            sheet = self.book.sheet_by_name(sheet_name)

            # get header from sheet
            header = sheet.row_values(0)

            for column in TEMPLATE_COLUMNS[sheet_name]:
                if column not in header:
                    not_found[sheet_name].append(column)
                    logger.error(
                        "required column {column} not found in sheet "
                        "{sheet_name}".format(
                            sheet_name=sheet_name,
                            column=column)
                    )

        if len(not_found) > 0:
            return False, not_found

        else:
            logger.debug("This seems to be a valid Template file")
            return True, not_found

    def __get_sheet_records(self, sheet_name):
        """Generic function to iterate over excel records"""

        # this is the sheet I need
        sheet = self.book.sheet_by_name(sheet_name)

        # now get columns to create a collection of record objects
        header = sheet.row_values(0)

        column_idxs = {}

        # get the column index I need
        for column in TEMPLATE_COLUMNS[sheet_name]:
            idx = header.index(column)
            column_idxs[column.lower().replace(" ", "_")] = idx

        # get new column names
        columns = column_idxs.keys()

        # create a namedtuple object
        Record = namedtuple(sheet_name.capitalize(), columns)

        # iterate over records, skipping the header row
        for i in range(1, sheet.nrows):
            # get a row from excel file
            row = sheet.row_values(i)

            # get the data I need
            data = [row[column_idxs[column]] for column in columns]

            # replace all empty occurrences in the list
            data = [None if col in [""]
                    else col for col in data]

            # fix date fields. Search for 'date' in column names; the
            # indexes refer to positions within 'data'
            date_idxs = [idx for idx, column in enumerate(columns)
                         if 'date' in column]

            # fix date objects using datetime, as described here:
            # https://stackoverflow.com/a/13962976/4385116
            for idx in date_idxs:
                if not data[idx]:
                    continue

                data[idx] = datetime.datetime(
                    *xlrd.xldate_as_tuple(
                        data[idx],
                        self.book.datemode
                    )
                )

            # get a new object
            record = Record._make(data)

            yield record

    def get_breed_records(self):
        """Iterate among breed records"""

        # this is the sheet I need
        sheet_name = "breed"
        return self.__get_sheet_records(sheet_name)

    def get_animal_records(self):
        """Iterate among animal records"""

        # this is the sheet I need
        sheet_name = "animal"
        return self.__get_sheet_records(sheet_name)

    def get_sample_records(self):
        """Iterate among sample records"""

        # this is the sheet I need
        sheet_name = "sample"
        return self.__get_sheet_records(sheet_name)

    # TODO: identical to CRBanim, move into a common mixin
    def __check_items(self, item_set, model, column):
        """General check of Template items into database"""

        # a list of not found terms and a status to see if something is
        # missing or not
        not_found = []
        result = True

        for item in item_set:
            # check for items in database
            if not model.objects.filter(label=item).exists():
                not_found.append(item)

        if len(not_found) != 0:
            result = False
            logger.warning(
                "Those %s are not present in UID database:" % (column))
            logger.warning(not_found)

        return result, not_found

    # TODO: nearly identical to CRBanim, move into a common mixin
    def check_species(self, country):
        """Check if all species are defined in UID DictSpecie"""

        column = 'species'
        item_set = set([breed.species for breed in self.get_breed_records()])

        check, not_found = self.__check_items(
            item_set, DictSpecie, column)

        if check is False:
            # try to check in dictionary table
            logger.info("Searching for %s in dictionary tables" % (not_found))

            # if this function returns True, I found all synonyms
            if check_species_synonyms(not_found, country) is True:
                logger.info("Found %s in dictionary tables" % not_found)

                # return True and an empty list for check and not found items
                return True, []

            else:
                # if I arrive here, there are species that I couldn't find
                logger.error(
                    "Couldn't find those species in dictionary tables:")
                logger.error(not_found)

        return check, not_found

    # TODO: nearly identical to CRBanim, move into a common mixin
    def check_sex(self):
        """Check that all sex records are present in database"""

        column = 'sex'
        item_set = set([animal.sex for animal in self.get_animal_records()])

        return self.__check_items(item_set, DictSex, column)

    def __check_accuracy(self, item_set):
        """A generic method to test for accuracies"""

        # a list of not found terms and a status to see if something is
        # missing or not
        not_found = []
        result = True

        for item in item_set:
            try:
                ACCURACIES.get_value_by_desc(item)

            except KeyError:
                logger.warning("accuracy level '%s' not found" % (item))
                not_found.append(item)

        if len(not_found) != 0:
            result = False

        return result, not_found

    def check_accuracies(self):
        """Check the accuracies specified in the table"""

        item_set = set([animal.birth_location_accuracy
                        for animal in self.get_animal_records()])

        # test for accuracy in animal table
        result_animal, not_found_animal = self.__check_accuracy(item_set)

        item_set = set([sample.collection_place_accuracy
                        for sample in self.get_sample_records()])

        # test for accuracy in sample table
        result_sample, not_found_sample = self.__check_accuracy(item_set)

        # merge the two results
        check = result_animal and result_sample
        not_found = set(not_found_animal + not_found_sample)

        if check is False:
            logger.error(
                "Couldn't find those accuracies in constants:")
            logger.error(not_found)

        return check, not_found


def fill_uid_breeds(submission_obj, template):
    """Fill DictBreed from an excel record"""

    logger.info("fill_uid_breeds() started")

    # ok get language from submission (useful for translation)
    language = submission_obj.gene_bank_country.label

    # iterate among excel template
    for record in template.get_breed_records():
        # TODO: move this in a helper module (image_app.helpers?)
        # get a DictSpecie object. Species are in latin names, but I can
        # find also a common name in translation tables
        try:
            specie = DictSpecie.objects.get(label=record.species)

        except DictSpecie.DoesNotExist:
            logger.info("Search %s in synonyms" % (record.species))
            # search for language synonym (if I arrived here a synonym
            # should exist)
            specie = DictSpecie.get_by_synonym(
                synonym=record.species,
                language=language)

        # get country for breeds. Ideally it will be the same as the
        # submission one; however, it is possible to store data from
        # other countries
        country, created = DictCountry.objects.get_or_create(
            label=record.efabis_breed_country)

        # I could create a country from a v_breed_specie instance. That's
        # ok, maybe I could have a lot of breeds from different countries
        # and a few organizations submitting them
        if created:
            logger.info("Created %s" % country)

        else:
            logger.debug("Found %s" % country)

        breed, created = DictBreed.objects.get_or_create(
            supplied_breed=record.supplied_breed,
            specie=specie,
            country=country)

        if created:
            logger.info("Created %s" % breed)

        else:
            logger.debug("Found %s" % breed)

    logger.info("fill_uid_breeds() completed")


def fill_uid_names(submission_obj, template):
    """Fill Names table from excel template records"""

    # debug
    logger.info("called fill_uid_names()")

    # iterate among excel template
    for record in template.get_animal_records():
        # in the same record I have the sample identifier and animal
        # identifier; a name record for animal
        animal_name, created = Name.objects.get_or_create(
            name=record.animal_id_in_data_source,
            submission=submission_obj,
            owner=submission_obj.owner)

        if created:
            logger.debug("Created animal name %s" % animal_name)

        else:
            logger.debug("Found animal name %s" % animal_name)

    # iterate among excel template
    for record in template.get_sample_records():
        # name record for sample
        sample_name, created = Name.objects.get_or_create(
            name=record.sample_id_in_data_source,
            submission=submission_obj,
            owner=submission_obj.owner)

        if created:
            logger.debug("Created sample name %s" % sample_name)

        else:
            logger.debug("Found sample name %s" % sample_name)

    logger.info("fill_uid_names() completed")


def fill_uid_animals(submission_obj, template):
    # debug
    logger.info("called fill_uid_animals()")

    # get submission language
    language = submission_obj.gene_bank_country.label

    # iterate among excel template
    for record in template.get_animal_records():
        # determine sex. Check for values
        sex = DictSex.objects.get(label__iexact=record.sex)

        # get specie
        specie = DictSpecie.objects.get(label=record.species)

        # how can I get the breed from my data?
        breeds = [breed for breed in template.get_breed_records()
                  if breed.supplied_breed == record.breed and
                  breed.species == record.species]

        # breed is supposed to be unique, from UID constraints. However,
        # I could place the same breed name for two countries. In that
        # case, I can't derive a unique breed from user data
        if len(breeds) != 1:
            raise ExcelImportError(
                "Can't determine a unique breed for '%s:%s' from user data" %
                (record.breed, record.species))

        # get a country for this breed
        country = DictCountry.objects.get(
            label=breeds[0].efabis_breed_country)

        # ok get a real DictBreed object
        breed = DictBreed.objects.get(
            supplied_breed=record.breed,
            specie=specie,
            country=country)

        logger.debug("Selected breed is %s" % (breed))

        # define names
        name, mother, father = None, None, None

        # get name for this animal and for mother and father
        logger.debug("Getting %s as my name" % (
            record.animal_id_in_data_source))

        name = Name.objects.get(
            name=record.animal_id_in_data_source,
            submission=submission_obj)

        if record.father_id_in_data_source:
            logger.debug("Getting %s as father" % (
                record.father_id_in_data_source))

            father = Name.objects.get(
                name=record.father_id_in_data_source,
                submission=submission_obj)

        if record.mother_id_in_data_source:
            logger.debug("Getting %s as mother" % (
                record.mother_id_in_data_source))

            mother = Name.objects.get(
                name=record.mother_id_in_data_source,
                submission=submission_obj)

        # now get accuracy
        accuracy = ACCURACIES.get_value_by_desc(
            record.birth_location_accuracy)

        # create a new object. Using defaults to avoid collisions when
        # updating data
        defaults = {
            'alternative_id': record.alternative_animal_id,
            'description': record.animal_description,
            'breed': breed,
            'sex': sex,
            'father': father,
            'mother': mother,
            'birth_date': record.birth_date,
            'birth_location': record.birth_location,
            'birth_location_latitude': record.birth_location_latitude,
            'birth_location_longitude': record.birth_location_longitude,
            'birth_location_accuracy': accuracy,
            'owner': submission_obj.owner
        }

        animal, created = Animal.objects.update_or_create(
            name=name,
            defaults=defaults)

        if created:
            logger.debug("Created %s" % animal)

        else:
            logger.debug("Updating %s" % animal)

    # create a validation summary object and set all_count
    validation_summary, created = ValidationSummary.objects.get_or_create(
        submission=submission_obj, type="animal")

    if created:
        logger.debug(
            "ValidationSummary animal created for submission %s" %
            submission_obj)

    # reset counts
    validation_summary.reset_all_count()

    # debug
    logger.info("fill_uid_animals() completed")


def fill_uid_samples(submission_obj, template):
    # debug
    logger.info("called fill_uid_samples()")

    # iterate among excel template
    for record in template.get_sample_records():
        # get name for this sample
        name = Name.objects.get(
            name=record.sample_id_in_data_source,
            submission=submission_obj,
            owner=submission_obj.owner)

        # get animal by reading record
        animal = Animal.objects.get(
            name__name=record.animal_id_in_data_source,
            name__submission=submission_obj)

        # get an organism part. Organism parts need to be in lowercase
        organism_part, created = DictUberon.objects.get_or_create(
            label=record.organism_part
        )

        if created:
            logger.info("Created %s" % organism_part)

        else:
            logger.debug("Found %s" % organism_part)

        # TODO: get developmental_stage and physiological_stage terms

        # derive animal age at collection. This function deals with NULL
        # values
        animal_age_at_collection, time_units = image_timedelta(
            record.collection_date, animal.birth_date)

        # now get accuracy
        accuracy = ACCURACIES.get_value_by_desc(
            record.collection_place_accuracy)

        # now get storage and storage processing
        # TODO: check those values in excel columns
        storage = SAMPLE_STORAGE.get_value_by_desc(
            record.sample_storage)

        storage_processing = SAMPLE_STORAGE_PROCESSING.get_value_by_desc(
            record.sample_storage_processing)

        # create a new object. Using defaults to avoid collisions when
        # updating data
        defaults = {
            'alternative_id': record.alternative_sample_id,
            'description': record.sample_description,
            'animal': animal,
            'protocol': record.specimen_collection_protocol,
            'collection_date': record.collection_date,
            'collection_place_latitude': record.collection_place_latitude,
            'collection_place_longitude': record.collection_place_longitude,
            'collection_place': record.collection_place,
            'collection_place_accuracy': accuracy,
            'organism_part': organism_part,
            # 'developmental_stage': None,
            # 'physiological_stage': None,
            'animal_age_at_collection': animal_age_at_collection,
            'animal_age_at_collection_units': time_units,
            'availability': record.availability,
            'storage': storage,
            'storage_processing': storage_processing,
            # TODO: this is a time unit column
            'preparation_interval': record.sampling_to_preparation_interval,
            'owner': submission_obj.owner,
        }

        sample, created = Sample.objects.update_or_create(
            name=name,
            defaults=defaults)

        if created:
            logger.debug("Created %s" % sample)

        else:
            logger.debug("Updating %s" % sample)

    # create a validation summary object and set all_count
    validation_summary, created = ValidationSummary.objects.get_or_create(
        submission=submission_obj, type="sample")

    if created:
        logger.debug(
            "ValidationSummary sample created for submission %s" %
            submission_obj)

    # reset counts
    validation_summary.reset_all_count()

    # debug
    logger.info("fill_uid_samples() completed")


def upload_template(submission_obj):
    # debug
    logger.info("Importing from Excel template file")

    # this is the full path in docker container
    fullpath = submission_obj.get_uploaded_file_path()

    # read submission data
    reader = ExcelTemplate()
    reader.read_file(fullpath)

    # start data loading
    try:
        # check for species and sex in a similar way as cryoweb does
        # TODO: identical to CRBanim. Move to a mixin
        check, not_found = reader.check_sex()

        if not check:
            message = (
                "Not all Sex terms are loaded into database: "
                "check for %s in your dataset" % (not_found))

            raise ExcelImportError(message)

        check, not_found = reader.check_species(
            submission_obj.gene_bank_country)

        if not check:
            raise ExcelImportError(
                "Some species are not loaded in UID database: "
                "%s" % (not_found))

        check, not_found = reader.check_accuracies()

        if not check:
            message = (
                "Not all accuracy levels are defined in database: "
                "check for %s in your dataset" % (not_found))

            raise ExcelImportError(message)

        # BREEDS
        fill_uid_breeds(submission_obj, reader)

        # NAME
        fill_uid_names(submission_obj, reader)

        # ANIMALS
        fill_uid_animals(submission_obj, reader)

        # SAMPLES
        fill_uid_samples(submission_obj, reader)

    except Exception as exc:
        # set message:
        message = "Error in importing data: %s" % (str(exc))

        # save a message in database
        submission_obj.status = ERROR
        submission_obj.message = message
        submission_obj.save()

        # send async message
        send_message(submission_obj)

        # debug
        logger.error("Error in importing from Template: %s" % (exc))
        logger.exception(exc)

        return False

    else:
        message = "Template import completed for submission: %s" % (
            submission_obj.id)

        submission_obj.message = message
        submission_obj.status = LOADED
        submission_obj.save()

        # send async message
        send_message(
            submission_obj,
            validation_message=construct_validation_message(submission_obj))

    logger.info("Import from Template is complete")

    return True
```
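For reference, a short usage sketch of the reader on its own, outside the `upload_template` pipeline; the file name is a placeholder and the import path assumes the module lives at `excel/helpers.py`:

```python
from excel.helpers import ExcelTemplate

reader = ExcelTemplate()
reader.read_file("submission_template.xls")  # placeholder file name

# structural checks before touching the database
valid_sheets, missing_sheets = reader.check_sheets()
valid_columns, missing_columns = reader.check_columns()

# records come back as namedtuples keyed by normalized column names
if valid_sheets and valid_columns:
    for animal in reader.get_animal_records():
        print(animal.animal_id_in_data_source, animal.birth_date)
```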