crbanim.helpers.CRBAnimReader.get_dialect() - Code Metrics - Inspection of "last requirements of Validation Summary Views issu..." - cnr-ibba/IMAGE-InjectTool - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#35)

by Paolo

created 2019-06-20 15:45 UTC

crbanim.helpers.CRBAnimReader.get_dialect() A

↳ Parent: crbanim.helpers

Complexity

Conditions

Size

Total Lines	5
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	3
dl	0
loc	5
rs	10
c	0
b	0
f	0
cc	1
nop	2

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 21 15:37:16 2019

@author: Paolo Cozzi <[email protected]>
"""

import io
import csv
import logging
import pycountry
import asyncio

from collections import defaultdict, namedtuple

from django.utils.dateparse import parse_date

from common.constants import LOADED, ERROR, MISSING, STATUSES
from common.helpers import image_timedelta, send_message_to_websocket
from image_app.models import (
    DictSpecie, DictSex, DictCountry, DictBreed, Name, Animal, Sample,
    DictUberon, Publication)
from language.helpers import check_species_synonyms
from validation.helpers import construct_validation_message
from validation.models import ValidationSummary

# Get an instance of a logger
logger = logging.getLogger(__name__)


# A class to deal with CRBAnim import errors
class CRBAnimImportError(Exception):
    """Raised when CRBAnim data can't be imported into the UID database."""


def send_message(submission_obj, send_validation=False):
    """
    Push the current submission status and message to the websocket
    bound to this submission.

    Args:
        submission_obj (image_app.models.Submission): an UID submission
        object
        send_validation (bool): when True, the payload also carries the
        output of construct_validation_message for this submission
    """

    # the payload always carries the readable status and the last message
    payload = {
        'message': STATUSES.get_value_display(submission_obj.status),
        'notification_message': submission_obj.message,
    }

    # the validation summary is attached only on request
    if send_validation:
        payload['validation_message'] = construct_validation_message(
            submission_obj)

    # the helper is a coroutine: run it to completion on the event loop,
    # since this function is called from synchronous code
    loop = asyncio.get_event_loop()
    coroutine = send_message_to_websocket(payload, submission_obj.pk)
    loop.run_until_complete(coroutine)


class CRBAnimReader():
    """Read a CRBAnim CSV file and check its terms against the UID database.

    After :meth:`read_file` the instance exposes:

    * ``header``: list of column names (first CSV row)
    * ``data``: list of ``Data`` namedtuples, one for each CSV record
    * ``items``: mapping column name -> set of distinct values
    * ``dialect``: the CSV dialect guessed by ``csv.Sniffer``
    * ``filename``: the path of the file that was read
    """

    # columns that need to be in the header in order to accept a file
    mandatory_columns = [
            'sex',
            'species_latin_name',
            'country_of_origin',
            'breed_name',
            'animal_ID',
            'sample_bibliographic_references',
            'sample_identifier',
            'animal_birth_date',
            'sample_storage_temperature',
            'sample_type_name',
            'body_part_name',
            'sampling_date',
            'sampling_protocol_url',
            'sample_availability',
            'EBI_Biosample_identifier',
        ]

    def __init__(self):
        self.data = None
        self.header = None
        self.dialect = None
        self.items = None
        self.filename = None

    @classmethod
    def get_dialect(cls, chunk):
        """Determine dialect of a CSV from a chunk"""

        return csv.Sniffer().sniff(chunk)

    @classmethod
    def is_valid(cls, chunk):
        """Try to determine if CRBanim has at least the required columns
        or not

        Args:
            chunk (str): the first characters of a CRBAnim file, header
            included

        Returns:
            tuple: (True, []) when all mandatory columns are in the header,
            (False, list of missing columns) otherwise
        """

        dialect = cls.get_dialect(chunk)

        # read the header from the string chunk
        reader = csv.reader(io.StringIO(chunk), dialect)
        header = next(reader)

        not_found = [
            column for column in cls.mandatory_columns
            if column not in header]

        if not not_found:
            logger.debug("This seems to be a valid CRBanim file")
            return True, []

        logger.error(
            "Couldn't find mandatory CRBanim columns %s", not_found)
        return False, not_found

    def read_file(self, filename):
        """Read a CRBAnim file and store its content in instance attributes

        Empty cells and ``\\N`` cells are stored as None; an 'unknown' sex
        is replaced with 'record of unknown sex'. Blank lines are skipped.

        Args:
            filename (str): path of the CSV file to read
        """

        with open(filename, newline='') as handle:
            # initialize data
            self.filename = filename
            self.data = []

            # guess the dialect from the beginning of the file
            chunk = handle.read(2048)
            self.dialect = self.get_dialect(chunk)

            # restart file from the beginning
            handle.seek(0)

            # read csv file
            reader = csv.reader(handle, self.dialect)
            self.header = next(reader)

            # find sex index column
            sex_idx = self.header.index('sex')

            # create a namedtuple object
            Data = namedtuple("Data", self.header)

            # add records to data
            for record in reader:
                # csv.reader returns an empty list for a blank line:
                # there's nothing to import from it
                if not record:
                    continue

                # replace all "\\N" and empty occurences with None
                record = [None if col in ("\\N", "") else col
                          for col in record]

                # 'unknown' sex should be replaced with 'record of unknown
                # sex'. A missing sex is None at this point and has no
                # lower() method, so it needs to be left untouched
                sex = record[sex_idx]

                if sex is not None and sex.lower() == 'unknown':
                    logger.debug(
                        "Changing '%s' with '%s'",
                        sex, 'record of unknown sex')
                    record[sex_idx] = 'record of unknown sex'

                self.data.append(Data._make(record))

        self.items = self.eval_columns()

    def eval_columns(self):
        """define a set of distinct values for each column of header"""

        items = defaultdict(list)

        # columns and record fields share the same positions
        for idx, column in enumerate(self.header):
            items[column] = {line[idx] for line in self.data}

        return items

    def print_line(self, num):
        """print a record with its column names"""

        for column, value in zip(self.header, self.data[num]):
            logger.debug("%s: %s", column, value)

    def filter_by_column_values(self, column, values, ignorecase=False):
        """Yield the records having one of values in column

        Args:
            column (str): the name of the column to inspect
            values (iterable): the accepted values
            ignorecase (bool): compare values in a case insensitive way

        Yields:
            Data: each record matching the filter. Discarded records are
            logged at debug level
        """

        if ignorecase is True:
            # lower values
            values = [value.lower() for value in values]

        for line in self.data:
            value = getattr(line, column)

            # a missing value (None) can't be lowered: it's compared as it
            # is, and so it's filtered out in case insensitive mode
            if ignorecase is True and value is not None:
                value = value.lower()

            if value in values:
                yield line

            else:
                logger.debug("Filtering: %s", line)

    def __check_items(self, item_set, model, column):
        """General check of CRBanim items into database

        Args:
            item_set (iterable): the terms to search for
            model: the dictionary model in which terms are searched by label
            column (str): the column name, used in log messages

        Returns:
            tuple: (True, []) if all terms were found, (False, list of
            missing terms) otherwise
        """

        # terms that have no record with such label
        not_found = [
            item for item in item_set
            if not model.objects.filter(label=item).exists()]

        if not_found:
            logger.warning(
                "Those %s are not present in UID database:", column)
            logger.warning(not_found)

        return not not_found, not_found

    # a function to detect if crbanim species are in UID database or not
    def check_species(self, country):
        """Check if all species are defined in UID DictSpecies

        Args:
            country: the country passed to check_species_synonyms when
            searching for synonyms

        Returns:
            tuple: (True, []) if all species were found (as labels or as
            synonyms), (False, list of missing species) otherwise
        """

        # CRBAnim usually have species in the form required for UID
        # However sometimes there could be a common name, not a DictSpecie one
        column = 'species_latin_name'

        check, not_found = self.__check_items(
            self.items[column], DictSpecie, column)

        # nothing is missing: there's no error to report
        if check:
            return True, []

        # try to check in dictionary table
        logger.info("Searching for %s in dictionary tables", not_found)

        # if this function return True, I found all synonyms
        if check_species_synonyms(not_found, country) is True:
            logger.info("Found %s in dictionary tables", not_found)

            # return True and an empty list for check and not found items
            return True, []

        # if I arrive here, there are species that I couldn't find
        logger.error("Couldnt' find those species in dictionary tables:")
        logger.error(not_found)

        return False, not_found

    # check that dict sex table contains data
    def check_sex(self):
        """check that dict sex table contains data

        Returns:
            tuple: (True, []) if all sex terms were found, (False, list of
            missing terms) otherwise
        """

        # sex terms are searched in lowercase. A missing sex (None) is
        # passed as it is, in order to be reported as a missing term
        column = 'sex'
        item_set = [
            item.lower() if item is not None else None
            for item in self.items[column]]

        return self.__check_items(item_set, DictSex, column)


def fill_uid_breed(record, language):
    """Fill DictBreed from a crbanim record

    Args:
        record: a CRBAnim record, with species_latin_name,
        country_of_origin and breed_name attributes
        language: the language used to search for a species synonym

    Returns:
        DictBreed: the breed found or created for this record

    Raises:
        CRBAnimImportError: if country_of_origin can't be resolved to a
        country using pycountry
    """

    # get a DictSpecie object. Species are in latin names, but I can
    # find also a common name in translation tables
    try:
        specie = DictSpecie.objects.get(label=record.species_latin_name)

    except DictSpecie.DoesNotExist:
        logger.info("Search %s in synonyms" % (record.species_latin_name))
        # search for language synonym (if I arrived here a synonym should
        # exists)
        specie = DictSpecie.get_by_synonym(
            synonym=record.species_latin_name,
            language=language)

    # get country using pycountry. An unknown (or missing) code is handled
    # both as an exception and as a None result, since this could change
    # with the installed pycountry version
    try:
        pycountry_obj = pycountry.countries.get(
            alpha_2=record.country_of_origin)

    except LookupError:
        pycountry_obj = None

    if pycountry_obj is None:
        # fail with a message that is meaningful for the user
        raise CRBAnimImportError(
            "Unknown country of origin '%s' for breed '%s'" % (
                record.country_of_origin, record.breed_name))

    # get country for breeds. Ideally will be the same of submission,
    # however, it could be possible to store data from other contries
    country, created = DictCountry.objects.get_or_create(
        label=pycountry_obj.name)

    # I could create a country from a v_breed_specie instance. That's
    # ok, maybe I could have a lot of breed from different countries and
    # a few organizations submitting them
    if created:
        logger.info("Created %s" % country)

    else:
        logger.debug("Found %s" % country)

    breed, created = DictBreed.objects.get_or_create(
        supplied_breed=record.breed_name,
        specie=specie,
        country=country)

    if created:
        logger.info("Created %s" % breed)

    else:
        logger.debug("Found %s" % breed)

    # return a DictBreed object
    return breed


def fill_uid_names(record, submission):
    """Get or create the Name objects for the animal and the sample of a
    crbanim record

    Args:
        record: a CRBAnim record, with animal_ID, sample_identifier and
        sample_bibliographic_references attributes
        submission: the submission the names belong to

    Returns:
        tuple: the (animal, sample) Name instances
    """

    owner = submission.owner

    # the same record tracks both animal and sample identifiers. Start
    # with the animal one
    animal_name, created = Name.objects.get_or_create(
        name=record.animal_ID,
        submission=submission,
        owner=owner)

    logger.debug(
        ("Created animal name %s" if created else "Found animal name %s")
        % animal_name)

    # a publication is related to the sample only if a reference is
    # provided
    publication = None
    doi = record.sample_bibliographic_references

    if doi:
        publication, created = Publication.objects.get_or_create(doi=doi)

        if created:
            logger.debug("Created publication %s" % publication)

    # then get or create the sample name
    sample_name, created = Name.objects.get_or_create(
        name=record.sample_identifier,
        submission=submission,
        owner=owner,
        publication=publication)

    logger.debug(
        ("Created sample name %s" if created else "Found sample name %s")
        % sample_name)

    return animal_name, sample_name


def fill_uid_animal(record, animal_name, breed, submission, animals):
    """Create or update an Animal from a crbanim record

    Args:
        record: a CRBAnim record, with sex and animal_birth_date attributes
        animal_name: the Name instance of this animal
        breed: the DictBreed instance of this animal
        submission: the submission which owner will own the animal
        animals (dict): animals already processed, by name. It's updated
        in place with the new animal

    Returns:
        Animal: the animal found in animals or written into database
    """

    # HINT: does CRBAnim models mother and father?

    key = animal_name.name

    # an animal that was already processed is returned as it is
    if key in animals:
        logger.debug(
            "Ignoring %s: already created or updated" % (animal_name))

        return animals[key]

    # sex terms are matched ignoring case
    sex = DictSex.objects.get(label__iexact=record.sex)

    # values are passed as defaults to avoid collisions when updating data
    # HINT: CRBanim has less attribute than cryoweb
    defaults = {
        # HINT: is a duplication of name. Can this be non-mandatory?
        'alternative_id': key,
        'breed': breed,
        'sex': sex,
        'birth_date': record.animal_birth_date,
        # there's no birth_location for animal in CRBAnim
        'birth_location_accuracy': MISSING,
        'owner': submission.owner
    }

    # HINT: I could have the same animal again and again. Should I update
    # every times?
    animal, created = Animal.objects.update_or_create(
        name=animal_name,
        defaults=defaults)

    logger.debug(
        ("Created animal %s" if created else "Updating animal %s") % animal)

    # track this animal, in order to relate samples to it
    animals[key] = animal

    return animal


def find_storage_type(record):
    """Determine a sample storage relying on a dictionary

    Args:
        record: a CRBAnim record, with a sample_storage_temperature
        attribute

    Returns:
        str: the storage label for the record temperature, or None (after
        logging a warning) if the temperature has no mapping
    """

    # NOTE(review): '-30°C' shares the label of '-20°C'; presumably there's
    # no closer storage term available - confirm
    mapping = {
        '-196°C': 'frozen, liquid nitrogen',
        '-20°C': 'frozen, -20 degrees Celsius freezer',
        '-30°C': 'frozen, -20 degrees Celsius freezer',
        '-80°C': 'frozen, -80 degrees Celsius freezer'}

    temperature = record.sample_storage_temperature

    if temperature in mapping:
        return mapping[temperature]

    # log with the module logger (not with the root one), as the rest of
    # this module does
    logger.warning(
        "Couldn't find %s in storage types mapping", temperature)

    return None


def fill_uid_sample(record, sample_name, animal, submission):
    """Create or update a Sample from a crbanim record

    Args:
        record: a CRBAnim record
        sample_name: the Name instance of this sample
        animal: the Animal instance from which the sample was collected
        submission: the submission which owner will own the sample

    Returns:
        Sample: the sample written into database
    """

    # organism parts need to be in lowercases
    sample_type = record.sample_type_name.lower()
    body_part = record.body_part_name.lower()

    # sylvain has proposed to apply the following decision rule: body part
    # is used unless it has no information, otherwise use the sample type
    if body_part in ("unknown", "not relevant"):
        label = sample_type

    else:
        label = body_part

    organism_part, created = DictUberon.objects.get_or_create(label=label)

    if created:
        logger.info("Created uberon %s" % organism_part)

    else:
        logger.debug("Found uberon %s" % organism_part)

    # animal age at collection comes from sampling and birth dates
    birth_date = parse_date(record.animal_birth_date)
    collection_date = parse_date(record.sampling_date)
    age, units = image_timedelta(collection_date, birth_date)

    # values are passed as defaults to avoid collisions when updating data
    sample, created = Sample.objects.update_or_create(
        name=sample_name,
        defaults={
            # HINT: is a duplication of name. Can this be non-mandatory?
            'alternative_id': sample_name.name,
            'collection_date': record.sampling_date,
            'protocol': record.sampling_protocol_url,
            'organism_part': organism_part,
            'animal': animal,
            'owner': submission.owner,
            'storage': find_storage_type(record),
            'availability': record.sample_availability,
            'animal_age_at_collection': age,
            'animal_age_at_collection_units': units
        })

    if created:
        logger.debug("Created sample %s" % sample)

    else:
        logger.debug("Updating sample %s" % sample)

    return sample


def process_record(record, submission, animals, language):
    """Load a single crbanim record (breed, names, animal and sample)

    Args:
        record: a CRBAnim record
        submission: the submission the record belongs to
        animals (dict): animals already processed, by name
        language: the language used to search for species synonyms
    """

    # Peter mail 26/02/19 18:30: I agree that it sounds like we will
    # need to create sameAs BioSamples for the IMAGE project, and it makes
    # sense that the inject tool is able to do this.  It may be that we
    # tackle these cases after getting the main part of the inject tool
    # functioning and hold or ignore these existing BioSamples for now.
    # HINT: record with a biosample id should be ignored, for the moment
    already_in_biosample = record.EBI_Biosample_identifier is not None

    if already_in_biosample:
        logger.warning("Ignoring %s: already in biosample!" % str(record))
        return

    # breed comes first, since animal refers to it
    breed = fill_uid_breed(record, language)

    # then names, for both animal and sample
    animal_name, sample_name = fill_uid_names(record, submission)

    # the animal is required in order to relate the sample
    animal = fill_uid_animal(record, animal_name, breed, submission, animals)
    fill_uid_sample(record, sample_name, animal, submission)


def upload_crbanim(submission):
    """Import the CRBAnim file of a submission into the UID database

    Submission status and message are updated (and sent with send_message)
    both when import succeeds and when it fails.

    Args:
        submission: the submission which uploaded file has to be imported

    Returns:
        bool: True if data were loaded, False if an exception was raised
        while checking or loading data
    """

    logger.info("Importing from CRB-Anim file")

    # this is the full path in docker container
    crbanim_path = submission.get_uploaded_file_path()

    # file is read outside the try block: an exception raised here is
    # not handled by this function
    crbanim = CRBAnimReader()
    crbanim.read_file(crbanim_path)

    try:
        # check for species and sex in a similar way as cryoweb does
        sex_ok, missing = crbanim.check_sex()

        if not sex_ok:
            raise CRBAnimImportError(
                "Not all Sex terms are loaded into database: "
                "check for %s in your dataset" % (missing))

        species_ok, missing = crbanim.check_species(
            submission.gene_bank_country)

        if not species_ok:
            raise CRBAnimImportError(
                "Some species are not loaded in UID database: "
                "%s" % (missing))

        # ok get languages from submission (useful for translation)
        # HINT: no traslations implemented, at the moment
        language = submission.gene_bank_country.label

        # animals already processed, by name
        animals = {}

        for record in crbanim.data:
            process_record(record, submission, animals, language)

        # after processing records, get (or create) a validation summary
        # for animals and for samples, then reset their counts
        summaries = (
            ("animal",
             "ValidationSummary animal created for submission %s"),
            ("sample",
             "ValidationSummary sample created for submission %s"),
        )

        for summary_type, created_message in summaries:
            summary, created = ValidationSummary.objects.get_or_create(
                submission=submission, type=summary_type)

            if created:
                logger.debug(created_message % submission)

            summary.reset_all_count()

    except Exception as exc:
        # track the error in database
        submission.status = ERROR
        submission.message = "Error in importing data: %s" % (str(exc))
        submission.save()

        # send async message
        send_message(submission)

        logger.error("error in importing from crbanim: %s" % (exc))
        logger.exception(exc)

        return False

    # no exception raised: data were loaded
    submission.message = "CRBAnim import completed for submission: %s" % (
        submission.id)
    submission.status = LOADED
    submission.save()

    # send async message, with validation summary
    send_message(submission, send_validation=True)

    logger.info("Import from CRBAnim is complete")

    return True


1		#!/usr/bin/env python3
2		# -- coding: utf-8 --
3		"""
4		Created on Thu Feb 21 15:37:16 2019
5
6		@author: Paolo Cozzi <[email protected]>
7		"""
8
9		import io
10		import csv
11		import logging
12		import pycountry
13		import asyncio
14
15		from collections import defaultdict, namedtuple
16
17		from django.utils.dateparse import parse_date
18
19		from common.constants import LOADED, ERROR, MISSING, STATUSES
20		from common.helpers import image_timedelta, send_message_to_websocket
21		from image_app.models import (
22		DictSpecie, DictSex, DictCountry, DictBreed, Name, Animal, Sample,
23		DictUberon, Publication)
24		from language.helpers import check_species_synonyms
25		from validation.helpers import construct_validation_message
26		from validation.models import ValidationSummary
27
28		# Get an instance of a logger
29		logger = logging.getLogger(__name__)
30
31
32		# A class to deal with cryoweb import errors
33		class CRBAnimImportError(Exception):
34		pass
35
36
37	View Code Duplication	def send_message(submission_obj, send_validation=False):
		0 ignored issues – show Duplication introduced 2019-06-20 15:48 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
38		"""
39		Update submission.status and submission message using django
40		channels
41
42		Args:
43		submission_obj (image_app.models.Submission): an UID submission
44		object
45		send_validation (bool): send validation message or not
46		"""
47
48		# define a message to send
49		message = {
50		'message': STATUSES.get_value_display(submission_obj.status),
51		'notification_message': submission_obj.message,
52		}
53
54		# if validation message is needed, add to the final message
55		if send_validation:
56		message['validation_message'] = construct_validation_message(
57		submission_obj)
58
59		# now send the message to its submission
60		asyncio.get_event_loop().run_until_complete(
61		send_message_to_websocket(
62		message,
63		submission_obj.pk
64		)
65		)
66
67
68		class CRBAnimReader():
69		mandatory_columns = [
70		'sex',
71		'species_latin_name',
72		'country_of_origin',
73		'breed_name',
74		'animal_ID',
75		'sample_bibliographic_references',
76		'sample_identifier',
77		'animal_birth_date',
78		'sample_storage_temperature',
79		'sample_type_name',
80		'body_part_name',
81		'sampling_date',
82		'sampling_protocol_url',
83		'sample_availability',
84		'EBI_Biosample_identifier',
85		]
86
87		def __init__(self):
88		self.data = None
89		self.header = None
90		self.dialect = None
91		self.items = None
92		self.filename = None
93
94		@classmethod
95		def get_dialect(cls, chunk):
96		"""Determine dialect of a CSV from a chunk"""
97
98		return csv.Sniffer().sniff(chunk)
99
100		@classmethod
101		def is_valid(cls, chunk):
102		"""Try to determine if CRBanim has at least the required columns
103		or not"""
104
105		dialect = cls.get_dialect(chunk)
106
107		# get a handle from a string
108		handle = io.StringIO(chunk)
109
110		# read chunk
111		reader = csv.reader(handle, dialect)
112		header = next(reader)
113
114		not_found = []
115
116		for column in cls.mandatory_columns:
117		if column not in header:
118		not_found.append(column)
119
120		if len(not_found) == 0:
121		logger.debug("This seems to be a valid CRBanim file")
122		return True, []
123
124		else:
125		logger.error("Couldn't not find mandatory CRBanim columns %s" % (
126		not_found))
127		return False, not_found
128
129		def read_file(self, filename):
130		"""Read crb anim files and set tit to class attribute"""
131
132		with open(filename, newline='') as handle:
133		# initialize data
134		self.filename = filename
135		self.data = []
136
137		# get dialect
138		chunk = handle.read(2048)
139		self.dialect = self.get_dialect(chunk)
140
141		# restart filename from the beginning
142		handle.seek(0)
143
144		# read csv file
145		reader = csv.reader(handle, self.dialect)
146		self.header = next(reader)
147
148		# find sex index column
149		sex_idx = self.header.index('sex')
150
151		# create a namedtuple object
152		Data = namedtuple("Data", self.header)
153
154		# add records to data
155		for record in reader:
156		# replace all "\\N" occurences in a list
157		record = [None if col in ["\\N", ""]
158		else col for col in record]
159
160		# 'unknown' sex should be replaced with 'record of unknown sex'
161		if record[sex_idx].lower() == 'unknown':
162		logger.debug(
163		"Changing '%s' with '%s'" % (
164		record[sex_idx], 'record of unknown sex'))
165		record[sex_idx] = 'record of unknown sex'
166
167		record = Data._make(record)
168		self.data.append(record)
169
170		self.items = self.eval_columns()
171
172		def eval_columns(self):
173		"""define a set from column data"""
174
175		# target_columns = ['sex', 'species_latin_name', 'breed_name']
176		target_columns = self.header
177
178		items = defaultdict(list)
179
180		for line in self.data:
181		for column in target_columns:
182		idx = self.header.index(column)
183		items[column].append(line[idx])
184
185		# now get a set of object
186		for column in target_columns:
187		items[column] = set(items[column])
188
189		return items
190
191		def print_line(self, num):
192		"""print a record with its column names"""
193
194		for i, column in enumerate(self.header):
195		logger.debug("%s: %s" % (column, self.data[num][i]))
196
197		def filter_by_column_values(self, column, values, ignorecase=False):
198		if ignorecase is True:
199		# lower values
200		values = [value.lower() for value in values]
201
202		for line in self.data:
203		# search for case insensitive value (lower attrib in lower values)
204		if ignorecase is True:
205		if getattr(line, column).lower() in values:
206		yield line
207
208		else:
209		logger.debug("Filtering: %s" % (str(line)))
210
211		else:
212		if getattr(line, column) in values:
213		yield line
214
215		else:
216		logger.debug("Filtering: %s" % (str(line)))
217
218		# ignore case or not
219
220		# cicle for line
221
222		def __check_items(self, item_set, model, column):
223		"""General check of CRBanim items into database"""
224
225		# a list of not found terms and a status to see if something is missing
226		# or not
227		not_found = []
228		result = True
229
230		for item in item_set:
231		# check for species in database
232		if not model.objects.filter(label=item).exists():
233		not_found.append(item)
234
235		if len(not_found) != 0:
236		result = False
237		logger.warning(
238		"Those %s are not present in UID database:" % (column))
239		logger.warning(not_found)
240
241		return result, not_found
242
243		# a function to detect if crbanim species are in UID database or not
244		def check_species(self, country):
245		"""Check if all species are defined in UID DictSpecies"""
246
247		# CRBAnim usually have species in the form required for UID
248		# However sometimes there could be a common name, not a DictSpecie one
249		column = 'species_latin_name'
250
251		check, not_found = self.__check_items(
252		self.items[column], DictSpecie, column)
253
254		if check is False:
255		# try to check in dictionary table
256		logger.info("Searching for %s in dictionary tables" % (not_found))
257
258		# if this function return True, I found all synonyms
259		if check_species_synonyms(not_found, country) is True:
260		logger.info("Found %s in dictionary tables" % not_found)
261
262		# return True and an empty list for check and not found items
263		return True, []
264
265		# if I arrive here, there are species that I couldn't find
266		logger.error("Couldnt' find those species in dictionary tables:")
267		logger.error(not_found)
268
269		return check, not_found
270
271		# check that dict sex table contains data
272		def check_sex(self):
273		"""check that dict sex table contains data"""
274
275		# item.sex are in uppercase
276		column = 'sex'
277		item_set = [item.lower() for item in self.items[column]]
278
279		return self.__check_items(item_set, DictSex, column)
280
281
282		def fill_uid_breed(record, language):
283		"""Fill DioctBreed from a crbanim record"""
284
285		# get a DictSpecie object. Species are in latin names, but I can
286		# find also a common name in translation tables
287		try:
288		specie = DictSpecie.objects.get(label=record.species_latin_name)
289
290		except DictSpecie.DoesNotExist:
291		logger.info("Search %s in synonyms" % (record.species_latin_name))
292		# search for language synonym (if I arrived here a synonym should
293		# exists)
294		specie = DictSpecie.get_by_synonym(
295		synonym=record.species_latin_name,
296		language=language)
297
298		# get country name using pycountries
299		country_name = pycountry.countries.get(
300		alpha_2=record.country_of_origin).name
301
302		# get country for breeds. Ideally will be the same of submission,
303		# however, it could be possible to store data from other contries
304		country, created = DictCountry.objects.get_or_create(
305		label=country_name)
306
307		# I could create a country from a v_breed_specie instance. That's
308		# ok, maybe I could have a lot of breed from different countries and
309		# a few organizations submitting them
310		if created:
311		logger.info("Created %s" % country)
312
313		else:
314		logger.debug("Found %s" % country)
315
316		breed, created = DictBreed.objects.get_or_create(
317		supplied_breed=record.breed_name,
318		specie=specie,
319		country=country)
320
321		if created:
322		logger.info("Created %s" % breed)
323
324		else:
325		logger.debug("Found %s" % breed)
326
327		# return a DictBreed object
328		return breed
329
330
331		def fill_uid_names(record, submission):
332		"""fill Names table from crbanim record"""
333
334		# in the same record I have the sample identifier and animal identifier
335		# a name record for animal
336		animal_name, created = Name.objects.get_or_create(
337		name=record.animal_ID,
338		submission=submission,
339		owner=submission.owner)
340
341		if created:
342		logger.debug("Created animal name %s" % animal_name)
343
344		else:
345		logger.debug("Found animal name %s" % animal_name)
346
347		# get a publication (if present)
348		publication = None
349
350		# HINT: mind this mispelling
351		if record.sample_bibliographic_references:
352		publication, created = Publication.objects.get_or_create(
353		doi=record.sample_bibliographic_references)
354
355		if created:
356		logger.debug("Created publication %s" % publication)
357
358		# name record for sample
359		sample_name, created = Name.objects.get_or_create(
360		name=record.sample_identifier,
361		submission=submission,
362		owner=submission.owner,
363		publication=publication)
364
365		if created:
366		logger.debug("Created sample name %s" % sample_name)
367
368		else:
369		logger.debug("Found sample name %s" % sample_name)
370
371		# returning 2 Name instances
372		return animal_name, sample_name
373
374
def fill_uid_animal(record, animal_name, breed, submission, animals):
    """Create or update the UID animal related to a CRBAnim record.

    Args:
        record: a CRBAnim record; ``sex`` and ``animal_birth_date`` are read
        animal_name: the name object of the animal; its ``name`` attribute
            is the key used to track the animal in ``animals``
        breed: the breed to assign to the animal
        submission: the submission being imported; its ``owner`` becomes
            the animal owner
        animals (dict): animals already processed during this import,
            keyed by name. A newly processed animal is added in place

    Returns:
        the animal object, taken from ``animals`` when it was already
        processed, otherwise the one returned by ``update_or_create``
    """

    # HINT: does CRBAnim models mother and father?

    key = animal_name.name

    # the same animal may occur in many records: process it only once
    if key in animals:
        logger.debug(
            "Ignoring %s: already created or updated" % (animal_name))
        return animals[key]

    # sex is resolved through a case-insensitive match on its label
    sex = DictSex.objects.get(label__iexact=record.sex)

    # attributes are passed as defaults, so an existing animal with this
    # name is updated instead of colliding with a new one.
    # HINT: CRBanim has less attribute than cryoweb
    animal, created = Animal.objects.update_or_create(
        name=animal_name,
        defaults={
            # HINT: is a duplication of name. Can this be non-mandatory?
            'alternative_id': key,
            'breed': breed,
            'sex': sex,
            'birth_date': record.animal_birth_date,
            # there's no birth_location for animal in CRBAnim
            'birth_location_accuracy': MISSING,
            'owner': submission.owner,
        })

    logger.debug(
        ("Created animal %s" if created else "Updating animal %s") % animal)

    # remember this animal: samples of later records will refer to it
    animals[key] = animal

    return animal
425
426
def find_storage_type(record):
    """Determine a sample storage type from its storage temperature.

    Args:
        record: a CRBAnim record; only ``sample_storage_temperature``
            is read

    Returns:
        str: the storage description associated with the temperature, or
        None (after logging a warning) when the temperature is not in the
        mapping
    """

    mapping = {
        '-196°C': 'frozen, liquid nitrogen',
        '-20°C': 'frozen, -20 degrees Celsius freezer',
        # NOTE(review): -30°C is mapped on the -20 degrees freezer term,
        # presumably the closest available storage term -- confirm
        '-30°C': 'frozen, -20 degrees Celsius freezer',
        '-80°C': 'frozen, -80 degrees Celsius freezer'}

    temperature = record.sample_storage_temperature
    storage = mapping.get(temperature)

    if storage is None:
        # use the module logger (not the root logging functions) so this
        # warning follows the same configuration as the rest of the module
        logger.warning(
            "Couldn't find %s in storage types mapping", temperature)

    return storage
444
445
def fill_uid_sample(record, sample_name, animal, submission):
    """Create or update the UID sample related to a CRBAnim record.

    Args:
        record: a CRBAnim record providing sample attributes
        sample_name: the name object of the sample
        animal: the animal object the sample was collected from
        submission: the submission being imported; its ``owner`` becomes
            the sample owner

    Returns:
        the sample object returned by ``update_or_create``
    """

    # organism parts need to be in lowercases
    sample_type_name = record.sample_type_name.lower()
    body_part_name = record.body_part_name.lower()

    # sylvain has proposed to apply the following decision rule: use the
    # body part, unless it is uninformative. In such case rely on the
    # sample type
    if body_part_name in ("unknown", "not relevant"):
        organism_part_label = sample_type_name

    else:
        organism_part_label = body_part_name

    organism_part, created = DictUberon.objects.get_or_create(
        label=organism_part_label)

    if created:
        logger.info("Created uberon %s" % organism_part)

    else:
        logger.debug("Found uberon %s" % organism_part)

    # animal age at collection is derived from sampling and birth dates
    animal_age_at_collection, time_units = image_timedelta(
        parse_date(record.sampling_date),
        parse_date(record.animal_birth_date)) if False else _age_at_collection(record)

    # attributes are passed as defaults, so an existing sample with this
    # name is updated instead of colliding with a new one
    sample, created = Sample.objects.update_or_create(
        name=sample_name,
        defaults={
            # HINT: is a duplication of name. Can this be non-mandatory?
            'alternative_id': sample_name.name,
            'collection_date': record.sampling_date,
            'protocol': record.sampling_protocol_url,
            'organism_part': organism_part,
            'animal': animal,
            'owner': submission.owner,
            'storage': find_storage_type(record),
            'availability': record.sample_availability,
            'animal_age_at_collection': animal_age_at_collection,
            'animal_age_at_collection_units': time_units,
        })

    logger.debug(
        ("Created sample %s" if created else "Updating sample %s") % sample)

    return sample


def _age_at_collection(record):
    """Return ``(age, units)`` of the animal when the sample was collected."""

    # parse dates in the same order used by the import: birth date first
    animal_birth_date = parse_date(record.animal_birth_date)
    sampling_date = parse_date(record.sampling_date)

    return image_timedelta(sampling_date, animal_birth_date)
506
507
def process_record(record, submission, animals, language):
    """Load a single CRBAnim record into UID tables.

    Breed, names, animal and sample are filled in this order. A record
    which already has an EBI BioSample identifier is skipped after logging
    a warning.

    Args:
        record: a CRBAnim record
        submission: the submission being imported
        animals (dict): animals already processed during this import
        language: the language passed to ``fill_uid_breed``

    Returns:
        None
    """

    # Peter mail 26/02/19 18:30: sameAs BioSamples will probably be needed
    # for the IMAGE project, but such cases will be tackled after the main
    # part of the inject tool is working.
    # HINT: record with a biosample id should be ignored, for the moment
    if record.EBI_Biosample_identifier is not None:
        logger.warning("Ignoring %s: already in biosample!" % str(record))
        return None

    # breed first, since the animal refers to it
    breed = fill_uid_breed(record, language)

    # then names, which both animal and sample refer to
    animal_name, sample_name = fill_uid_names(record, submission)

    # the animal is needed in order to relate the sample to it
    uid_animal = fill_uid_animal(
        record, animal_name, breed, submission, animals)

    fill_uid_sample(record, sample_name, uid_animal, submission)
530
531
def upload_crbanim(submission):
    """Import the CRBAnim file uploaded with a submission into UID tables.

    Sex and species terms are checked before loading records. When the
    import fails, submission is marked with ERROR status; when it
    completes, submission is marked with LOADED status. In both cases the
    submission message is updated and a message is sent with
    ``send_message``.

    Args:
        submission: the submission whose uploaded file has to be imported

    Returns:
        bool: True if the import completed, False if an error occurred
        while loading data
    """

    # debug
    logger.info("Importing from CRB-Anim file")

    # this is the full path in docker container
    uploaded_path = submission.get_uploaded_file_path()

    # NOTE(review): reading the file happens outside the try block, so an
    # error raised here is not recorded in submission -- presumably it is
    # managed by the caller, confirm
    reader = CRBAnimReader()
    reader.read_file(uploaded_path)

    # start data loading
    try:
        # check for species and sex in a similar way as cryoweb does
        sex_ok, unknown_sex = reader.check_sex()

        if not sex_ok:
            raise CRBAnimImportError(
                "Not all Sex terms are loaded into database: "
                "check for %s in your dataset" % (unknown_sex))

        species_ok, unknown_species = reader.check_species(
            submission.gene_bank_country)

        if not species_ok:
            raise CRBAnimImportError(
                "Some species are not loaded in UID database: "
                "%s" % (unknown_species))

        # ok get languages from submission (useful for translation)
        # HINT: no traslations implemented, at the moment
        language = submission.gene_bank_country.label

        # animals processed so far, shared among all records
        animals = {}

        for record in reader.data:
            process_record(record, submission, animals, language)

        # after processing records, initialize validation summary objects
        # for animals and then for samples, resetting their counts
        animal_summary, created = ValidationSummary.objects.get_or_create(
            submission=submission, type="animal")

        if created:
            logger.debug(
                "ValidationSummary animal created for "
                "submission %s" % submission)

        animal_summary.reset_all_count()

        sample_summary, created = ValidationSummary.objects.get_or_create(
            submission=submission, type="sample")

        if created:
            logger.debug(
                "ValidationSummary sample created for "
                "submission %s" % submission)

        sample_summary.reset_all_count()

    except Exception as exc:
        # save the error in database
        submission.message = "Error in importing data: %s" % (str(exc))
        submission.status = ERROR
        submission.save()

        # send async message
        send_message(submission)

        # debug
        logger.error("error in importing from crbanim: %s" % (exc))
        logger.exception(exc)

        return False

    # this point is reached only when no exception was raised
    submission.message = "CRBAnim import completed for submission: %s" % (
        submission.id)
    submission.status = LOADED
    submission.save()

    # send async message
    send_message(submission, send_validation=True)

    logger.info("Import from CRBAnim is complete")

    return True
628

cnr-ibba / IMAGE-InjectTool

Pull Request — master (#35)

crbanim.helpers.CRBAnimReader.get_dialect() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like