excel.helpers.exceltemplate - Code Metrics - Inspection of ":sparkles: import from Template file" - cnr-ibba/IMAGE-InjectTool - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#40)

by Paolo

created 2019-07-09 14:58 UTC

excel.helpers.exceltemplate A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	275
Duplicated Lines	0 %

Importance

Changes

Metric	Value
wmc	28
eloc	152
dl	0
loc	275
rs	10
c	0
b	0
f	0

12 Methods

Rating	Name	Size	Complexity
A	ExcelTemplateReader.check_columns()	28	5
A	ExcelTemplateReader.__init__()	4	1
A	ExcelTemplateReader.check_sheets()	19	4
A	ExcelTemplateReader.__check_accuracy()	20	4
A	ExcelTemplateReader.get_sample_records()	6	1
A	ExcelTemplateReader.check_accuracies()	25	2
A	ExcelTemplateReader.read_file()	4	1
A	ExcelTemplateReader.check_sex()	8	1
A	ExcelTemplateReader.get_animal_records()	6	1
A	ExcelTemplateReader.check_species()	8	1
A	ExcelTemplateReader.get_breed_records()	6	1
B	ExcelTemplateReader.get_sheet_records()	55	6

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  5 16:31:45 2019

@author: Paolo Cozzi <[email protected]>
"""

import xlrd
import logging
import datetime

from collections import defaultdict, namedtuple

from common.constants import ACCURACIES
from image_app.helpers import FileDataSourceMixin
from image_app.models import DictSex

# Get an instance of a logger
logger = logging.getLogger(__name__)

# defining the template columns in need for data import
TEMPLATE_COLUMNS = {
    'breed': [
        'Supplied breed',
        # 'Mapped breed',
        # 'Mapped breed ontology library',
        # 'Mapped breed ontology accession',
        'EFABIS Breed country',
        'Species',
        # 'Species ontology library',
        # 'Species ontology accession'
    ],
    'animal': [
        'Animal id in data source',
        'Animal description',
        'Alternative animal ID',
        'Father id in data source',
        'Mother id in data source',
        'Breed',
        'Species',
        'Sex',
        'Birth date',
        'Birth location',
        'Birth location longitude',
        'Birth location latitude',
        'Birth location accuracy'
    ],
    'sample': [
        'Sample id in data source',
        'Alternative sample ID',
        'Sample description',
        'Animal id in data source',
        'Specimen collection protocol',
        'availability',
        'Collection date',
        'Collection place latitude',
        'Collection place longitude',
        'Collection place',
        'Collection place accuracy',
        'Organism part',
        'Developmental stage',
        'Physiological stage',
        'Animal age at collection',
        'Sample storage',
        'Sample storage processing',
        'Sampling to preparation interval'
    ]
}


class ExcelTemplateReader(FileDataSourceMixin):
    """A class to read template excel files"""

    def __init__(self):
        # read xls file and track it
        self.book = None
        self.sheet_names = []

    def read_file(self, filename):
        # read xls file and track it
        self.book = xlrd.open_workbook(filename)
        self.sheet_names = self.book.sheet_names()

    def check_sheets(self):
        """Test for the minimal sheets required to upload data"""

        not_found = []

        for sheet_name in TEMPLATE_COLUMNS.keys():
            if sheet_name not in self.sheet_names:
                not_found.append(sheet_name)
                logger.error(
                    "required sheet {name} not found in template".format(
                        name=sheet_name)
                )

        if len(not_found) > 0:
            return False, not_found

        else:
            logger.debug("This seems to be a valid Template file")
            return True, not_found

    def check_columns(self):
        """Test for minimal column required for template load"""

        not_found = defaultdict(list)

        for sheet_name in TEMPLATE_COLUMNS.keys():
            # get a sheet from xls workbook
            sheet = self.book.sheet_by_name(sheet_name)

            # get header from sheet
            header = sheet.row_values(0)

            for column in TEMPLATE_COLUMNS[sheet_name]:
                if column not in header:
                    not_found[sheet_name].append(column)
                    logger.error(
                        "required column {column} not found in sheet "
                        "{sheet_name}".format(
                            sheet_name=sheet_name,
                            column=column)
                    )

        if len(not_found) > 0:
            return False, not_found

        else:
            logger.debug("This seems to be a valid Template file")
            return True, not_found

    def get_sheet_records(self, sheet_name):
        """Generic functions to iterate on excel records"""

        # this is the sheet I need
        sheet = self.book.sheet_by_name(sheet_name)

        # now get columns to create a collection objects
        header = sheet.row_values(0)

        column_idxs = {}

        # get the column index I need
        for column in TEMPLATE_COLUMNS[sheet_name]:
            idx = header.index(column)
            column_idxs[column.lower().replace(" ", "_")] = idx

        # get new column names
        columns = column_idxs.keys()

        # create a namedtuple object
        Record = namedtuple(sheet_name.capitalize(), columns)

        # iterate over record, mind the header column
        for i in range(1, sheet.nrows):
            # get a row from excel file
            row = sheet.row_values(i)

            # get the data I need
            data = [row[column_idxs[column]] for column in columns]

            # replace all empty  occurences in a list
            data = [None if col in [""]
                    else col for col in data]

            # fix date fields. Search for 'date' in column names
            date_idxs = [column_idxs[column] for column in columns if
                         'date' in column]

            # fix date objects using datetime, as described here:
            # https://stackoverflow.com/a/13962976/4385116
            for idx in date_idxs:
                if not data[idx]:
                    continue

                data[idx] = datetime.datetime(
                    *xlrd.xldate_as_tuple(
                        data[idx],
                        self.book.datemode
                    )
                )

            # get a new object
            record = Record._make(data)

            yield record

    def get_breed_records(self):
        """Iterate among breeds record"""

        # this is the sheet I need
        sheet_name = "breed"
        return self.get_sheet_records(sheet_name)

    def get_animal_records(self):
        """Iterate among animal records"""

        # this is the sheet I need
        sheet_name = "animal"
        return self.get_sheet_records(sheet_name)

    def get_sample_records(self):
        """Iterate among sample records"""

        # this is the sheet I need
        sheet_name = "sample"
        return self.get_sheet_records(sheet_name)

    def check_species(self, country):
        """Check if all species are defined in UID DictSpecies"""

        column = 'species'
        item_set = set([breed.species for breed in self.get_breed_records()])

        # call FileDataSourceMixin.check_species
        return super().check_species(column, item_set, country)

    def check_sex(self):
        """Check that all sex records are present in database"""

        column = 'sex'
        item_set = set([animal.sex for animal in self.get_animal_records()])

        # call FileDataSourceMixin.check_items
        return self.check_items(item_set, DictSex, column)

    def __check_accuracy(self, item_set):
        """A generic method to test for accuracies"""

        # a list of not found terms and a status to see if something is missing
        # or not
        not_found = []
        result = True

        for item in item_set:
            try:
                ACCURACIES.get_value_by_desc(item)

            except KeyError:
                logger.warning("accuracy level '%s' not found" % (item))
                not_found.append(item)

        if len(not_found) != 0:
            result = False

        return result, not_found

    def check_accuracies(self):
        """Check accuracy specified in table"""

        item_set = set([animal.birth_location_accuracy
                        for animal in self.get_animal_records()])

        # test for accuracy in animal table
        result_animal, not_found_animal = self.__check_accuracy(item_set)

        item_set = set([sample.collection_place_accuracy
                        for sample in self.get_sample_records()])

        # test for accuracy in sample table
        result_sample, not_found_sample = self.__check_accuracy(item_set)

        # merge two results
        check = result_animal and result_sample
        not_found = set(not_found_animal + not_found_sample)

        if check is False:
            logger.error(
                    "Couldnt' find those accuracies in constants:")
            logger.error(not_found)

        return check, not_found


1			#!/usr/bin/env python3
2			# -- coding: utf-8 --
3			"""
4			Created on Fri Jul 5 16:31:45 2019
5
6			@author: Paolo Cozzi <[email protected]>
7			"""
8
9			import xlrd
10			import logging
11			import datetime
12
13			from collections import defaultdict, namedtuple
14
15			from common.constants import ACCURACIES
16			from image_app.helpers import FileDataSourceMixin
17			from image_app.models import DictSex
18
19			# Get an instance of a logger
20			logger = logging.getLogger(__name__)
21
22			# defining the template columns in need for data import
23			TEMPLATE_COLUMNS = {
24			'breed': [
25			'Supplied breed',
26			# 'Mapped breed',
27			# 'Mapped breed ontology library',
28			# 'Mapped breed ontology accession',
29			'EFABIS Breed country',
30			'Species',
31			# 'Species ontology library',
32			# 'Species ontology accession'
33			],
34			'animal': [
35			'Animal id in data source',
36			'Animal description',
37			'Alternative animal ID',
38			'Father id in data source',
39			'Mother id in data source',
40			'Breed',
41			'Species',
42			'Sex',
43			'Birth date',
44			'Birth location',
45			'Birth location longitude',
46			'Birth location latitude',
47			'Birth location accuracy'
48			],
49			'sample': [
50			'Sample id in data source',
51			'Alternative sample ID',
52			'Sample description',
53			'Animal id in data source',
54			'Specimen collection protocol',
55			'availability',
56			'Collection date',
57			'Collection place latitude',
58			'Collection place longitude',
59			'Collection place',
60			'Collection place accuracy',
61			'Organism part',
62			'Developmental stage',
63			'Physiological stage',
64			'Animal age at collection',
65			'Sample storage',
66			'Sample storage processing',
67			'Sampling to preparation interval'
68			]
69			}
70
71
72			class ExcelTemplateReader(FileDataSourceMixin):
73			"""A class to read template excel files"""
74
75			def __init__(self):
76			# read xls file and track it
77			self.book = None
78			self.sheet_names = []
79
80			def read_file(self, filename):
81			# read xls file and track it
82			self.book = xlrd.open_workbook(filename)
83			self.sheet_names = self.book.sheet_names()
84
85			def check_sheets(self):
86			"""Test for the minimal sheets required to upload data"""
87
88			not_found = []
89
90			for sheet_name in TEMPLATE_COLUMNS.keys():
91			if sheet_name not in self.sheet_names:
92			not_found.append(sheet_name)
93			logger.error(
94			"required sheet {name} not found in template".format(
95			name=sheet_name)
96			)
97
98			if len(not_found) > 0:
99			return False, not_found
100
101			else:
102			logger.debug("This seems to be a valid Template file")
103			return True, not_found
104
105			def check_columns(self):
106			"""Test for minimal column required for template load"""
107
108			not_found = defaultdict(list)
109
110			for sheet_name in TEMPLATE_COLUMNS.keys():
111			# get a sheet from xls workbook
112			sheet = self.book.sheet_by_name(sheet_name)
113
114			# get header from sheet
115			header = sheet.row_values(0)
116
117			for column in TEMPLATE_COLUMNS[sheet_name]:
118			if column not in header:
119			not_found[sheet_name].append(column)
120			logger.error(
121			"required column {column} not found in sheet "
122			"{sheet_name}".format(
123			sheet_name=sheet_name,
124			column=column)
125			)
126
127			if len(not_found) > 0:
128			return False, not_found
129
130			else:
131			logger.debug("This seems to be a valid Template file")
132			return True, not_found
133
134			def get_sheet_records(self, sheet_name):
135			"""Generic functions to iterate on excel records"""
136
137			# this is the sheet I need
138			sheet = self.book.sheet_by_name(sheet_name)
139
140			# now get columns to create a collection objects
141			header = sheet.row_values(0)
142
143			column_idxs = {}
144
145			# get the column index I need
146			for column in TEMPLATE_COLUMNS[sheet_name]:
147			idx = header.index(column)
148			column_idxs[column.lower().replace(" ", "_")] = idx
149
150			# get new column names
151			columns = column_idxs.keys()
152
153			# create a namedtuple object
154			Record = namedtuple(sheet_name.capitalize(), columns)
155
156			# iterate over record, mind the header column
157			for i in range(1, sheet.nrows):
158			# get a row from excel file
159			row = sheet.row_values(i)
160
161			# get the data I need
162			data = [row[column_idxs[column]] for column in columns]
163
164			# replace all empty occurences in a list
165			data = [None if col in [""]
166			else col for col in data]
167
168			# fix date fields. Search for 'date' in column names
169			date_idxs = [column_idxs[column] for column in columns if
170			'date' in column]
171
172			# fix date objects using datetime, as described here:
173			# https://stackoverflow.com/a/13962976/4385116
174			for idx in date_idxs:
175			if not data[idx]:
176			continue
177
178			data[idx] = datetime.datetime(
179			*xlrd.xldate_as_tuple(
180			data[idx],
181			self.book.datemode
182			)
183			)
184
185			# get a new object
186			record = Record._make(data)
187
188			yield record
189
190			def get_breed_records(self):
191			"""Iterate among breeds record"""
192
193			# this is the sheet I need
194			sheet_name = "breed"
195			return self.get_sheet_records(sheet_name)
196
197			def get_animal_records(self):
198			"""Iterate among animal records"""
199
200			# this is the sheet I need
201			sheet_name = "animal"
202			return self.get_sheet_records(sheet_name)
203
204			def get_sample_records(self):
205			"""Iterate among sample records"""
206
207			# this is the sheet I need
208			sheet_name = "sample"
209			return self.get_sheet_records(sheet_name)
210
211			def check_species(self, country):
212			"""Check if all species are defined in UID DictSpecies"""
213
214			column = 'species'
215			item_set = set([breed.species for breed in self.get_breed_records()])
216
217			# call FileDataSourceMixin.check_species
218			return super().check_species(column, item_set, country)
219
220			def check_sex(self):
221			"""Check that all sex records are present in database"""
222
223			column = 'sex'
224			item_set = set([animal.sex for animal in self.get_animal_records()])
225
226			# call FileDataSourceMixin.check_items
227			return self.check_items(item_set, DictSex, column)
228
229			def __check_accuracy(self, item_set):
230			"""A generic method to test for accuracies"""
231
232			# a list of not found terms and a status to see if something is missing
233			# or not
234			not_found = []
235			result = True
236
237			for item in item_set:
238			try:
239			ACCURACIES.get_value_by_desc(item)
240
241			except KeyError:
242			logger.warning("accuracy level '%s' not found" % (item))
243			not_found.append(item)
244
245			if len(not_found) != 0:
246			result = False
247
248			return result, not_found
249
250			def check_accuracies(self):
251			"""Check accuracy specified in table"""
252
253			item_set = set([animal.birth_location_accuracy
254			for animal in self.get_animal_records()])
255
256			# test for accuracy in animal table
257			result_animal, not_found_animal = self.__check_accuracy(item_set)
258
259			item_set = set([sample.collection_place_accuracy
260			for sample in self.get_sample_records()])
261
262			# test for accuracy in sample table
263			result_sample, not_found_sample = self.__check_accuracy(item_set)
264
265			# merge two results
266			check = result_animal and result_sample
267			not_found = set(not_found_animal + not_found_sample)
268
269			if check is False:
270			logger.error(
271			"Couldnt' find those accuracies in constants:")
272			logger.error(not_found)
273
274			return check, not_found
275

cnr-ibba / IMAGE-InjectTool

Pull Request — master (#40)

excel.helpers.exceltemplate A

Complexity

Size/Duplication

Importance

12 Methods

Duplication Side-by-Side

Filter issues like