ExcelTemplateReader.check_sheets()   A
last analyzed

Complexity

Conditions 4

Size

Total Lines 19
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 12
dl 0
loc 19
rs 9.8
c 0
b 0
f 0
cc 4
nop 1
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Fri Jul  5 16:31:45 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import xlrd
10
import logging
11
import datetime
12
13
from collections import defaultdict, namedtuple
14
15
from common.constants import ACCURACIES
16
from uid.helpers import FileDataSourceMixin
17
from uid.models import DictSex, DictCountry
18
19
from .exceptions import ExcelImportError
20
21
# Get an instance of a logger
22
logger = logging.getLogger(__name__)
23
24
# defining the template columns in need for data import
25
TEMPLATE_COLUMNS = {
26
    'breed': [
27
        'Supplied breed',
28
        # 'Mapped breed',
29
        # 'Mapped breed ontology library',
30
        # 'Mapped breed ontology accession',
31
        'EFABIS Breed country',
32
        'Species',
33
        # 'Species ontology library',
34
        # 'Species ontology accession'
35
    ],
36
    'animal': [
37
        'Animal id in data source',
38
        'Animal description',
39
        'Alternative animal ID',
40
        'Father id in data source',
41
        'Mother id in data source',
42
        'Breed',
43
        'Species',
44
        'Sex',
45
        'Birth date',
46
        'Birth location',
47
        'Birth location longitude',
48
        'Birth location latitude',
49
        'Birth location accuracy'
50
    ],
51
    'sample': [
52
        'Sample id in data source',
53
        'Alternative sample ID',
54
        'Sample description',
55
        'Animal id in data source',
56
        'Specimen collection protocol',
57
        'availability',
58
        'Collection date',
59
        'Collection place latitude',
60
        'Collection place longitude',
61
        'Collection place',
62
        'Collection place accuracy',
63
        'Organism part',
64
        'Developmental stage',
65
        'Physiological stage',
66
        'Animal age at collection',
67
        'Sample storage',
68
        'Sample storage processing',
69
        'Sampling to preparation interval'
70
    ]
71
}
72
73
74
class ExcelTemplateReader(FileDataSourceMixin):
75
    """A class to read template excel files"""
76
77
    def __init__(self):
78
        # read xls file and track it
79
        self.book = None
80
        self.sheet_names = []
81
82
    def read_file(self, filename):
83
        # read xls file and track it
84
        self.book = xlrd.open_workbook(filename)
85
        self.sheet_names = self.book.sheet_names()
86
87
    def check_sheets(self):
88
        """Test for the minimal sheets required to upload data"""
89
90
        not_found = []
91
92
        for sheet_name in TEMPLATE_COLUMNS.keys():
93
            if sheet_name not in self.sheet_names:
94
                not_found.append(sheet_name)
95
                logger.error(
96
                    "required sheet {name} not found in template".format(
97
                        name=sheet_name)
98
                )
99
100
        if len(not_found) > 0:
101
            return False, not_found
102
103
        else:
104
            logger.debug("This seems to be a valid Template file")
105
            return True, not_found
106
107
    def check_columns(self):
108
        """Test for minimal column required for template load"""
109
110
        not_found = defaultdict(list)
111
112
        for sheet_name in TEMPLATE_COLUMNS.keys():
113
            # get a sheet from xls workbook
114
            sheet = self.book.sheet_by_name(sheet_name)
115
116
            # get header from sheet
117
            header = sheet.row_values(0)
118
119
            for column in TEMPLATE_COLUMNS[sheet_name]:
120
                if column not in header:
121
                    not_found[sheet_name].append(column)
122
                    logger.error(
123
                        "required column {column} not found in sheet "
124
                        "{sheet_name}".format(
125
                            sheet_name=sheet_name,
126
                            column=column)
127
                    )
128
129
        if len(not_found) > 0:
130
            return False, not_found
131
132
        else:
133
            logger.debug("This seems to be a valid Template file")
134
            return True, not_found
135
136
    def get_sheet_records(self, sheet_name):
137
        """Generic functions to iterate on excel records"""
138
139
        # this is the sheet I need
140
        sheet = self.book.sheet_by_name(sheet_name)
141
142
        # now get columns to create a collection objects
143
        header = sheet.row_values(0)
144
145
        column_idxs = {}
146
147
        # get the column index I need
148
        for column in TEMPLATE_COLUMNS[sheet_name]:
149
            try:
150
                idx = header.index(column)
151
152
            except ValueError as e:
153
                logger.error(e)
154
                raise ExcelImportError(
155
                    "Column '%s' not found in '%s' sheet" % (
156
                        column, sheet_name))
157
158
            column_idxs[column.lower().replace(" ", "_")] = idx
159
160
        # get new column names
161
        columns = column_idxs.keys()
162
163
        # create a namedtuple object
164
        Record = namedtuple(sheet_name.capitalize(), columns)
165
166
        # iterate over record, mind the header column
167
        for i in range(1, sheet.nrows):
168
            # get a row from excel file
169
            row = sheet.row_values(i)
170
171
            # get the data I need
172
            data = [row[column_idxs[column]] for column in columns]
173
174
            # replace all empty  occurences in a list
175
            data = [None if col in [""]
176
                    else col for col in data]
177
178
            # stripping columns
179
            data = [col.strip() if type(col) is str
180
                    else col for col in data]
181
182
            # treat integers as integers
183
            data = [int(col) if type(col) is float and col.is_integer()
184
                    else col for col in data]
185
186
            # fix date fields. Search for 'date' in column names
187
            date_idxs = [column_idxs[column] for column in columns if
188
                         'date' in column]
189
190
            # fix date objects using datetime, as described here:
191
            # https://stackoverflow.com/a/13962976/4385116
192
            for idx in date_idxs:
193
                if not data[idx]:
194
                    continue
195
196
                # forcing a date object
197
                data[idx] = datetime.datetime(
198
                    *xlrd.xldate_as_tuple(
199
                        data[idx],
200
                        self.book.datemode
201
                    )
202
                ).date()
203
204
            # get a new object
205
            record = Record._make(data)
206
207
            yield record
208
209
    def get_breed_records(self):
210
        """Iterate among breeds record"""
211
212
        # this is the sheet I need
213
        sheet_name = "breed"
214
        return self.get_sheet_records(sheet_name)
215
216
    def get_animal_records(self):
217
        """Iterate among animal records"""
218
219
        # this is the sheet I need
220
        sheet_name = "animal"
221
        return self.get_sheet_records(sheet_name)
222
223
    def get_sample_records(self):
224
        """Iterate among sample records"""
225
226
        # this is the sheet I need
227
        sheet_name = "sample"
228
        return self.get_sheet_records(sheet_name)
229
230
    def get_animal_from_sample(self, sample):
231
        """get an animal record from a sample record"""
232
233
        animals = [
234
            animal for animal in self.get_animal_records() if
235
            animal.animal_id_in_data_source == sample.animal_id_in_data_source
236
        ]
237
238
        # animal is supposed to be unique
239
        if len(animals) != 1:
240
            raise ExcelImportError(
241
                "Can't determine a unique animal from '{sample}' "
242
                "record data. Animal objects found: {animals}".format(
243
                    sample=sample, animals=animals))
244
245
        return animals[0]
246
247
    def get_breed_from_animal(self, record):
248
        """Get a breed record from an animal record"""
249
250
        breeds = [
251
            breed for breed in self.get_breed_records()
252
            if breed.supplied_breed == record.breed and
253
            breed.species == record.species]
254
255
        # breed is supposed to be unique, from UID constraints. However
256
        # I could place the same breed name for two countries. In that case,
257
        # I cant derive a unique breed from users data
258
        if len(breeds) != 1:
259
            raise ExcelImportError(
260
                "Can't determine a unique breed for '%s:%s' from user data" %
261
                (record.breed, record.species))
262
263
        return breeds[0]
264
265
    def check_species(self, country):
266
        """Check if all species are defined in UID DictSpecies. If not,
267
        create dictionary term"""
268
269
        column = 'species'
270
        item_set = set([breed.species for breed in self.get_breed_records()])
271
272
        # call FileDataSourceMixin.check_species
273
        return super().check_species(column, item_set, country, create=True)
274
275
    def check_species_in_animal_sheet(self):
276
        """Check if all animal species are defined in breed sheet"""
277
278
        check = True
279
        not_found = []
280
281
        reference_set = set(
282
            [breed.species for breed in self.get_breed_records()])
283
284
        test_set = set(
285
            [animal.species for animal in self.get_animal_records()])
286
287
        for specie in test_set:
288
            if specie not in reference_set:
289
                check = False
290
                not_found.append(specie)
291
292
        return check, not_found
293
294
    def check_sex(self):
295
        """Check that all sex records are present in database"""
296
297
        column = 'sex'
298
        item_set = set([animal.sex for animal in self.get_animal_records()])
299
300
        # call FileDataSourceMixin.check_items
301
        return self.check_items(item_set, DictSex, column)
302
303
    def check_countries(self):
304
        """Check that all efabis countries are present in database"""
305
306
        column = "efabis_breed_country"
307
        item_set = set([breed.efabis_breed_country for
308
                        breed in self.get_breed_records()])
309
310
        # call FileDataSourceMixin.check_items
311
        return self.check_items(item_set, DictCountry, column)
312
313
    def __check_accuracy(self, item_set):
314
        """A generic method to test for accuracies"""
315
316
        # a list of not found terms and a status to see if something is missing
317
        # or not
318
        not_found = []
319
        result = True
320
321
        for item in item_set:
322
            try:
323
                ACCURACIES.get_value_by_desc(item)
324
325
            except KeyError:
326
                logger.warning("accuracy level '%s' not found" % (item))
327
                not_found.append(item)
328
329
        if len(not_found) != 0:
330
            result = False
331
332
        return result, not_found
333
334
    def check_accuracies(self):
335
        """Check accuracy specified in table"""
336
337
        item_set = set([animal.birth_location_accuracy
338
                        for animal in self.get_animal_records()])
339
340
        # test for accuracy in animal table
341
        result_animal, not_found_animal = self.__check_accuracy(item_set)
342
343
        item_set = set([sample.collection_place_accuracy
344
                        for sample in self.get_sample_records()])
345
346
        # test for accuracy in sample table
347
        result_sample, not_found_sample = self.__check_accuracy(item_set)
348
349
        # merge two results
350
        check = result_animal and result_sample
351
        not_found = set(not_found_animal + not_found_sample)
352
353
        if check is False:
354
            logger.error(
355
                    "Couldnt' find those accuracies in constants:")
356
            logger.error(not_found)
357
358
        return check, not_found
359