Passed
Pull Request — master (#40)
by Paolo
01:16
created

ExcelTemplateReader.get_breed_records()   A

Complexity

Conditions 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 6
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Fri Jul  5 16:31:45 2019
5
6
@author: Paolo Cozzi <[email protected]>
7
"""
8
9
import xlrd
10
import logging
11
import datetime
12
13
from collections import defaultdict, namedtuple
14
15
from common.constants import ACCURACIES
16
from image_app.helpers import FileDataSourceMixin
17
from image_app.models import DictSex
18
19
# Get an instance of a logger
20
logger = logging.getLogger(__name__)
21
22
# defining the template columns in need for data import
23
TEMPLATE_COLUMNS = {
24
    'breed': [
25
        'Supplied breed',
26
        # 'Mapped breed',
27
        # 'Mapped breed ontology library',
28
        # 'Mapped breed ontology accession',
29
        'EFABIS Breed country',
30
        'Species',
31
        # 'Species ontology library',
32
        # 'Species ontology accession'
33
    ],
34
    'animal': [
35
        'Animal id in data source',
36
        'Animal description',
37
        'Alternative animal ID',
38
        'Father id in data source',
39
        'Mother id in data source',
40
        'Breed',
41
        'Species',
42
        'Sex',
43
        'Birth date',
44
        'Birth location',
45
        'Birth location longitude',
46
        'Birth location latitude',
47
        'Birth location accuracy'
48
    ],
49
    'sample': [
50
        'Sample id in data source',
51
        'Alternative sample ID',
52
        'Sample description',
53
        'Animal id in data source',
54
        'Specimen collection protocol',
55
        'availability',
56
        'Collection date',
57
        'Collection place latitude',
58
        'Collection place longitude',
59
        'Collection place',
60
        'Collection place accuracy',
61
        'Organism part',
62
        'Developmental stage',
63
        'Physiological stage',
64
        'Animal age at collection',
65
        'Sample storage',
66
        'Sample storage processing',
67
        'Sampling to preparation interval'
68
    ]
69
}
70
71
72
class ExcelTemplateReader(FileDataSourceMixin):
73
    """A class to read template excel files"""
74
75
    def __init__(self):
76
        # read xls file and track it
77
        self.book = None
78
        self.sheet_names = []
79
80
    def read_file(self, filename):
81
        # read xls file and track it
82
        self.book = xlrd.open_workbook(filename)
83
        self.sheet_names = self.book.sheet_names()
84
85
    def check_sheets(self):
86
        """Test for the minimal sheets required to upload data"""
87
88
        not_found = []
89
90
        for sheet_name in TEMPLATE_COLUMNS.keys():
91
            if sheet_name not in self.sheet_names:
92
                not_found.append(sheet_name)
93
                logger.error(
94
                    "required sheet {name} not found in template".format(
95
                        name=sheet_name)
96
                )
97
98
        if len(not_found) > 0:
99
            return False, not_found
100
101
        else:
102
            logger.debug("This seems to be a valid Template file")
103
            return True, not_found
104
105
    def check_columns(self):
106
        """Test for minimal column required for template load"""
107
108
        not_found = defaultdict(list)
109
110
        for sheet_name in TEMPLATE_COLUMNS.keys():
111
            # get a sheet from xls workbook
112
            sheet = self.book.sheet_by_name(sheet_name)
113
114
            # get header from sheet
115
            header = sheet.row_values(0)
116
117
            for column in TEMPLATE_COLUMNS[sheet_name]:
118
                if column not in header:
119
                    not_found[sheet_name].append(column)
120
                    logger.error(
121
                        "required column {column} not found in sheet "
122
                        "{sheet_name}".format(
123
                            sheet_name=sheet_name,
124
                            column=column)
125
                    )
126
127
        if len(not_found) > 0:
128
            return False, not_found
129
130
        else:
131
            logger.debug("This seems to be a valid Template file")
132
            return True, not_found
133
134
    def get_sheet_records(self, sheet_name):
135
        """Generic functions to iterate on excel records"""
136
137
        # this is the sheet I need
138
        sheet = self.book.sheet_by_name(sheet_name)
139
140
        # now get columns to create a collection objects
141
        header = sheet.row_values(0)
142
143
        column_idxs = {}
144
145
        # get the column index I need
146
        for column in TEMPLATE_COLUMNS[sheet_name]:
147
            idx = header.index(column)
148
            column_idxs[column.lower().replace(" ", "_")] = idx
149
150
        # get new column names
151
        columns = column_idxs.keys()
152
153
        # create a namedtuple object
154
        Record = namedtuple(sheet_name.capitalize(), columns)
155
156
        # iterate over record, mind the header column
157
        for i in range(1, sheet.nrows):
158
            # get a row from excel file
159
            row = sheet.row_values(i)
160
161
            # get the data I need
162
            data = [row[column_idxs[column]] for column in columns]
163
164
            # replace all empty  occurences in a list
165
            data = [None if col in [""]
166
                    else col for col in data]
167
168
            # fix date fields. Search for 'date' in column names
169
            date_idxs = [column_idxs[column] for column in columns if
170
                         'date' in column]
171
172
            # fix date objects using datetime, as described here:
173
            # https://stackoverflow.com/a/13962976/4385116
174
            for idx in date_idxs:
175
                if not data[idx]:
176
                    continue
177
178
                data[idx] = datetime.datetime(
179
                    *xlrd.xldate_as_tuple(
180
                        data[idx],
181
                        self.book.datemode
182
                    )
183
                )
184
185
            # get a new object
186
            record = Record._make(data)
187
188
            yield record
189
190
    def get_breed_records(self):
191
        """Iterate among breeds record"""
192
193
        # this is the sheet I need
194
        sheet_name = "breed"
195
        return self.get_sheet_records(sheet_name)
196
197
    def get_animal_records(self):
198
        """Iterate among animal records"""
199
200
        # this is the sheet I need
201
        sheet_name = "animal"
202
        return self.get_sheet_records(sheet_name)
203
204
    def get_sample_records(self):
205
        """Iterate among sample records"""
206
207
        # this is the sheet I need
208
        sheet_name = "sample"
209
        return self.get_sheet_records(sheet_name)
210
211
    def check_species(self, country):
212
        """Check if all species are defined in UID DictSpecies"""
213
214
        column = 'species'
215
        item_set = set([breed.species for breed in self.get_breed_records()])
216
217
        # call FileDataSourceMixin.check_species
218
        return super().check_species(column, item_set, country)
219
220
    def check_sex(self):
221
        """Check that all sex records are present in database"""
222
223
        column = 'sex'
224
        item_set = set([animal.sex for animal in self.get_animal_records()])
225
226
        # call FileDataSourceMixin.check_items
227
        return self.check_items(item_set, DictSex, column)
228
229
    def __check_accuracy(self, item_set):
230
        """A generic method to test for accuracies"""
231
232
        # a list of not found terms and a status to see if something is missing
233
        # or not
234
        not_found = []
235
        result = True
236
237
        for item in item_set:
238
            try:
239
                ACCURACIES.get_value_by_desc(item)
240
241
            except KeyError:
242
                logger.warning("accuracy level '%s' not found" % (item))
243
                not_found.append(item)
244
245
        if len(not_found) != 0:
246
            result = False
247
248
        return result, not_found
249
250
    def check_accuracies(self):
251
        """Check accuracy specified in table"""
252
253
        item_set = set([animal.birth_location_accuracy
254
                        for animal in self.get_animal_records()])
255
256
        # test for accuracy in animal table
257
        result_animal, not_found_animal = self.__check_accuracy(item_set)
258
259
        item_set = set([sample.collection_place_accuracy
260
                        for sample in self.get_sample_records()])
261
262
        # test for accuracy in sample table
263
        result_sample, not_found_sample = self.__check_accuracy(item_set)
264
265
        # merge two results
266
        check = result_animal and result_sample
267
        not_found = set(not_found_animal + not_found_sample)
268
269
        if check is False:
270
            logger.error(
271
                    "Couldnt' find those accuracies in constants:")
272
            logger.error(not_found)
273
274
        return check, not_found
275