1
|
|
|
#!/usr/bin/env python3 |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
|
|
""" |
4
|
|
|
Created on Fri Jul 5 16:31:45 2019 |
5
|
|
|
|
6
|
|
|
@author: Paolo Cozzi <[email protected]> |
7
|
|
|
""" |
8
|
|
|
|
9
|
|
|
import xlrd |
10
|
|
|
import logging |
11
|
|
|
import datetime |
12
|
|
|
|
13
|
|
|
from collections import defaultdict, namedtuple |
14
|
|
|
|
15
|
|
|
from common.constants import ACCURACIES |
16
|
|
|
from image_app.helpers import FileDataSourceMixin |
17
|
|
|
from image_app.models import DictSex |
18
|
|
|
|
19
|
|
|
# Get an instance of a logger |
20
|
|
|
logger = logging.getLogger(__name__) |
21
|
|
|
|
22
|
|
|
# defining the template columns in need for data import |
23
|
|
|
TEMPLATE_COLUMNS = { |
24
|
|
|
'breed': [ |
25
|
|
|
'Supplied breed', |
26
|
|
|
# 'Mapped breed', |
27
|
|
|
# 'Mapped breed ontology library', |
28
|
|
|
# 'Mapped breed ontology accession', |
29
|
|
|
'EFABIS Breed country', |
30
|
|
|
'Species', |
31
|
|
|
# 'Species ontology library', |
32
|
|
|
# 'Species ontology accession' |
33
|
|
|
], |
34
|
|
|
'animal': [ |
35
|
|
|
'Animal id in data source', |
36
|
|
|
'Animal description', |
37
|
|
|
'Alternative animal ID', |
38
|
|
|
'Father id in data source', |
39
|
|
|
'Mother id in data source', |
40
|
|
|
'Breed', |
41
|
|
|
'Species', |
42
|
|
|
'Sex', |
43
|
|
|
'Birth date', |
44
|
|
|
'Birth location', |
45
|
|
|
'Birth location longitude', |
46
|
|
|
'Birth location latitude', |
47
|
|
|
'Birth location accuracy' |
48
|
|
|
], |
49
|
|
|
'sample': [ |
50
|
|
|
'Sample id in data source', |
51
|
|
|
'Alternative sample ID', |
52
|
|
|
'Sample description', |
53
|
|
|
'Animal id in data source', |
54
|
|
|
'Specimen collection protocol', |
55
|
|
|
'availability', |
56
|
|
|
'Collection date', |
57
|
|
|
'Collection place latitude', |
58
|
|
|
'Collection place longitude', |
59
|
|
|
'Collection place', |
60
|
|
|
'Collection place accuracy', |
61
|
|
|
'Organism part', |
62
|
|
|
'Developmental stage', |
63
|
|
|
'Physiological stage', |
64
|
|
|
'Animal age at collection', |
65
|
|
|
'Sample storage', |
66
|
|
|
'Sample storage processing', |
67
|
|
|
'Sampling to preparation interval' |
68
|
|
|
] |
69
|
|
|
} |
70
|
|
|
|
71
|
|
|
|
72
|
|
|
class ExcelTemplateReader(FileDataSourceMixin): |
73
|
|
|
"""A class to read template excel files""" |
74
|
|
|
|
75
|
|
|
def __init__(self): |
76
|
|
|
# read xls file and track it |
77
|
|
|
self.book = None |
78
|
|
|
self.sheet_names = [] |
79
|
|
|
|
80
|
|
|
def read_file(self, filename): |
81
|
|
|
# read xls file and track it |
82
|
|
|
self.book = xlrd.open_workbook(filename) |
83
|
|
|
self.sheet_names = self.book.sheet_names() |
84
|
|
|
|
85
|
|
|
def check_sheets(self): |
86
|
|
|
"""Test for the minimal sheets required to upload data""" |
87
|
|
|
|
88
|
|
|
not_found = [] |
89
|
|
|
|
90
|
|
|
for sheet_name in TEMPLATE_COLUMNS.keys(): |
91
|
|
|
if sheet_name not in self.sheet_names: |
92
|
|
|
not_found.append(sheet_name) |
93
|
|
|
logger.error( |
94
|
|
|
"required sheet {name} not found in template".format( |
95
|
|
|
name=sheet_name) |
96
|
|
|
) |
97
|
|
|
|
98
|
|
|
if len(not_found) > 0: |
99
|
|
|
return False, not_found |
100
|
|
|
|
101
|
|
|
else: |
102
|
|
|
logger.debug("This seems to be a valid Template file") |
103
|
|
|
return True, not_found |
104
|
|
|
|
105
|
|
|
def check_columns(self): |
106
|
|
|
"""Test for minimal column required for template load""" |
107
|
|
|
|
108
|
|
|
not_found = defaultdict(list) |
109
|
|
|
|
110
|
|
|
for sheet_name in TEMPLATE_COLUMNS.keys(): |
111
|
|
|
# get a sheet from xls workbook |
112
|
|
|
sheet = self.book.sheet_by_name(sheet_name) |
113
|
|
|
|
114
|
|
|
# get header from sheet |
115
|
|
|
header = sheet.row_values(0) |
116
|
|
|
|
117
|
|
|
for column in TEMPLATE_COLUMNS[sheet_name]: |
118
|
|
|
if column not in header: |
119
|
|
|
not_found[sheet_name].append(column) |
120
|
|
|
logger.error( |
121
|
|
|
"required column {column} not found in sheet " |
122
|
|
|
"{sheet_name}".format( |
123
|
|
|
sheet_name=sheet_name, |
124
|
|
|
column=column) |
125
|
|
|
) |
126
|
|
|
|
127
|
|
|
if len(not_found) > 0: |
128
|
|
|
return False, not_found |
129
|
|
|
|
130
|
|
|
else: |
131
|
|
|
logger.debug("This seems to be a valid Template file") |
132
|
|
|
return True, not_found |
133
|
|
|
|
134
|
|
|
def get_sheet_records(self, sheet_name): |
135
|
|
|
"""Generic functions to iterate on excel records""" |
136
|
|
|
|
137
|
|
|
# this is the sheet I need |
138
|
|
|
sheet = self.book.sheet_by_name(sheet_name) |
139
|
|
|
|
140
|
|
|
# now get columns to create a collection objects |
141
|
|
|
header = sheet.row_values(0) |
142
|
|
|
|
143
|
|
|
column_idxs = {} |
144
|
|
|
|
145
|
|
|
# get the column index I need |
146
|
|
|
for column in TEMPLATE_COLUMNS[sheet_name]: |
147
|
|
|
idx = header.index(column) |
148
|
|
|
column_idxs[column.lower().replace(" ", "_")] = idx |
149
|
|
|
|
150
|
|
|
# get new column names |
151
|
|
|
columns = column_idxs.keys() |
152
|
|
|
|
153
|
|
|
# create a namedtuple object |
154
|
|
|
Record = namedtuple(sheet_name.capitalize(), columns) |
155
|
|
|
|
156
|
|
|
# iterate over record, mind the header column |
157
|
|
|
for i in range(1, sheet.nrows): |
158
|
|
|
# get a row from excel file |
159
|
|
|
row = sheet.row_values(i) |
160
|
|
|
|
161
|
|
|
# get the data I need |
162
|
|
|
data = [row[column_idxs[column]] for column in columns] |
163
|
|
|
|
164
|
|
|
# replace all empty occurences in a list |
165
|
|
|
data = [None if col in [""] |
166
|
|
|
else col for col in data] |
167
|
|
|
|
168
|
|
|
# fix date fields. Search for 'date' in column names |
169
|
|
|
date_idxs = [column_idxs[column] for column in columns if |
170
|
|
|
'date' in column] |
171
|
|
|
|
172
|
|
|
# fix date objects using datetime, as described here: |
173
|
|
|
# https://stackoverflow.com/a/13962976/4385116 |
174
|
|
|
for idx in date_idxs: |
175
|
|
|
if not data[idx]: |
176
|
|
|
continue |
177
|
|
|
|
178
|
|
|
data[idx] = datetime.datetime( |
179
|
|
|
*xlrd.xldate_as_tuple( |
180
|
|
|
data[idx], |
181
|
|
|
self.book.datemode |
182
|
|
|
) |
183
|
|
|
) |
184
|
|
|
|
185
|
|
|
# get a new object |
186
|
|
|
record = Record._make(data) |
187
|
|
|
|
188
|
|
|
yield record |
189
|
|
|
|
190
|
|
|
def get_breed_records(self): |
191
|
|
|
"""Iterate among breeds record""" |
192
|
|
|
|
193
|
|
|
# this is the sheet I need |
194
|
|
|
sheet_name = "breed" |
195
|
|
|
return self.get_sheet_records(sheet_name) |
196
|
|
|
|
197
|
|
|
def get_animal_records(self): |
198
|
|
|
"""Iterate among animal records""" |
199
|
|
|
|
200
|
|
|
# this is the sheet I need |
201
|
|
|
sheet_name = "animal" |
202
|
|
|
return self.get_sheet_records(sheet_name) |
203
|
|
|
|
204
|
|
|
def get_sample_records(self): |
205
|
|
|
"""Iterate among sample records""" |
206
|
|
|
|
207
|
|
|
# this is the sheet I need |
208
|
|
|
sheet_name = "sample" |
209
|
|
|
return self.get_sheet_records(sheet_name) |
210
|
|
|
|
211
|
|
|
def check_species(self, country): |
212
|
|
|
"""Check if all species are defined in UID DictSpecies""" |
213
|
|
|
|
214
|
|
|
column = 'species' |
215
|
|
|
item_set = set([breed.species for breed in self.get_breed_records()]) |
216
|
|
|
|
217
|
|
|
# call FileDataSourceMixin.check_species |
218
|
|
|
return super().check_species(column, item_set, country) |
219
|
|
|
|
220
|
|
|
def check_sex(self): |
221
|
|
|
"""Check that all sex records are present in database""" |
222
|
|
|
|
223
|
|
|
column = 'sex' |
224
|
|
|
item_set = set([animal.sex for animal in self.get_animal_records()]) |
225
|
|
|
|
226
|
|
|
# call FileDataSourceMixin.check_items |
227
|
|
|
return self.check_items(item_set, DictSex, column) |
228
|
|
|
|
229
|
|
|
def __check_accuracy(self, item_set): |
230
|
|
|
"""A generic method to test for accuracies""" |
231
|
|
|
|
232
|
|
|
# a list of not found terms and a status to see if something is missing |
233
|
|
|
# or not |
234
|
|
|
not_found = [] |
235
|
|
|
result = True |
236
|
|
|
|
237
|
|
|
for item in item_set: |
238
|
|
|
try: |
239
|
|
|
ACCURACIES.get_value_by_desc(item) |
240
|
|
|
|
241
|
|
|
except KeyError: |
242
|
|
|
logger.warning("accuracy level '%s' not found" % (item)) |
243
|
|
|
not_found.append(item) |
244
|
|
|
|
245
|
|
|
if len(not_found) != 0: |
246
|
|
|
result = False |
247
|
|
|
|
248
|
|
|
return result, not_found |
249
|
|
|
|
250
|
|
|
def check_accuracies(self): |
251
|
|
|
"""Check accuracy specified in table""" |
252
|
|
|
|
253
|
|
|
item_set = set([animal.birth_location_accuracy |
254
|
|
|
for animal in self.get_animal_records()]) |
255
|
|
|
|
256
|
|
|
# test for accuracy in animal table |
257
|
|
|
result_animal, not_found_animal = self.__check_accuracy(item_set) |
258
|
|
|
|
259
|
|
|
item_set = set([sample.collection_place_accuracy |
260
|
|
|
for sample in self.get_sample_records()]) |
261
|
|
|
|
262
|
|
|
# test for accuracy in sample table |
263
|
|
|
result_sample, not_found_sample = self.__check_accuracy(item_set) |
264
|
|
|
|
265
|
|
|
# merge two results |
266
|
|
|
check = result_animal and result_sample |
267
|
|
|
not_found = set(not_found_animal + not_found_sample) |
268
|
|
|
|
269
|
|
|
if check is False: |
270
|
|
|
logger.error( |
271
|
|
|
"Couldnt' find those accuracies in constants:") |
272
|
|
|
logger.error(not_found) |
273
|
|
|
|
274
|
|
|
return check, not_found |
275
|
|
|
|