1
|
|
|
""" |
2
|
|
|
Groundwork Excel read/write routines using openpyxl |
3
|
|
|
""" |
4
|
|
|
import datetime |
5
|
|
|
import json |
6
|
|
|
import os |
7
|
|
|
import re |
8
|
|
|
|
9
|
|
|
import openpyxl |
10
|
|
|
from groundwork.patterns.gw_base_pattern import GwBasePattern |
11
|
|
|
from jsonschema import validate, ValidationError, SchemaError |
12
|
|
|
from openpyxl.utils import get_column_letter |
|
|
|
|
13
|
|
|
|
14
|
|
|
JSON_SCHEMA_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets', 'excel_config_schema.json') |
15
|
|
|
|
16
|
|
|
|
17
|
|
|
class ExcelValidationPattern(GwBasePattern): |
18
|
|
|
""" |
19
|
|
|
Groundwork Excel read/write routines using openpyxl |
20
|
|
|
""" |
21
|
|
|
|
22
|
|
|
def __init__(self, *args, **kwargs): |
23
|
|
|
super(ExcelValidationPattern, self).__init__(*args, **kwargs) |
24
|
|
|
self.excel_validation = ExcelValidationPlugin(plugin=self) |
25
|
|
|
|
26
|
|
|
|
27
|
|
|
class ExcelValidationPlugin: |
28
|
|
|
""" |
29
|
|
|
Plugin level class of the Excel validation pattern. |
30
|
|
|
Note: There are no instances of the ExcelValidationPattern on the app. |
31
|
|
|
""" |
32
|
|
|
|
33
|
|
|
def __init__(self, plugin): |
34
|
|
|
""" |
35
|
|
|
:param plugin: The plugin, which wants to use documents |
36
|
|
|
:type plugin: GwBasePattern |
37
|
|
|
""" |
38
|
|
|
self._plugin = plugin |
39
|
|
|
self._app = plugin.app |
40
|
|
|
self.excel_config = None |
41
|
|
|
|
42
|
|
|
def read_excel(self, excel_config_json_path, excel_workbook_path): |
|
|
|
|
43
|
|
|
""" |
44
|
|
|
Main routine to read an Excel sheet. |
45
|
|
|
|
46
|
|
|
:param excel_config_json_path: The configuration json file |
47
|
|
|
:param excel_workbook_path: Relative or absolute path to an Excel workbook |
48
|
|
|
:return: Data dictionary with rows/colums as keys and a dictionary of "header": value as items |
49
|
|
|
""" |
50
|
|
|
|
51
|
|
|
# The exceptions raised in this method shall be raised to the plugin level |
52
|
|
|
self.excel_config = self._validate_json(excel_config_json_path) |
53
|
|
|
|
54
|
|
|
######################### |
55
|
|
|
# Get index configuration |
56
|
|
|
######################### |
57
|
|
|
orientation = self.excel_config['orientation'] |
58
|
|
|
header_idx_cfg_row_first = self.excel_config['headers_index_config']['row_index']['first'] |
59
|
|
|
header_idx_cfg_row_last = self.excel_config['headers_index_config']['row_index']['last'] |
60
|
|
|
header_idx_cfg_col_first = self.excel_config['headers_index_config']['column_index']['first'] |
61
|
|
|
header_idx_cfg_col_last = self.excel_config['headers_index_config']['column_index']['last'] |
62
|
|
|
data_idx_cfg_row_first = self.excel_config['data_index_config']['row_index']['first'] |
63
|
|
|
data_idx_cfg_row_last = self.excel_config['data_index_config']['row_index']['last'] |
64
|
|
|
data_idx_cfg_col_first = self.excel_config['data_index_config']['column_index']['first'] |
65
|
|
|
data_idx_cfg_col_last = self.excel_config['data_index_config']['column_index']['last'] |
66
|
|
|
|
67
|
|
|
############################################################################ |
68
|
|
|
# Rotate coordinates so we can work with virtually column based all the time |
69
|
|
|
# Just before addressing the cells a coordinate transformation is done again |
70
|
|
|
############################################################################ |
71
|
|
|
if orientation == 'column_based': |
72
|
|
|
# Correct the header and data matrix according to the orientation |
73
|
|
|
# We rotate the row_based layout so we can work with column_based in mind all the time |
74
|
|
|
# Name schema: corrected headers/data index config column/row first/last |
75
|
|
|
corr_header_idx_cfg_row_first = header_idx_cfg_row_first |
76
|
|
|
corr_header_idx_cfg_row_last = header_idx_cfg_row_last |
77
|
|
|
corr_header_idx_cfg_col_first = header_idx_cfg_col_first |
78
|
|
|
corr_header_idx_cfg_col_last = header_idx_cfg_col_last |
79
|
|
|
corr_data_idx_cfg_row_first = data_idx_cfg_row_first |
80
|
|
|
corr_data_idx_cfg_row_last = data_idx_cfg_row_last |
81
|
|
|
corr_data_idx_cfg_col_first = data_idx_cfg_col_first |
82
|
|
|
corr_data_idx_cfg_col_last = data_idx_cfg_col_last |
83
|
|
|
else: |
84
|
|
|
# row_based layout |
85
|
|
|
corr_header_idx_cfg_row_first = header_idx_cfg_col_first |
86
|
|
|
corr_header_idx_cfg_row_last = header_idx_cfg_col_last |
87
|
|
|
corr_header_idx_cfg_col_first = header_idx_cfg_row_first |
88
|
|
|
corr_header_idx_cfg_col_last = header_idx_cfg_row_last |
89
|
|
|
corr_data_idx_cfg_row_first = data_idx_cfg_col_first |
90
|
|
|
corr_data_idx_cfg_row_last = data_idx_cfg_col_last |
91
|
|
|
corr_data_idx_cfg_col_first = data_idx_cfg_row_first |
92
|
|
|
corr_data_idx_cfg_col_last = data_idx_cfg_row_last |
93
|
|
|
|
94
|
|
|
# Used for log and exception messages |
95
|
|
|
oriented_row_text = "row" if orientation == 'column_based' else "column" |
96
|
|
|
oriented_column_text = "column" if orientation == 'column_based' else "row" |
97
|
|
|
|
98
|
|
|
###################################### |
99
|
|
|
# Set defaults of headers_index_config |
100
|
|
|
###################################### |
101
|
|
|
if type(corr_header_idx_cfg_row_first) is not int: |
|
|
|
|
102
|
|
|
# Assume the user wants to use the first row as header row |
103
|
|
|
corr_header_idx_cfg_row_first = 1 |
104
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> first to 1".format( |
105
|
|
|
oriented_row_text)) |
106
|
|
|
|
107
|
|
|
if type(corr_header_idx_cfg_row_last) is not int: |
|
|
|
|
108
|
|
|
# Only 1 header row is supported currently, set last header row to first header row |
109
|
|
|
corr_header_idx_cfg_row_last = corr_header_idx_cfg_row_first |
110
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> last to {1}".format( |
111
|
|
|
oriented_row_text, corr_header_idx_cfg_row_first)) |
112
|
|
|
|
113
|
|
|
if type(corr_header_idx_cfg_col_first) is not int: |
|
|
|
|
114
|
|
|
# Assume the user wants to start at the first column |
115
|
|
|
corr_header_idx_cfg_col_first = 1 |
116
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> first to 1".format( |
117
|
|
|
oriented_column_text)) |
118
|
|
|
|
119
|
|
|
if type(corr_header_idx_cfg_col_last) is not int: |
|
|
|
|
120
|
|
|
if type(corr_data_idx_cfg_col_last) is int: |
|
|
|
|
121
|
|
|
# We don't have a last column in the header config, |
122
|
|
|
# so we use what we have in the data config |
123
|
|
|
corr_header_idx_cfg_col_last = corr_data_idx_cfg_col_last |
124
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> first to " |
125
|
|
|
"{1}".format(oriented_column_text, corr_header_idx_cfg_col_last)) |
126
|
|
|
|
127
|
|
|
################################### |
128
|
|
|
# Set defaults of data_index_config |
129
|
|
|
################################### |
130
|
|
|
if type(corr_data_idx_cfg_row_first) is not int: |
|
|
|
|
131
|
|
|
# Assume the first data row is the next after corr_header_idx_cfg_row_last |
132
|
|
|
corr_data_idx_cfg_row_first = corr_header_idx_cfg_row_last + 1 |
133
|
|
|
self._plugin.log.debug("Config update: Setting data_index_config -> {0}_index -> first to {1}".format( |
134
|
|
|
oriented_row_text, corr_data_idx_cfg_row_first)) |
135
|
|
|
|
136
|
|
|
# corr_data_idx_cfg_row_last has no defaults - the user input is master |
137
|
|
|
|
138
|
|
|
if type(corr_data_idx_cfg_col_first) is not int: |
|
|
|
|
139
|
|
|
# Assume the first data column is equal to the first header column |
140
|
|
|
corr_data_idx_cfg_col_first = corr_header_idx_cfg_col_first |
141
|
|
|
self._plugin.log.debug("Config update: Setting data_index_config -> {0}_index -> first to {1}".format( |
142
|
|
|
oriented_column_text, corr_data_idx_cfg_col_first)) |
143
|
|
|
|
144
|
|
|
if type(corr_data_idx_cfg_col_last) is not int: |
|
|
|
|
145
|
|
|
if type(corr_header_idx_cfg_col_last) is int: |
|
|
|
|
146
|
|
|
# We don't have a last column in the data config, |
147
|
|
|
# so we use what we have in the header config |
148
|
|
|
corr_data_idx_cfg_col_last = corr_header_idx_cfg_col_last |
149
|
|
|
self._plugin.log.debug("Config update: Setting data_index_config -> {0}_index -> last to " |
150
|
|
|
"{1}".format(oriented_column_text, corr_data_idx_cfg_col_last)) |
151
|
|
|
|
152
|
|
|
################################ |
153
|
|
|
# Some more logic checks on rows |
154
|
|
|
# (matrix size and row order) |
155
|
|
|
################################ |
156
|
|
|
if corr_header_idx_cfg_row_last != corr_header_idx_cfg_row_first: |
157
|
|
|
# We can compare both because at this point they have to be integer |
158
|
|
|
# Multi line headers given |
159
|
|
|
self._raise_value_error("Config error: Multi line (grouped) headers are not yet supported. " |
160
|
|
|
"First and last header {0} must be equal.".format(oriented_row_text)) |
161
|
|
|
|
162
|
|
|
if corr_data_idx_cfg_row_first <= corr_header_idx_cfg_row_last: |
163
|
|
|
# We can compare both because at this point they have to be integer |
164
|
|
|
# The data section is above the header section |
165
|
|
|
self._raise_value_error("Config error: headers_index_config -> {0}_index -> last is greater than " |
166
|
|
|
"data_index_config -> {0}_index -> first.".format(oriented_row_text)) |
167
|
|
|
|
168
|
|
|
if type(corr_data_idx_cfg_row_last) is int: |
|
|
|
|
169
|
|
|
if corr_data_idx_cfg_row_last < corr_data_idx_cfg_row_first: |
170
|
|
|
# The last data row is smaller than the first |
171
|
|
|
self._raise_value_error("Config error: data_index_config -> {0}_index -> first is greater than " |
172
|
|
|
"data_index_config -> {0}_index -> last.".format(oriented_row_text)) |
173
|
|
|
|
174
|
|
|
################################### |
175
|
|
|
# Some more logic checks on columns |
176
|
|
|
# (mismatches) |
177
|
|
|
################################### |
178
|
|
|
if corr_header_idx_cfg_col_first != corr_data_idx_cfg_col_first: |
179
|
|
|
# We can compare both because at this point they have to be integer |
180
|
|
|
# First column mismatch |
181
|
|
|
self._raise_value_error("Config error: header_index_config -> {0}_index -> first is not equal to " |
182
|
|
|
"data_index_config -> {0}_index -> first.".format(oriented_column_text)) |
183
|
|
|
|
184
|
|
|
if type(corr_header_idx_cfg_col_last) is int: |
|
|
|
|
185
|
|
|
if type(corr_data_idx_cfg_col_last) is int: |
|
|
|
|
186
|
|
|
if corr_header_idx_cfg_col_last != corr_data_idx_cfg_col_last: |
187
|
|
|
# Last columns are given but do not match |
188
|
|
|
self._raise_value_error( |
189
|
|
|
"Config error: header_index_config -> {0}_index -> last ({1}) is not equal to " |
190
|
|
|
"data_index_config -> {0}_index -> last ({2}).".format( |
191
|
|
|
oriented_column_text, |
192
|
|
|
corr_header_idx_cfg_col_last, |
193
|
|
|
corr_data_idx_cfg_col_last |
194
|
|
|
)) |
195
|
|
|
|
196
|
|
|
if type(corr_header_idx_cfg_col_last) is not int: |
|
|
|
|
197
|
|
|
if type(corr_data_idx_cfg_col_last) is not int: |
|
|
|
|
198
|
|
|
# The column count search is done on the header row, not on data |
199
|
|
|
if corr_data_idx_cfg_col_last != 'automatic': |
200
|
|
|
self._raise_value_error( |
201
|
|
|
"Config error: data_index_config -> {0}_index -> last ({1}) may only be an integer or " |
202
|
|
|
"contain the value 'automatic'.".format(oriented_column_text, |
203
|
|
|
corr_data_idx_cfg_col_last)) |
204
|
|
|
|
205
|
|
|
######################################### |
206
|
|
|
# Set defaults for optional config values |
207
|
|
|
######################################### |
208
|
|
|
if 'sheet_config' not in self.excel_config: |
209
|
|
|
self.excel_config['sheet_config'] = 'active' |
210
|
|
|
|
211
|
|
|
for data_type_config in self.excel_config['data_type_config']: |
212
|
|
|
# default for possible problems should be strict if user tells nothing |
213
|
|
|
if 'fail_on_type_error' not in data_type_config: |
214
|
|
|
data_type_config['fail_on_type_error'] = True |
215
|
|
|
if 'fail_on_empty_cell' not in data_type_config: |
216
|
|
|
data_type_config['fail_on_empty_cell'] = True |
217
|
|
|
if 'fail_on_header_not_found' not in data_type_config: |
218
|
|
|
data_type_config['fail_on_header_not_found'] = True |
219
|
|
|
|
220
|
|
|
# default type is automatic |
221
|
|
|
if 'type' not in data_type_config: |
222
|
|
|
data_type_config['type'] = {'base': 'automatic'} |
223
|
|
|
|
224
|
|
|
# Set global filter properties |
225
|
|
|
# Defensive variant is True for all options |
226
|
|
|
if 'filter_properties' not in self.excel_config: |
227
|
|
|
self.excel_config['filter_properties'] = {} |
228
|
|
|
if 'excluded_fail_on_type_error' not in self.excel_config['filter_properties']: |
229
|
|
|
self.excel_config['filter_properties']['excluded_fail_on_type_error'] = True |
230
|
|
|
if 'excluded_fail_on_empty_cell' not in self.excel_config['filter_properties']: |
231
|
|
|
self.excel_config['filter_properties']['excluded_fail_on_empty_cell'] = True |
232
|
|
|
if 'excluded_enable_logging' not in self.excel_config['filter_properties']: |
233
|
|
|
self.excel_config['filter_properties']['excluded_enable_logging'] = True |
234
|
|
|
|
235
|
|
|
############################ |
236
|
|
|
# Get the workbook and sheet |
237
|
|
|
############################ |
238
|
|
|
wb = openpyxl.load_workbook(excel_workbook_path, data_only=True) |
|
|
|
|
239
|
|
|
ws = self._get_sheet(wb) |
|
|
|
|
240
|
|
|
|
241
|
|
|
############################# |
242
|
|
|
# Determine header row length |
243
|
|
|
############################# |
244
|
|
|
if type(corr_header_idx_cfg_col_last) == int: |
|
|
|
|
245
|
|
|
# corr_header_idx_cfg_col_last already has the final value |
246
|
|
|
pass |
247
|
|
|
elif corr_header_idx_cfg_col_last == 'automatic': |
248
|
|
|
# automatic: use the length of the header row |
249
|
|
|
corr_header_idx_cfg_col_last = len( |
250
|
|
|
ws[self._transform_coordinates(row=corr_header_idx_cfg_row_first)]) |
251
|
|
|
self._plugin.log.debug("Config update: Last header {0} was set to {1} using the 'automatic' " |
252
|
|
|
"mechanism.".format(oriented_column_text, corr_header_idx_cfg_col_last)) |
253
|
|
|
else: |
254
|
|
|
# severalEmptyCells chosen |
255
|
|
|
target_empty_cell_count = int(corr_header_idx_cfg_col_last.split(':')[1]) |
256
|
|
|
empty_cell_count = 0 |
257
|
|
|
curr_column = corr_header_idx_cfg_col_first |
258
|
|
|
while empty_cell_count < target_empty_cell_count: |
259
|
|
|
value = ws[self._transform_coordinates(corr_header_idx_cfg_row_first, |
260
|
|
|
curr_column)].value |
261
|
|
|
if value is None: |
262
|
|
|
empty_cell_count += 1 |
263
|
|
|
curr_column += 1 |
264
|
|
|
corr_header_idx_cfg_col_last = curr_column - target_empty_cell_count - 1 |
265
|
|
|
self._plugin.log.debug("Config update: Last header {0} was set to {1} using the 'automatic' " |
266
|
|
|
"mechanism.".format(oriented_column_text, corr_header_idx_cfg_col_last)) |
267
|
|
|
|
268
|
|
|
################################### |
269
|
|
|
# Determine header column locations |
270
|
|
|
################################### |
271
|
|
|
spreadsheet_headers2columns = {} |
272
|
|
|
for column in range(corr_header_idx_cfg_col_first, corr_header_idx_cfg_col_last + 1): |
273
|
|
|
value = ws[self._transform_coordinates(corr_header_idx_cfg_row_first, column)].value |
274
|
|
|
if value is not None: |
275
|
|
|
spreadsheet_headers2columns[value] = column |
276
|
|
|
else: |
277
|
|
|
# if the value is None we have either |
278
|
|
|
# - one or more empty header cell in between 2 filled header cells. |
279
|
|
|
# - some empty header cells at the end of the row. |
280
|
|
|
# That might happen because when choosing 'automatic' header row detection, openpyxl functionality |
281
|
|
|
# is used. It always returns the length of the longest row in the whole sheet. |
282
|
|
|
# If that is not the header row, we have empty cells at the end of the header row. |
283
|
|
|
# However, that is not a problem as header values 'None' are not added. |
284
|
|
|
pass |
285
|
|
|
spreadsheet_headers = spreadsheet_headers2columns.keys() |
286
|
|
|
|
287
|
|
|
############################################################### |
288
|
|
|
# Check for not existing headers on spreadsheet and config side |
289
|
|
|
############################################################### |
290
|
|
|
# Build a data_type_config dictionary with header as key |
291
|
|
|
config_header_dict = {x['header']: x for x in self.excel_config['data_type_config']} |
292
|
|
|
|
293
|
|
|
# Check: Are config data_type_config headers unique? |
294
|
|
|
if len(config_header_dict.keys()) != len(set(config_header_dict.keys())): |
295
|
|
|
self._raise_value_error("Config error: data_type_config -> header duplicate entries found.") |
296
|
|
|
|
297
|
|
|
# Check for configured headers not found in spreadsheet |
298
|
|
|
missing_headers_in_spreadsheet = list(set(config_header_dict.keys()) - set(spreadsheet_headers)) |
299
|
|
|
for header in missing_headers_in_spreadsheet: |
300
|
|
|
# Check if the fail_on_header_not_found is true |
301
|
|
|
data_type = [x for x in self.excel_config['data_type_config'] if x['header'] == header][0] |
302
|
|
|
msg = u"Config error: The header '{0}' could not be found in the spreadsheet.".format(header) |
303
|
|
|
if data_type['fail_on_header_not_found']: |
304
|
|
|
self._raise_value_error(msg) |
305
|
|
|
else: |
306
|
|
|
self._plugin.log.error(msg) |
307
|
|
|
|
308
|
|
|
# Check for spreadsheet headers not found in config |
309
|
|
|
missing_headers_in_config = list(set(spreadsheet_headers) - set(config_header_dict.keys())) |
310
|
|
|
if missing_headers_in_config: |
311
|
|
|
self._plugin.log.debug(u"The following spreadsheet headers are not configured for reading: {0}".format( |
312
|
|
|
', '.join([header.replace('\n', ' ') for header in missing_headers_in_config]))) |
313
|
|
|
for header in missing_headers_in_config: |
314
|
|
|
del spreadsheet_headers2columns[header] |
315
|
|
|
|
316
|
|
|
######################### |
317
|
|
|
# Determine last data row |
318
|
|
|
######################### |
319
|
|
|
if type(corr_data_idx_cfg_row_last) == int: |
|
|
|
|
320
|
|
|
# do nothing, the value is already set |
321
|
|
|
pass |
322
|
|
|
elif corr_data_idx_cfg_row_last == 'automatic': |
323
|
|
|
# Take data from library |
324
|
|
|
corr_data_idx_cfg_row_last = corr_data_idx_cfg_row_first |
325
|
|
|
for curr_column in range(corr_header_idx_cfg_col_first, |
326
|
|
|
corr_header_idx_cfg_col_last): |
327
|
|
|
len_curr_column = len(ws[self._transform_coordinates(column=curr_column)]) |
328
|
|
|
if len_curr_column > corr_data_idx_cfg_row_last: |
329
|
|
|
corr_data_idx_cfg_row_last = len_curr_column |
330
|
|
|
self._plugin.log.debug("Config update: Last data {0} was set to {1} using the 'automatic' " |
331
|
|
|
"mechanism.".format(oriented_row_text, corr_data_idx_cfg_row_last)) |
332
|
|
|
else: |
333
|
|
|
# severalEmptyCells is chosen |
334
|
|
|
target_empty_rows_count = int(corr_data_idx_cfg_row_last.split(':')[1]) |
335
|
|
|
last_row_detected = False |
336
|
|
|
curr_row = corr_data_idx_cfg_row_first |
337
|
|
|
empty_rows_count = 0 |
338
|
|
|
while not last_row_detected: |
339
|
|
|
# go through rows |
340
|
|
|
all_columns_empty = True |
341
|
|
|
for header in spreadsheet_headers2columns: |
342
|
|
|
curr_column = spreadsheet_headers2columns[header] |
343
|
|
|
value = ws[self._transform_coordinates(curr_row, curr_column)].value |
344
|
|
|
if value is not None: |
345
|
|
|
all_columns_empty = False |
346
|
|
|
break |
347
|
|
|
if all_columns_empty: |
348
|
|
|
empty_rows_count += 1 |
349
|
|
|
if empty_rows_count >= target_empty_rows_count: |
350
|
|
|
last_row_detected = True |
351
|
|
|
else: |
352
|
|
|
curr_row += 1 |
353
|
|
|
corr_data_idx_cfg_row_last = curr_row - target_empty_rows_count |
354
|
|
|
self._plugin.log.debug("Config update: Last data {0} was set to {1} using the 'severalEmptyCells' " |
355
|
|
|
"mechanism.".format(oriented_row_text, corr_data_idx_cfg_row_last)) |
356
|
|
|
|
357
|
|
|
################################################# |
358
|
|
|
# Go through the rows, read and validate the data |
359
|
|
|
################################################# |
360
|
|
|
final_dict = {} |
361
|
|
|
for curr_row in range(corr_data_idx_cfg_row_first, corr_data_idx_cfg_row_last + 1): |
|
|
|
|
362
|
|
|
# Go through rows |
363
|
|
|
final_dict[curr_row] = {} |
364
|
|
|
|
365
|
|
|
msg_queue = { |
366
|
|
|
'fail_on_empty_cell': { |
367
|
|
|
'exceptions': [], |
368
|
|
|
'logs': [] |
369
|
|
|
}, |
370
|
|
|
'fail_on_type_error': { |
371
|
|
|
'exceptions': [], |
372
|
|
|
'logs': [] |
373
|
|
|
} |
374
|
|
|
} |
375
|
|
|
is_row_excluded = False |
376
|
|
|
|
377
|
|
|
for header in spreadsheet_headers2columns: |
378
|
|
|
# Go through columns |
379
|
|
|
curr_column = spreadsheet_headers2columns[header] |
380
|
|
|
cell_index_str = self._transform_coordinates(curr_row, curr_column) |
381
|
|
|
value = ws[cell_index_str].value |
382
|
|
|
|
383
|
|
|
# Start the validation |
384
|
|
|
config_header = config_header_dict[header] |
385
|
|
|
|
386
|
|
|
if value is None: |
387
|
|
|
msg = u"The '{0}' in cell {1} is empty".format(header, cell_index_str) |
388
|
|
|
if config_header['fail_on_empty_cell']: |
389
|
|
|
msg_queue['fail_on_empty_cell']['exceptions'].append(msg) |
390
|
|
|
else: |
391
|
|
|
msg_queue['fail_on_empty_cell']['logs'].append(msg) |
392
|
|
|
else: |
393
|
|
|
if config_header['type']['base'] == 'automatic': |
394
|
|
|
pass |
395
|
|
|
elif config_header['type']['base'] == 'date': |
396
|
|
|
if not isinstance(value, datetime.datetime): |
397
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
398
|
|
|
u'specification is datetime'.format(value, cell_index_str, type(value)) |
399
|
|
|
if config_header['fail_on_type_error']: |
400
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
401
|
|
|
else: |
402
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
403
|
|
|
elif config_header['type']['base'] == 'enum': |
404
|
|
|
filtered_enum_values = [] |
405
|
|
|
if 'filter' in config_header['type']: |
406
|
|
|
filtered_enum_values = config_header['type']['filter']['whitelist_values'] |
407
|
|
|
if not self._is_string(value): |
408
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
409
|
|
|
u'specification is a string type (enum)'.format(value, cell_index_str, type(value)) |
410
|
|
|
if config_header['fail_on_type_error']: |
411
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
412
|
|
|
else: |
413
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
414
|
|
|
if filtered_enum_values: |
415
|
|
|
msg = 'Cannot apply enum filter to cell {1} because the type check failed.' |
416
|
|
|
self._plugin.log.error(msg) |
417
|
|
|
else: |
418
|
|
|
valid_values = config_header['type']['enum_values'] |
419
|
|
|
if value not in valid_values: |
420
|
|
|
msg = u'The value {0} in cell {1} is not contained in the given enum ' \ |
421
|
|
|
u'[{2}]'.format(value, cell_index_str, ', '.join(valid_values)) |
422
|
|
|
if config_header['fail_on_type_error']: |
423
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
424
|
|
|
else: |
425
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
426
|
|
|
if filtered_enum_values: |
427
|
|
|
msg = 'Cannot apply enum filter to cell {1} because the enum values check failed.' |
428
|
|
|
self._plugin.log.error(msg) |
429
|
|
|
else: |
430
|
|
|
if filtered_enum_values and value not in filtered_enum_values: |
431
|
|
|
is_row_excluded = True |
432
|
|
|
self._plugin.log.debug(u"The {0} {1} was excluded due to an exclude filter on " |
433
|
|
|
u"cell {2} ({3} not in [{4}]).".format( |
434
|
|
|
oriented_row_text, |
435
|
|
|
curr_row, |
436
|
|
|
cell_index_str, |
437
|
|
|
value, |
438
|
|
|
', '.join(filtered_enum_values))) |
439
|
|
|
|
440
|
|
|
elif config_header['type']['base'] == 'float': |
441
|
|
|
# TODO Allow int, too |
|
|
|
|
442
|
|
View Code Duplication |
if not isinstance(value, float): |
|
|
|
|
443
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
444
|
|
|
u'specification is float'.format(value, cell_index_str, type(value)) |
445
|
|
|
if config_header['fail_on_type_error']: |
446
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
447
|
|
|
else: |
448
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
449
|
|
|
else: |
450
|
|
|
if 'minimum' in config_header['type']: |
451
|
|
|
if value < config_header['type']['minimum']: |
452
|
|
|
msg = u'The value {0} in cell {1} is smaller than the given minimum ' \ |
453
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['minimum']) |
454
|
|
|
if config_header['fail_on_type_error']: |
455
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
456
|
|
|
else: |
457
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
458
|
|
|
if 'maximum' in config_header['type']: |
459
|
|
|
if value > config_header['type']['maximum']: |
460
|
|
|
msg = u'The value {0} in cell {1} is greater than the given maximum ' \ |
461
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['maximum']) |
462
|
|
|
if config_header['fail_on_type_error']: |
463
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
464
|
|
|
else: |
465
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
466
|
|
|
elif config_header['type']['base'] == 'integer': |
467
|
|
|
# Integer values stored by Excel are returned as float (e.g. 3465.0) |
468
|
|
|
# So we have to check if the float can be converted to int without precision loss |
469
|
|
|
if isinstance(value, float): |
470
|
|
|
if value.is_integer(): |
471
|
|
|
# the typecast to int may return int or long, depending on the size of value |
472
|
|
|
value = int(value) |
473
|
|
View Code Duplication |
if self._is_type_int_long(value): |
|
|
|
|
474
|
|
|
if 'minimum' in config_header['type']: |
475
|
|
|
if value < config_header['type']['minimum']: |
476
|
|
|
msg = u'The value {0} in cell {1} is smaller than the given minimum ' \ |
477
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['minimum']) |
478
|
|
|
if config_header['fail_on_type_error']: |
479
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
480
|
|
|
else: |
481
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
482
|
|
|
if 'maximum' in config_header['type']: |
483
|
|
|
if value > config_header['type']['maximum']: |
484
|
|
|
msg = u'The value {0} in cell {1} is greater than the given maximum ' \ |
485
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['maximum']) |
486
|
|
|
if config_header['fail_on_type_error']: |
487
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
488
|
|
|
else: |
489
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
490
|
|
|
else: |
491
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
492
|
|
|
u'specification is int'.format(value, cell_index_str, type(value)) |
493
|
|
|
if config_header['fail_on_type_error']: |
494
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
495
|
|
|
else: |
496
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
497
|
|
|
|
498
|
|
|
elif config_header['type']['base'] == 'string': |
499
|
|
|
if self._is_type_numeric(value): |
500
|
|
|
convert_numbers = False |
501
|
|
|
if 'convert_numbers' in config_header['type']: |
502
|
|
|
convert_numbers = config_header['type']['convert_numbers'] |
503
|
|
|
if convert_numbers: |
504
|
|
|
value = str(value) |
505
|
|
|
if not self._is_string(value): |
506
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
507
|
|
|
u'specification is string'.format(value, cell_index_str, type(value)) |
508
|
|
|
if config_header['fail_on_type_error']: |
509
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
510
|
|
|
else: |
511
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
512
|
|
|
else: |
513
|
|
|
if 'pattern' in config_header['type']: |
514
|
|
|
if re.search(config_header['type']['pattern'], value) is None: |
515
|
|
|
msg = u'The value {0} in cell {1} does not follow the ' \ |
516
|
|
|
u'given pattern {2}'.format(value, cell_index_str, |
517
|
|
|
config_header['type']['pattern']) |
518
|
|
|
if config_header['fail_on_type_error']: |
519
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
520
|
|
|
else: |
521
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
522
|
|
|
|
523
|
|
|
final_dict[curr_row][header] = value |
524
|
|
|
|
525
|
|
|
if is_row_excluded: |
526
|
|
|
# All messages are either raised as exception or logged |
527
|
|
|
# If at all depends on the settings in self.excel_config['filter_properties'] |
528
|
|
|
|
529
|
|
|
for msg_type, messages in msg_queue.items(): |
530
|
|
|
for msg in messages['exceptions']: |
531
|
|
|
if self.excel_config['filter_properties']['excluded_' + msg_type]: |
532
|
|
|
# This ends the program on the first exception message |
533
|
|
|
self._raise_value_error(msg) |
534
|
|
|
elif self.excel_config['filter_properties']['excluded_enable_logging']: |
535
|
|
|
# In case we don't want to raise type errors as exception |
536
|
|
|
# we log them in case the user configured so |
537
|
|
|
self._plugin.log.warn(msg) |
538
|
|
|
for msg in messages['logs']: |
539
|
|
|
if self.excel_config['filter_properties']['excluded_enable_logging']: |
540
|
|
|
self._plugin.log.warn(msg) |
541
|
|
|
del final_dict[curr_row] |
542
|
|
|
else: |
543
|
|
|
for msg_type, messages in msg_queue.items(): |
544
|
|
|
for msg in messages['exceptions']: |
545
|
|
|
# This ends the program on the first exception message |
546
|
|
|
self._raise_value_error(msg) |
547
|
|
|
for msg in messages['logs']: |
548
|
|
|
self._plugin.log.warn(msg) |
549
|
|
|
|
550
|
|
|
return final_dict |
551
|
|
|
|
552
|
|
|
@staticmethod |
553
|
|
|
def _is_string(value): |
554
|
|
|
""" |
555
|
|
|
Tests if a value is of a string type |
556
|
|
|
Python2: unicode, str |
557
|
|
|
Python3: str |
558
|
|
|
|
559
|
|
|
:param value: Variable of any type |
560
|
|
|
:return: True if value is of string type else false |
561
|
|
|
""" |
562
|
|
|
try: |
563
|
|
|
str_type = basestring |
564
|
|
|
except NameError: |
565
|
|
|
str_type = str |
566
|
|
|
return isinstance(value, str_type) |
567
|
|
|
|
568
|
|
|
@staticmethod |
569
|
|
|
def _is_type_numeric(value): |
570
|
|
|
""" |
571
|
|
|
Tests if a value is of an numeric type. |
572
|
|
|
Python2: int, long, float |
573
|
|
|
Python3: int, float |
574
|
|
|
|
575
|
|
|
:param value: Variable of any type |
576
|
|
|
:return: True if value is of integer type else false |
577
|
|
|
""" |
578
|
|
|
numeric_types = (int, float) |
579
|
|
|
try: |
580
|
|
|
long |
|
|
|
|
581
|
|
|
numeric_types = (int, long, float) |
582
|
|
|
except NameError: |
583
|
|
|
pass |
584
|
|
|
|
585
|
|
|
return isinstance(value, numeric_types) |
586
|
|
|
|
587
|
|
|
@staticmethod |
588
|
|
|
def _is_type_int_long(value): |
589
|
|
|
""" |
590
|
|
|
Tests if a value is of an integer type. |
591
|
|
|
Python2: int, long |
592
|
|
|
Python3: int |
593
|
|
|
|
594
|
|
|
:param value: Variable of any type |
595
|
|
|
:return: True if value is of integer type else false |
596
|
|
|
""" |
597
|
|
|
numeric_types = (int) |
598
|
|
|
try: |
599
|
|
|
long |
|
|
|
|
600
|
|
|
numeric_types = (int, long) |
601
|
|
|
except NameError: |
602
|
|
|
pass |
603
|
|
|
|
604
|
|
|
return isinstance(value, numeric_types) |
605
|
|
|
|
606
|
|
|
def _validate_json(self, excel_config_json_path): |
607
|
|
|
|
608
|
|
|
try: |
609
|
|
|
with open(excel_config_json_path) as file_pointer: |
610
|
|
|
json_obj = json.load(file_pointer) |
611
|
|
|
# the file is not deserializable as a json object |
612
|
|
|
except ValueError as exc: |
613
|
|
|
self._plugin.log.error('Malformed JSON file: {0} \n {1}'.format(excel_config_json_path, exc)) |
614
|
|
|
raise exc |
615
|
|
|
# some os error occured (e.g file not found or malformed path string) |
616
|
|
|
# have to catch two exception classes: in py2 : IOError; py3: OSError |
617
|
|
|
except (IOError, OSError) as exc: |
618
|
|
|
self._plugin.log.error(exc) |
619
|
|
|
# raise only OSError to make error handling in caller easier |
620
|
|
|
raise OSError() |
621
|
|
|
|
622
|
|
|
# validate json object if schema file path is there; otherwise throw warning |
623
|
|
|
try: |
624
|
|
|
with open(JSON_SCHEMA_FILE_PATH) as file_pointer: |
625
|
|
|
schema_obj = json.load(file_pointer) |
626
|
|
|
# the file is not deserializable as a json object |
627
|
|
|
except ValueError as exc: |
628
|
|
|
self._plugin.log.error('Malformed JSON schema file: {0} \n {1}'.format(JSON_SCHEMA_FILE_PATH, exc)) |
629
|
|
|
raise exc |
630
|
|
|
# some os error occured (e.g file not found or malformed path string) |
631
|
|
|
# have to catch two exception classes: in py2 : IOError; py3: OSError |
632
|
|
|
except (IOError, OSError) as exc: |
633
|
|
|
self._plugin.log.error(exc) |
634
|
|
|
# raise only OSError to make error handling in caller easier |
635
|
|
|
raise OSError() |
636
|
|
|
|
637
|
|
|
# do the validation |
638
|
|
|
try: |
639
|
|
|
validate(json_obj, schema_obj) |
640
|
|
|
except ValidationError as error: |
641
|
|
|
self._plugin.log.error("Validation failed: {0}".format(error.message)) |
642
|
|
|
raise |
643
|
|
|
except SchemaError: |
644
|
|
|
self._plugin.log.error("Invalid schema file: {0}".format(JSON_SCHEMA_FILE_PATH)) |
645
|
|
|
raise |
646
|
|
|
|
647
|
|
|
return json_obj |
648
|
|
|
|
649
|
|
|
def _get_sheet(self, workbook): |
650
|
|
|
|
651
|
|
|
# get sheet |
652
|
|
|
worksheet = None |
653
|
|
|
if type(self.excel_config['sheet_config']) == int: |
|
|
|
|
654
|
|
|
worksheet = workbook.worksheets[self.excel_config['sheet_config'] - 1] |
655
|
|
|
elif self.excel_config['sheet_config'] == 'active': |
656
|
|
|
worksheet = workbook.active |
657
|
|
|
elif self.excel_config['sheet_config'].startswith('name'): |
658
|
|
|
worksheet = workbook[self.excel_config['sheet_config'].split(':')[1]] |
659
|
|
|
elif self.excel_config['sheet_config'] == 'first': |
660
|
|
|
worksheet = workbook.worksheets[0] |
661
|
|
|
elif self.excel_config['sheet_config'] == 'last': |
662
|
|
|
worksheet = workbook.worksheets[len(workbook.get_sheet_names()) - 1] |
663
|
|
|
else: |
664
|
|
|
# This cannot happen if json validation was ok |
665
|
|
|
pass |
666
|
|
|
return worksheet |
667
|
|
|
|
668
|
|
|
def _raise_value_error(self, msg): |
669
|
|
|
self._plugin.log.error(msg) |
670
|
|
|
raise ValueError(msg.encode('UTF-8', 'ignore')) |
671
|
|
|
|
672
|
|
|
def _transform_coordinates(self, row=None, column=None): |
673
|
|
|
if row is None and column is None: |
674
|
|
|
raise ValueError("_transform_coordinates: row and column cannot both be None.") |
675
|
|
|
target_str = '' |
676
|
|
|
if self.excel_config['orientation'] == 'column_based': |
677
|
|
|
if column is not None: |
678
|
|
|
target_str = get_column_letter(column) |
679
|
|
|
if row is not None: |
680
|
|
|
target_str += str(row) |
681
|
|
|
else: |
682
|
|
|
if row is not None: |
683
|
|
|
target_str = get_column_letter(row) |
684
|
|
|
if column is not None: |
685
|
|
|
target_str += str(column) |
686
|
|
|
return target_str |
687
|
|
|
|