|
1
|
|
|
""" |
|
2
|
|
|
Groundwork Excel read/write routines using openpyxl |
|
3
|
|
|
""" |
|
4
|
|
|
import datetime |
|
5
|
|
|
import json |
|
6
|
|
|
import os |
|
7
|
|
|
import re |
|
8
|
|
|
|
|
9
|
|
|
import openpyxl |
|
10
|
|
|
from groundwork.patterns.gw_base_pattern import GwBasePattern |
|
11
|
|
|
from jsonschema import validate, ValidationError, SchemaError |
|
12
|
|
|
from openpyxl.utils import get_column_letter |
|
|
|
|
|
|
13
|
|
|
|
|
14
|
|
|
JSON_SCHEMA_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets', 'excel_config_schema.json') |
|
15
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
class ExcelValidationPattern(GwBasePattern): |
|
18
|
|
|
""" |
|
19
|
|
|
Groundwork Excel read/write routines using openpyxl |
|
20
|
|
|
""" |
|
21
|
|
|
|
|
22
|
|
|
def __init__(self, *args, **kwargs): |
|
23
|
|
|
super(ExcelValidationPattern, self).__init__(*args, **kwargs) |
|
24
|
|
|
self.excel_validation = ExcelValidationPlugin(plugin=self) |
|
25
|
|
|
|
|
26
|
|
|
|
|
27
|
|
|
class ExcelValidationPlugin: |
|
28
|
|
|
""" |
|
29
|
|
|
Plugin level class of the Excel validation pattern. |
|
30
|
|
|
Note: There are no instances of the ExcelValidationPattern on the app. |
|
31
|
|
|
""" |
|
32
|
|
|
|
|
33
|
|
|
def __init__(self, plugin): |
|
34
|
|
|
""" |
|
35
|
|
|
:param plugin: The plugin, which wants to use documents |
|
36
|
|
|
:type plugin: GwBasePattern |
|
37
|
|
|
""" |
|
38
|
|
|
self._plugin = plugin |
|
39
|
|
|
self._app = plugin.app |
|
40
|
|
|
self.excel_config = None |
|
41
|
|
|
|
|
42
|
|
|
def read_excel(self, excel_config_json_path, excel_workbook_path): |
|
|
|
|
|
|
43
|
|
|
""" |
|
44
|
|
|
Main routine to read an Excel sheet. |
|
45
|
|
|
|
|
46
|
|
|
:param excel_config_json_path: The configuration json file |
|
47
|
|
|
:param excel_workbook_path: Relative or absolute path to an Excel workbook |
|
48
|
|
|
:return: Data dictionary with rows/colums as keys and a dictionary of "header": value as items |
|
49
|
|
|
""" |
|
50
|
|
|
|
|
51
|
|
|
# The exceptions raised in this method shall be raised to the plugin level |
|
52
|
|
|
self.excel_config = self._validate_json(excel_config_json_path) |
|
53
|
|
|
|
|
54
|
|
|
######################### |
|
55
|
|
|
# Get index configuration |
|
56
|
|
|
######################### |
|
57
|
|
|
orientation = self.excel_config['orientation'] |
|
58
|
|
|
header_idx_cfg_row_first = self.excel_config['headers_index_config']['row_index']['first'] |
|
59
|
|
|
header_idx_cfg_row_last = self.excel_config['headers_index_config']['row_index']['last'] |
|
60
|
|
|
header_idx_cfg_col_first = self.excel_config['headers_index_config']['column_index']['first'] |
|
61
|
|
|
header_idx_cfg_col_last = self.excel_config['headers_index_config']['column_index']['last'] |
|
62
|
|
|
data_idx_cfg_row_first = self.excel_config['data_index_config']['row_index']['first'] |
|
63
|
|
|
data_idx_cfg_row_last = self.excel_config['data_index_config']['row_index']['last'] |
|
64
|
|
|
data_idx_cfg_col_first = self.excel_config['data_index_config']['column_index']['first'] |
|
65
|
|
|
data_idx_cfg_col_last = self.excel_config['data_index_config']['column_index']['last'] |
|
66
|
|
|
|
|
67
|
|
|
############################################################################ |
|
68
|
|
|
# Rotate coordinates so we can work with virtually column based all the time |
|
69
|
|
|
# Just before addressing the cells a coordinate transformation is done again |
|
70
|
|
|
############################################################################ |
|
71
|
|
|
if orientation == 'column_based': |
|
72
|
|
|
# Correct the header and data matrix according to the orientation |
|
73
|
|
|
# We rotate the row_based layout so we can work with column_based in mind all the time |
|
74
|
|
|
# Name schema: corrected headers/data index config column/row first/last |
|
75
|
|
|
corr_header_idx_cfg_row_first = header_idx_cfg_row_first |
|
76
|
|
|
corr_header_idx_cfg_row_last = header_idx_cfg_row_last |
|
77
|
|
|
corr_header_idx_cfg_col_first = header_idx_cfg_col_first |
|
78
|
|
|
corr_header_idx_cfg_col_last = header_idx_cfg_col_last |
|
79
|
|
|
corr_data_idx_cfg_row_first = data_idx_cfg_row_first |
|
80
|
|
|
corr_data_idx_cfg_row_last = data_idx_cfg_row_last |
|
81
|
|
|
corr_data_idx_cfg_col_first = data_idx_cfg_col_first |
|
82
|
|
|
corr_data_idx_cfg_col_last = data_idx_cfg_col_last |
|
83
|
|
|
else: |
|
84
|
|
|
# row_based layout |
|
85
|
|
|
corr_header_idx_cfg_row_first = header_idx_cfg_col_first |
|
86
|
|
|
corr_header_idx_cfg_row_last = header_idx_cfg_col_last |
|
87
|
|
|
corr_header_idx_cfg_col_first = header_idx_cfg_row_first |
|
88
|
|
|
corr_header_idx_cfg_col_last = header_idx_cfg_row_last |
|
89
|
|
|
corr_data_idx_cfg_row_first = data_idx_cfg_col_first |
|
90
|
|
|
corr_data_idx_cfg_row_last = data_idx_cfg_col_last |
|
91
|
|
|
corr_data_idx_cfg_col_first = data_idx_cfg_row_first |
|
92
|
|
|
corr_data_idx_cfg_col_last = data_idx_cfg_row_last |
|
93
|
|
|
|
|
94
|
|
|
# Used for log and exception messages |
|
95
|
|
|
oriented_row_text = "row" if orientation == 'column_based' else "column" |
|
96
|
|
|
oriented_column_text = "column" if orientation == 'column_based' else "row" |
|
97
|
|
|
|
|
98
|
|
|
###################################### |
|
99
|
|
|
# Set defaults of headers_index_config |
|
100
|
|
|
###################################### |
|
101
|
|
|
if type(corr_header_idx_cfg_row_first) is not int: |
|
|
|
|
|
|
102
|
|
|
# Assume the user wants to use the first row as header row |
|
103
|
|
|
corr_header_idx_cfg_row_first = 1 |
|
104
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> first to 1".format( |
|
105
|
|
|
oriented_row_text)) |
|
106
|
|
|
|
|
107
|
|
|
if type(corr_header_idx_cfg_row_last) is not int: |
|
|
|
|
|
|
108
|
|
|
# Only 1 header row is supported currently, set last header row to first header row |
|
109
|
|
|
corr_header_idx_cfg_row_last = corr_header_idx_cfg_row_first |
|
110
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> last to {1}".format( |
|
111
|
|
|
oriented_row_text, corr_header_idx_cfg_row_first)) |
|
112
|
|
|
|
|
113
|
|
|
if type(corr_header_idx_cfg_col_first) is not int: |
|
|
|
|
|
|
114
|
|
|
# Assume the user wants to start at the first column |
|
115
|
|
|
corr_header_idx_cfg_col_first = 1 |
|
116
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> first to 1".format( |
|
117
|
|
|
oriented_column_text)) |
|
118
|
|
|
|
|
119
|
|
|
if type(corr_header_idx_cfg_col_last) is not int: |
|
|
|
|
|
|
120
|
|
|
if type(corr_data_idx_cfg_col_last) is int: |
|
|
|
|
|
|
121
|
|
|
# We don't have a last column in the header config, |
|
122
|
|
|
# so we use what we have in the data config |
|
123
|
|
|
corr_header_idx_cfg_col_last = corr_data_idx_cfg_col_last |
|
124
|
|
|
self._plugin.log.debug("Config update: Setting headers_index_config -> {0}_index -> first to " |
|
125
|
|
|
"{1}".format(oriented_column_text, corr_header_idx_cfg_col_last)) |
|
126
|
|
|
|
|
127
|
|
|
################################### |
|
128
|
|
|
# Set defaults of data_index_config |
|
129
|
|
|
################################### |
|
130
|
|
|
if type(corr_data_idx_cfg_row_first) is not int: |
|
|
|
|
|
|
131
|
|
|
# Assume the first data row is the next after corr_header_idx_cfg_row_last |
|
132
|
|
|
corr_data_idx_cfg_row_first = corr_header_idx_cfg_row_last + 1 |
|
133
|
|
|
self._plugin.log.debug("Config update: Setting data_index_config -> {0}_index -> first to {1}".format( |
|
134
|
|
|
oriented_row_text, corr_data_idx_cfg_row_first)) |
|
135
|
|
|
|
|
136
|
|
|
# corr_data_idx_cfg_row_last has no defaults - the user input is master |
|
137
|
|
|
|
|
138
|
|
|
if type(corr_data_idx_cfg_col_first) is not int: |
|
|
|
|
|
|
139
|
|
|
# Assume the first data column is equal to the first header column |
|
140
|
|
|
corr_data_idx_cfg_col_first = corr_header_idx_cfg_col_first |
|
141
|
|
|
self._plugin.log.debug("Config update: Setting data_index_config -> {0}_index -> first to {1}".format( |
|
142
|
|
|
oriented_column_text, corr_data_idx_cfg_col_first)) |
|
143
|
|
|
|
|
144
|
|
|
if type(corr_data_idx_cfg_col_last) is not int: |
|
|
|
|
|
|
145
|
|
|
if type(corr_header_idx_cfg_col_last) is int: |
|
|
|
|
|
|
146
|
|
|
# We don't have a last column in the data config, |
|
147
|
|
|
# so we use what we have in the header config |
|
148
|
|
|
corr_data_idx_cfg_col_last = corr_header_idx_cfg_col_last |
|
149
|
|
|
self._plugin.log.debug("Config update: Setting data_index_config -> {0}_index -> last to " |
|
150
|
|
|
"{1}".format(oriented_column_text, corr_data_idx_cfg_col_last)) |
|
151
|
|
|
|
|
152
|
|
|
################################ |
|
153
|
|
|
# Some more logic checks on rows |
|
154
|
|
|
# (matrix size and row order) |
|
155
|
|
|
################################ |
|
156
|
|
|
if corr_header_idx_cfg_row_last != corr_header_idx_cfg_row_first: |
|
157
|
|
|
# We can compare both because at this point they have to be integer |
|
158
|
|
|
# Multi line headers given |
|
159
|
|
|
self._raise_value_error("Config error: Multi line (grouped) headers are not yet supported. " |
|
160
|
|
|
"First and last header {0} must be equal.".format(oriented_row_text)) |
|
161
|
|
|
|
|
162
|
|
|
if corr_data_idx_cfg_row_first <= corr_header_idx_cfg_row_last: |
|
163
|
|
|
# We can compare both because at this point they have to be integer |
|
164
|
|
|
# The data section is above the header section |
|
165
|
|
|
self._raise_value_error("Config error: headers_index_config -> {0}_index -> last is greater than " |
|
166
|
|
|
"data_index_config -> {0}_index -> first.".format(oriented_row_text)) |
|
167
|
|
|
|
|
168
|
|
|
if type(corr_data_idx_cfg_row_last) is int: |
|
|
|
|
|
|
169
|
|
|
if corr_data_idx_cfg_row_last < corr_data_idx_cfg_row_first: |
|
170
|
|
|
# The last data row is smaller than the first |
|
171
|
|
|
self._raise_value_error("Config error: data_index_config -> {0}_index -> first is greater than " |
|
172
|
|
|
"data_index_config -> {0}_index -> last.".format(oriented_row_text)) |
|
173
|
|
|
|
|
174
|
|
|
################################### |
|
175
|
|
|
# Some more logic checks on columns |
|
176
|
|
|
# (mismatches) |
|
177
|
|
|
################################### |
|
178
|
|
|
if corr_header_idx_cfg_col_first != corr_data_idx_cfg_col_first: |
|
179
|
|
|
# We can compare both because at this point they have to be integer |
|
180
|
|
|
# First column mismatch |
|
181
|
|
|
self._raise_value_error("Config error: header_index_config -> {0}_index -> first is not equal to " |
|
182
|
|
|
"data_index_config -> {0}_index -> first.".format(oriented_column_text)) |
|
183
|
|
|
|
|
184
|
|
|
if type(corr_header_idx_cfg_col_last) is int: |
|
|
|
|
|
|
185
|
|
|
if type(corr_data_idx_cfg_col_last) is int: |
|
|
|
|
|
|
186
|
|
|
if corr_header_idx_cfg_col_last != corr_data_idx_cfg_col_last: |
|
187
|
|
|
# Last columns are given but do not match |
|
188
|
|
|
self._raise_value_error( |
|
189
|
|
|
"Config error: header_index_config -> {0}_index -> last ({1}) is not equal to " |
|
190
|
|
|
"data_index_config -> {0}_index -> last ({2}).".format( |
|
191
|
|
|
oriented_column_text, |
|
192
|
|
|
corr_header_idx_cfg_col_last, |
|
193
|
|
|
corr_data_idx_cfg_col_last |
|
194
|
|
|
)) |
|
195
|
|
|
|
|
196
|
|
|
if type(corr_header_idx_cfg_col_last) is not int: |
|
|
|
|
|
|
197
|
|
|
if type(corr_data_idx_cfg_col_last) is not int: |
|
|
|
|
|
|
198
|
|
|
# The column count search is done on the header row, not on data |
|
199
|
|
|
if corr_data_idx_cfg_col_last != 'automatic': |
|
200
|
|
|
self._raise_value_error( |
|
201
|
|
|
"Config error: data_index_config -> {0}_index -> last ({1}) may only be an integer or " |
|
202
|
|
|
"contain the value 'automatic'.".format(oriented_column_text, |
|
203
|
|
|
corr_data_idx_cfg_col_last)) |
|
204
|
|
|
|
|
205
|
|
|
######################################### |
|
206
|
|
|
# Set defaults for optional config values |
|
207
|
|
|
######################################### |
|
208
|
|
|
if 'sheet_config' not in self.excel_config: |
|
209
|
|
|
self.excel_config['sheet_config'] = 'active' |
|
210
|
|
|
|
|
211
|
|
|
for data_type_config in self.excel_config['data_type_config']: |
|
212
|
|
|
# default for possible problems should be strict if user tells nothing |
|
213
|
|
|
if 'fail_on_type_error' not in data_type_config: |
|
214
|
|
|
data_type_config['fail_on_type_error'] = True |
|
215
|
|
|
if 'fail_on_empty_cell' not in data_type_config: |
|
216
|
|
|
data_type_config['fail_on_empty_cell'] = True |
|
217
|
|
|
if 'fail_on_header_not_found' not in data_type_config: |
|
218
|
|
|
data_type_config['fail_on_header_not_found'] = True |
|
219
|
|
|
|
|
220
|
|
|
# default type is automatic |
|
221
|
|
|
if 'type' not in data_type_config: |
|
222
|
|
|
data_type_config['type'] = {'base': 'automatic'} |
|
223
|
|
|
|
|
224
|
|
|
# Set global filter properties |
|
225
|
|
|
# Defensive variant is True for all options |
|
226
|
|
|
if 'filter_properties' not in self.excel_config: |
|
227
|
|
|
self.excel_config['filter_properties'] = {} |
|
228
|
|
|
if 'excluded_fail_on_type_error' not in self.excel_config['filter_properties']: |
|
229
|
|
|
self.excel_config['filter_properties']['excluded_fail_on_type_error'] = True |
|
230
|
|
|
if 'excluded_fail_on_empty_cell' not in self.excel_config['filter_properties']: |
|
231
|
|
|
self.excel_config['filter_properties']['excluded_fail_on_empty_cell'] = True |
|
232
|
|
|
if 'excluded_enable_logging' not in self.excel_config['filter_properties']: |
|
233
|
|
|
self.excel_config['filter_properties']['excluded_enable_logging'] = True |
|
234
|
|
|
|
|
235
|
|
|
############################ |
|
236
|
|
|
# Get the workbook and sheet |
|
237
|
|
|
############################ |
|
238
|
|
|
wb = openpyxl.load_workbook(excel_workbook_path, data_only=True) |
|
|
|
|
|
|
239
|
|
|
ws = self._get_sheet(wb) |
|
|
|
|
|
|
240
|
|
|
|
|
241
|
|
|
############################# |
|
242
|
|
|
# Determine header row length |
|
243
|
|
|
############################# |
|
244
|
|
|
if type(corr_header_idx_cfg_col_last) == int: |
|
|
|
|
|
|
245
|
|
|
# corr_header_idx_cfg_col_last already has the final value |
|
246
|
|
|
pass |
|
247
|
|
|
elif corr_header_idx_cfg_col_last == 'automatic': |
|
248
|
|
|
# automatic: use the length of the header row |
|
249
|
|
|
corr_header_idx_cfg_col_last = len( |
|
250
|
|
|
ws[self._transform_coordinates(row=corr_header_idx_cfg_row_first)]) |
|
251
|
|
|
self._plugin.log.debug("Config update: Last header {0} was set to {1} using the 'automatic' " |
|
252
|
|
|
"mechanism.".format(oriented_column_text, corr_header_idx_cfg_col_last)) |
|
253
|
|
|
else: |
|
254
|
|
|
# severalEmptyCells chosen |
|
255
|
|
|
target_empty_cell_count = int(corr_header_idx_cfg_col_last.split(':')[1]) |
|
256
|
|
|
empty_cell_count = 0 |
|
257
|
|
|
curr_column = corr_header_idx_cfg_col_first |
|
258
|
|
|
while empty_cell_count < target_empty_cell_count: |
|
259
|
|
|
value = ws[self._transform_coordinates(corr_header_idx_cfg_row_first, |
|
260
|
|
|
curr_column)].value |
|
261
|
|
|
if value is None: |
|
262
|
|
|
empty_cell_count += 1 |
|
263
|
|
|
curr_column += 1 |
|
264
|
|
|
corr_header_idx_cfg_col_last = curr_column - target_empty_cell_count - 1 |
|
265
|
|
|
self._plugin.log.debug("Config update: Last header {0} was set to {1} using the 'automatic' " |
|
266
|
|
|
"mechanism.".format(oriented_column_text, corr_header_idx_cfg_col_last)) |
|
267
|
|
|
|
|
268
|
|
|
################################### |
|
269
|
|
|
# Determine header column locations |
|
270
|
|
|
################################### |
|
271
|
|
|
spreadsheet_headers2columns = {} |
|
272
|
|
|
for column in range(corr_header_idx_cfg_col_first, corr_header_idx_cfg_col_last + 1): |
|
273
|
|
|
value = ws[self._transform_coordinates(corr_header_idx_cfg_row_first, column)].value |
|
274
|
|
|
if value is not None: |
|
275
|
|
|
spreadsheet_headers2columns[value] = column |
|
276
|
|
|
else: |
|
277
|
|
|
# if the value is None we have either |
|
278
|
|
|
# - one or more empty header cell in between 2 filled header cells. |
|
279
|
|
|
# - some empty header cells at the end of the row. |
|
280
|
|
|
# That might happen because when choosing 'automatic' header row detection, openpyxl functionality |
|
281
|
|
|
# is used. It always returns the length of the longest row in the whole sheet. |
|
282
|
|
|
# If that is not the header row, we have empty cells at the end of the header row. |
|
283
|
|
|
# However, that is not a problem as header values 'None' are not added. |
|
284
|
|
|
pass |
|
285
|
|
|
spreadsheet_headers = spreadsheet_headers2columns.keys() |
|
286
|
|
|
|
|
287
|
|
|
############################################################### |
|
288
|
|
|
# Check for not existing headers on spreadsheet and config side |
|
289
|
|
|
############################################################### |
|
290
|
|
|
# Build a data_type_config dictionary with header as key |
|
291
|
|
|
config_header_dict = {x['header']: x for x in self.excel_config['data_type_config']} |
|
292
|
|
|
|
|
293
|
|
|
# Check: Are config data_type_config headers unique? |
|
294
|
|
|
if len(config_header_dict.keys()) != len(set(config_header_dict.keys())): |
|
295
|
|
|
self._raise_value_error("Config error: data_type_config -> header duplicate entries found.") |
|
296
|
|
|
|
|
297
|
|
|
# Check for configured headers not found in spreadsheet |
|
298
|
|
|
missing_headers_in_spreadsheet = list(set(config_header_dict.keys()) - set(spreadsheet_headers)) |
|
299
|
|
|
for header in missing_headers_in_spreadsheet: |
|
300
|
|
|
# Check if the fail_on_header_not_found is true |
|
301
|
|
|
data_type = [x for x in self.excel_config['data_type_config'] if x['header'] == header][0] |
|
302
|
|
|
msg = u"Config error: The header '{0}' could not be found in the spreadsheet.".format(header) |
|
303
|
|
|
if data_type['fail_on_header_not_found']: |
|
304
|
|
|
self._raise_value_error(msg) |
|
305
|
|
|
else: |
|
306
|
|
|
self._plugin.log.error(msg) |
|
307
|
|
|
|
|
308
|
|
|
# Check for spreadsheet headers not found in config |
|
309
|
|
|
missing_headers_in_config = list(set(spreadsheet_headers) - set(config_header_dict.keys())) |
|
310
|
|
|
if missing_headers_in_config: |
|
311
|
|
|
self._plugin.log.debug(u"The following spreadsheet headers are not configured for reading: {0}".format( |
|
312
|
|
|
', '.join([header.replace('\n', ' ') for header in missing_headers_in_config]))) |
|
313
|
|
|
for header in missing_headers_in_config: |
|
314
|
|
|
del spreadsheet_headers2columns[header] |
|
315
|
|
|
|
|
316
|
|
|
######################### |
|
317
|
|
|
# Determine last data row |
|
318
|
|
|
######################### |
|
319
|
|
|
if type(corr_data_idx_cfg_row_last) == int: |
|
|
|
|
|
|
320
|
|
|
# do nothing, the value is already set |
|
321
|
|
|
pass |
|
322
|
|
|
elif corr_data_idx_cfg_row_last == 'automatic': |
|
323
|
|
|
# Take data from library |
|
324
|
|
|
corr_data_idx_cfg_row_last = corr_data_idx_cfg_row_first |
|
325
|
|
|
for curr_column in range(corr_header_idx_cfg_col_first, |
|
326
|
|
|
corr_header_idx_cfg_col_last): |
|
327
|
|
|
len_curr_column = len(ws[self._transform_coordinates(column=curr_column)]) |
|
328
|
|
|
if len_curr_column > corr_data_idx_cfg_row_last: |
|
329
|
|
|
corr_data_idx_cfg_row_last = len_curr_column |
|
330
|
|
|
self._plugin.log.debug("Config update: Last data {0} was set to {1} using the 'automatic' " |
|
331
|
|
|
"mechanism.".format(oriented_row_text, corr_data_idx_cfg_row_last)) |
|
332
|
|
|
else: |
|
333
|
|
|
# severalEmptyCells is chosen |
|
334
|
|
|
target_empty_rows_count = int(corr_data_idx_cfg_row_last.split(':')[1]) |
|
335
|
|
|
last_row_detected = False |
|
336
|
|
|
curr_row = corr_data_idx_cfg_row_first |
|
337
|
|
|
empty_rows_count = 0 |
|
338
|
|
|
while not last_row_detected: |
|
339
|
|
|
# go through rows |
|
340
|
|
|
all_columns_empty = True |
|
341
|
|
|
for header in spreadsheet_headers2columns: |
|
342
|
|
|
curr_column = spreadsheet_headers2columns[header] |
|
343
|
|
|
value = ws[self._transform_coordinates(curr_row, curr_column)].value |
|
344
|
|
|
if value is not None: |
|
345
|
|
|
all_columns_empty = False |
|
346
|
|
|
break |
|
347
|
|
|
if all_columns_empty: |
|
348
|
|
|
empty_rows_count += 1 |
|
349
|
|
|
if empty_rows_count >= target_empty_rows_count: |
|
350
|
|
|
last_row_detected = True |
|
351
|
|
|
else: |
|
352
|
|
|
curr_row += 1 |
|
353
|
|
|
corr_data_idx_cfg_row_last = curr_row - target_empty_rows_count |
|
354
|
|
|
self._plugin.log.debug("Config update: Last data {0} was set to {1} using the 'severalEmptyCells' " |
|
355
|
|
|
"mechanism.".format(oriented_row_text, corr_data_idx_cfg_row_last)) |
|
356
|
|
|
|
|
357
|
|
|
################################################# |
|
358
|
|
|
# Go through the rows, read and validate the data |
|
359
|
|
|
################################################# |
|
360
|
|
|
final_dict = {} |
|
361
|
|
|
for curr_row in range(corr_data_idx_cfg_row_first, corr_data_idx_cfg_row_last + 1): |
|
|
|
|
|
|
362
|
|
|
# Go through rows |
|
363
|
|
|
final_dict[curr_row] = {} |
|
364
|
|
|
|
|
365
|
|
|
msg_queue = { |
|
366
|
|
|
'fail_on_empty_cell': { |
|
367
|
|
|
'exceptions': [], |
|
368
|
|
|
'logs': [] |
|
369
|
|
|
}, |
|
370
|
|
|
'fail_on_type_error': { |
|
371
|
|
|
'exceptions': [], |
|
372
|
|
|
'logs': [] |
|
373
|
|
|
} |
|
374
|
|
|
} |
|
375
|
|
|
is_row_excluded = False |
|
376
|
|
|
|
|
377
|
|
|
for header in spreadsheet_headers2columns: |
|
378
|
|
|
# Go through columns |
|
379
|
|
|
curr_column = spreadsheet_headers2columns[header] |
|
380
|
|
|
cell_index_str = self._transform_coordinates(curr_row, curr_column) |
|
381
|
|
|
value = ws[cell_index_str].value |
|
382
|
|
|
|
|
383
|
|
|
# Start the validation |
|
384
|
|
|
config_header = config_header_dict[header] |
|
385
|
|
|
|
|
386
|
|
|
if value is None: |
|
387
|
|
|
msg = u"The '{0}' in cell {1} is empty".format(header, cell_index_str) |
|
388
|
|
|
if config_header['fail_on_empty_cell']: |
|
389
|
|
|
msg_queue['fail_on_empty_cell']['exceptions'].append(msg) |
|
390
|
|
|
else: |
|
391
|
|
|
msg_queue['fail_on_empty_cell']['logs'].append(msg) |
|
392
|
|
|
else: |
|
393
|
|
|
if config_header['type']['base'] == 'automatic': |
|
394
|
|
|
pass |
|
395
|
|
|
elif config_header['type']['base'] == 'date': |
|
396
|
|
|
if not isinstance(value, datetime.datetime): |
|
397
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
|
398
|
|
|
u'specification is datetime'.format(value, cell_index_str, type(value)) |
|
399
|
|
|
if config_header['fail_on_type_error']: |
|
400
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
401
|
|
|
else: |
|
402
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
403
|
|
|
elif config_header['type']['base'] == 'enum': |
|
404
|
|
|
filtered_enum_values = [] |
|
405
|
|
|
if 'filter' in config_header['type']: |
|
406
|
|
|
filtered_enum_values = config_header['type']['filter']['whitelist_values'] |
|
407
|
|
|
if not self._is_string(value): |
|
408
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
|
409
|
|
|
u'specification is a string type (enum)'.format(value, cell_index_str, type(value)) |
|
410
|
|
|
if config_header['fail_on_type_error']: |
|
411
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
412
|
|
|
else: |
|
413
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
414
|
|
|
if filtered_enum_values: |
|
415
|
|
|
msg = 'Cannot apply enum filter to cell {1} because the type check failed.' |
|
416
|
|
|
self._plugin.log.error(msg) |
|
417
|
|
|
else: |
|
418
|
|
|
valid_values = config_header['type']['enum_values'] |
|
419
|
|
|
if value not in valid_values: |
|
420
|
|
|
msg = u'The value {0} in cell {1} is not contained in the given enum ' \ |
|
421
|
|
|
u'[{2}]'.format(value, cell_index_str, ', '.join(valid_values)) |
|
422
|
|
|
if config_header['fail_on_type_error']: |
|
423
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
424
|
|
|
else: |
|
425
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
426
|
|
|
if filtered_enum_values: |
|
427
|
|
|
msg = 'Cannot apply enum filter to cell {1} because the enum values check failed.' |
|
428
|
|
|
self._plugin.log.error(msg) |
|
429
|
|
|
else: |
|
430
|
|
|
if filtered_enum_values and value not in filtered_enum_values: |
|
431
|
|
|
is_row_excluded = True |
|
432
|
|
|
self._plugin.log.debug(u"The {0} {1} was excluded due to an exclude filter on " |
|
433
|
|
|
u"cell {2} ({3} not in [{4}]).".format( |
|
434
|
|
|
oriented_row_text, |
|
435
|
|
|
curr_row, |
|
436
|
|
|
cell_index_str, |
|
437
|
|
|
value, |
|
438
|
|
|
', '.join(filtered_enum_values))) |
|
439
|
|
|
|
|
440
|
|
|
elif config_header['type']['base'] == 'float': |
|
441
|
|
|
# TODO Allow int, too |
|
|
|
|
|
|
442
|
|
View Code Duplication |
if not isinstance(value, float): |
|
|
|
|
|
|
443
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
|
444
|
|
|
u'specification is float'.format(value, cell_index_str, type(value)) |
|
445
|
|
|
if config_header['fail_on_type_error']: |
|
446
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
447
|
|
|
else: |
|
448
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
449
|
|
|
else: |
|
450
|
|
|
if 'minimum' in config_header['type']: |
|
451
|
|
|
if value < config_header['type']['minimum']: |
|
452
|
|
|
msg = u'The value {0} in cell {1} is smaller than the given minimum ' \ |
|
453
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['minimum']) |
|
454
|
|
|
if config_header['fail_on_type_error']: |
|
455
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
456
|
|
|
else: |
|
457
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
458
|
|
|
if 'maximum' in config_header['type']: |
|
459
|
|
|
if value > config_header['type']['maximum']: |
|
460
|
|
|
msg = u'The value {0} in cell {1} is greater than the given maximum ' \ |
|
461
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['maximum']) |
|
462
|
|
|
if config_header['fail_on_type_error']: |
|
463
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
464
|
|
|
else: |
|
465
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
466
|
|
|
elif config_header['type']['base'] == 'integer': |
|
467
|
|
|
# Integer values stored by Excel are returned as float (e.g. 3465.0) |
|
468
|
|
|
# So we have to check if the float can be converted to int without precision loss |
|
469
|
|
|
if isinstance(value, float): |
|
470
|
|
|
if value.is_integer(): |
|
471
|
|
|
# the typecast to int may return int or long, depending on the size of value |
|
472
|
|
|
value = int(value) |
|
473
|
|
View Code Duplication |
if self._is_type_int_long(value): |
|
|
|
|
|
|
474
|
|
|
if 'minimum' in config_header['type']: |
|
475
|
|
|
if value < config_header['type']['minimum']: |
|
476
|
|
|
msg = u'The value {0} in cell {1} is smaller than the given minimum ' \ |
|
477
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['minimum']) |
|
478
|
|
|
if config_header['fail_on_type_error']: |
|
479
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
480
|
|
|
else: |
|
481
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
482
|
|
|
if 'maximum' in config_header['type']: |
|
483
|
|
|
if value > config_header['type']['maximum']: |
|
484
|
|
|
msg = u'The value {0} in cell {1} is greater than the given maximum ' \ |
|
485
|
|
|
u'of {2}'.format(value, cell_index_str, config_header['type']['maximum']) |
|
486
|
|
|
if config_header['fail_on_type_error']: |
|
487
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
488
|
|
|
else: |
|
489
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
490
|
|
|
else: |
|
491
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
|
492
|
|
|
u'specification is int'.format(value, cell_index_str, type(value)) |
|
493
|
|
|
if config_header['fail_on_type_error']: |
|
494
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
495
|
|
|
else: |
|
496
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
497
|
|
|
|
|
498
|
|
|
elif config_header['type']['base'] == 'string': |
|
499
|
|
|
if self._is_type_numeric(value): |
|
500
|
|
|
convert_numbers = False |
|
501
|
|
|
if 'convert_numbers' in config_header['type']: |
|
502
|
|
|
convert_numbers = config_header['type']['convert_numbers'] |
|
503
|
|
|
if convert_numbers: |
|
504
|
|
|
value = str(value) |
|
505
|
|
|
if not self._is_string(value): |
|
506
|
|
|
msg = u'The value {0} in cell {1} is of type {2}; required by ' \ |
|
507
|
|
|
u'specification is string'.format(value, cell_index_str, type(value)) |
|
508
|
|
|
if config_header['fail_on_type_error']: |
|
509
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
510
|
|
|
else: |
|
511
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
512
|
|
|
else: |
|
513
|
|
|
if 'pattern' in config_header['type']: |
|
514
|
|
|
if re.search(config_header['type']['pattern'], value) is None: |
|
515
|
|
|
msg = u'The value {0} in cell {1} does not follow the ' \ |
|
516
|
|
|
u'given pattern {2}'.format(value, cell_index_str, |
|
517
|
|
|
config_header['type']['pattern']) |
|
518
|
|
|
if config_header['fail_on_type_error']: |
|
519
|
|
|
msg_queue['fail_on_type_error']['exceptions'].append(msg) |
|
520
|
|
|
else: |
|
521
|
|
|
msg_queue['fail_on_type_error']['logs'].append(msg) |
|
522
|
|
|
|
|
523
|
|
|
final_dict[curr_row][header] = value |
|
524
|
|
|
|
|
525
|
|
|
if is_row_excluded: |
|
526
|
|
|
# All messages are either raised as exception or logged |
|
527
|
|
|
# If at all depends on the settings in self.excel_config['filter_properties'] |
|
528
|
|
|
|
|
529
|
|
|
for msg_type, messages in msg_queue.items(): |
|
530
|
|
|
for msg in messages['exceptions']: |
|
531
|
|
|
if self.excel_config['filter_properties']['excluded_' + msg_type]: |
|
532
|
|
|
# This ends the program on the first exception message |
|
533
|
|
|
self._raise_value_error(msg) |
|
534
|
|
|
elif self.excel_config['filter_properties']['excluded_enable_logging']: |
|
535
|
|
|
# In case we don't want to raise type errors as exception |
|
536
|
|
|
# we log them in case the user configured so |
|
537
|
|
|
self._plugin.log.warn(msg) |
|
538
|
|
|
for msg in messages['logs']: |
|
539
|
|
|
if self.excel_config['filter_properties']['excluded_enable_logging']: |
|
540
|
|
|
self._plugin.log.warn(msg) |
|
541
|
|
|
del final_dict[curr_row] |
|
542
|
|
|
else: |
|
543
|
|
|
for msg_type, messages in msg_queue.items(): |
|
544
|
|
|
for msg in messages['exceptions']: |
|
545
|
|
|
# This ends the program on the first exception message |
|
546
|
|
|
self._raise_value_error(msg) |
|
547
|
|
|
for msg in messages['logs']: |
|
548
|
|
|
self._plugin.log.warn(msg) |
|
549
|
|
|
|
|
550
|
|
|
return final_dict |
|
551
|
|
|
|
|
552
|
|
|
@staticmethod |
|
553
|
|
|
def _is_string(value): |
|
554
|
|
|
""" |
|
555
|
|
|
Tests if a value is of a string type |
|
556
|
|
|
Python2: unicode, str |
|
557
|
|
|
Python3: str |
|
558
|
|
|
|
|
559
|
|
|
:param value: Variable of any type |
|
560
|
|
|
:return: True if value is of string type else false |
|
561
|
|
|
""" |
|
562
|
|
|
try: |
|
563
|
|
|
str_type = basestring |
|
564
|
|
|
except NameError: |
|
565
|
|
|
str_type = str |
|
566
|
|
|
return isinstance(value, str_type) |
|
567
|
|
|
|
|
568
|
|
|
@staticmethod |
|
569
|
|
|
def _is_type_numeric(value): |
|
570
|
|
|
""" |
|
571
|
|
|
Tests if a value is of an numeric type. |
|
572
|
|
|
Python2: int, long, float |
|
573
|
|
|
Python3: int, float |
|
574
|
|
|
|
|
575
|
|
|
:param value: Variable of any type |
|
576
|
|
|
:return: True if value is of integer type else false |
|
577
|
|
|
""" |
|
578
|
|
|
numeric_types = (int, float) |
|
579
|
|
|
try: |
|
580
|
|
|
long |
|
|
|
|
|
|
581
|
|
|
numeric_types = (int, long, float) |
|
582
|
|
|
except NameError: |
|
583
|
|
|
pass |
|
584
|
|
|
|
|
585
|
|
|
return isinstance(value, numeric_types) |
|
586
|
|
|
|
|
587
|
|
|
@staticmethod |
|
588
|
|
|
def _is_type_int_long(value): |
|
589
|
|
|
""" |
|
590
|
|
|
Tests if a value is of an integer type. |
|
591
|
|
|
Python2: int, long |
|
592
|
|
|
Python3: int |
|
593
|
|
|
|
|
594
|
|
|
:param value: Variable of any type |
|
595
|
|
|
:return: True if value is of integer type else false |
|
596
|
|
|
""" |
|
597
|
|
|
numeric_types = (int) |
|
598
|
|
|
try: |
|
599
|
|
|
long |
|
|
|
|
|
|
600
|
|
|
numeric_types = (int, long) |
|
601
|
|
|
except NameError: |
|
602
|
|
|
pass |
|
603
|
|
|
|
|
604
|
|
|
return isinstance(value, numeric_types) |
|
605
|
|
|
|
|
606
|
|
|
def _validate_json(self, excel_config_json_path): |
|
607
|
|
|
|
|
608
|
|
|
try: |
|
609
|
|
|
with open(excel_config_json_path) as file_pointer: |
|
610
|
|
|
json_obj = json.load(file_pointer) |
|
611
|
|
|
# the file is not deserializable as a json object |
|
612
|
|
|
except ValueError as exc: |
|
613
|
|
|
self._plugin.log.error('Malformed JSON file: {0} \n {1}'.format(excel_config_json_path, exc)) |
|
614
|
|
|
raise exc |
|
615
|
|
|
# some os error occured (e.g file not found or malformed path string) |
|
616
|
|
|
# have to catch two exception classes: in py2 : IOError; py3: OSError |
|
617
|
|
|
except (IOError, OSError) as exc: |
|
618
|
|
|
self._plugin.log.error(exc) |
|
619
|
|
|
# raise only OSError to make error handling in caller easier |
|
620
|
|
|
raise OSError() |
|
621
|
|
|
|
|
622
|
|
|
# validate json object if schema file path is there; otherwise throw warning |
|
623
|
|
|
try: |
|
624
|
|
|
with open(JSON_SCHEMA_FILE_PATH) as file_pointer: |
|
625
|
|
|
schema_obj = json.load(file_pointer) |
|
626
|
|
|
# the file is not deserializable as a json object |
|
627
|
|
|
except ValueError as exc: |
|
628
|
|
|
self._plugin.log.error('Malformed JSON schema file: {0} \n {1}'.format(JSON_SCHEMA_FILE_PATH, exc)) |
|
629
|
|
|
raise exc |
|
630
|
|
|
# some os error occured (e.g file not found or malformed path string) |
|
631
|
|
|
# have to catch two exception classes: in py2 : IOError; py3: OSError |
|
632
|
|
|
except (IOError, OSError) as exc: |
|
633
|
|
|
self._plugin.log.error(exc) |
|
634
|
|
|
# raise only OSError to make error handling in caller easier |
|
635
|
|
|
raise OSError() |
|
636
|
|
|
|
|
637
|
|
|
# do the validation |
|
638
|
|
|
try: |
|
639
|
|
|
validate(json_obj, schema_obj) |
|
640
|
|
|
except ValidationError as error: |
|
641
|
|
|
self._plugin.log.error("Validation failed: {0}".format(error.message)) |
|
642
|
|
|
raise |
|
643
|
|
|
except SchemaError: |
|
644
|
|
|
self._plugin.log.error("Invalid schema file: {0}".format(JSON_SCHEMA_FILE_PATH)) |
|
645
|
|
|
raise |
|
646
|
|
|
|
|
647
|
|
|
return json_obj |
|
648
|
|
|
|
|
649
|
|
|
def _get_sheet(self, workbook): |
|
650
|
|
|
|
|
651
|
|
|
# get sheet |
|
652
|
|
|
worksheet = None |
|
653
|
|
|
if type(self.excel_config['sheet_config']) == int: |
|
|
|
|
|
|
654
|
|
|
worksheet = workbook.worksheets[self.excel_config['sheet_config'] - 1] |
|
655
|
|
|
elif self.excel_config['sheet_config'] == 'active': |
|
656
|
|
|
worksheet = workbook.active |
|
657
|
|
|
elif self.excel_config['sheet_config'].startswith('name'): |
|
658
|
|
|
worksheet = workbook[self.excel_config['sheet_config'].split(':')[1]] |
|
659
|
|
|
elif self.excel_config['sheet_config'] == 'first': |
|
660
|
|
|
worksheet = workbook.worksheets[0] |
|
661
|
|
|
elif self.excel_config['sheet_config'] == 'last': |
|
662
|
|
|
worksheet = workbook.worksheets[len(workbook.get_sheet_names()) - 1] |
|
663
|
|
|
else: |
|
664
|
|
|
# This cannot happen if json validation was ok |
|
665
|
|
|
pass |
|
666
|
|
|
return worksheet |
|
667
|
|
|
|
|
668
|
|
|
def _raise_value_error(self, msg): |
|
669
|
|
|
self._plugin.log.error(msg) |
|
670
|
|
|
raise ValueError(msg.encode('UTF-8', 'ignore')) |
|
671
|
|
|
|
|
672
|
|
|
def _transform_coordinates(self, row=None, column=None): |
|
673
|
|
|
if row is None and column is None: |
|
674
|
|
|
raise ValueError("_transform_coordinates: row and column cannot both be None.") |
|
675
|
|
|
target_str = '' |
|
676
|
|
|
if self.excel_config['orientation'] == 'column_based': |
|
677
|
|
|
if column is not None: |
|
678
|
|
|
target_str = get_column_letter(column) |
|
679
|
|
|
if row is not None: |
|
680
|
|
|
target_str += str(row) |
|
681
|
|
|
else: |
|
682
|
|
|
if row is not None: |
|
683
|
|
|
target_str = get_column_letter(row) |
|
684
|
|
|
if column is not None: |
|
685
|
|
|
target_str += str(column) |
|
686
|
|
|
return target_str |
|
687
|
|
|
|