1
|
|
|
import numpy as np |
2
|
|
|
import re |
3
|
|
|
|
4
|
|
|
from . import BasicNeeds as ClassBN |
5
|
|
|
|
6
|
|
|
|
7
|
|
View Code Duplication |
class TypeDetermination:
    """Infer a logical data type for each column of a tabular data set.

    Calling convention (kept as-is for backward compatibility): the methods
    that take ``self`` while being decorated with ``@staticmethod`` expect the
    caller to pass the class itself explicitly as the first argument, e.g.
    ``TypeDetermination.fn_detect_csv_structure(TypeDetermination, df, ...)``.
    """

    def fn_analyze_field_content_to_establish_data_type(self,
                                                        field_idx,
                                                        field_name,
                                                        field_counted_nulls,
                                                        field_unique_values,
                                                        field_panda_type,
                                                        data_type_and_their_formats_to_evaluate,
                                                        verbose):
        """Determine the strongest data type among a field's unique values.

        Args:
            field_idx: position of the field, stored under ``'order'``.
            field_name: label of the field, stored under ``'name'``.
            field_counted_nulls: number of nulls, stored under ``'nulls'``.
            field_unique_values: iterable of values to classify.
            field_panda_type: dtype reported for the field, stored under
                ``'panda_type'``.
            data_type_and_their_formats_to_evaluate: mapping of type name to
                regular expression; the key order defines type precedence
                (a later key overrides an earlier one).
            verbose: forwarded to ``ClassBN.fn_optional_print``.

        Returns:
            dict with keys ``order``, ``name``, ``nulls``, ``panda_type`` and
            ``type``. When there are no values to inspect the type is
            ``'empty'`` (previously a bare list was returned in that case,
            which did not match the shape of every other result).
        """
        field_structure = {
            'order': field_idx,
            'name': field_name,
            'nulls': field_counted_nulls,
            'panda_type': field_panda_type,
            'type': 'empty'
        }
        # Precedence of every known type, computed once instead of rebuilding
        # and scanning the key list twice per analysed value.
        type_rank = {
            type_name: position
            for position, type_name in enumerate(data_type_and_their_formats_to_evaluate)
        }
        # fn_type_determination may answer 'empty' or 'str' even when those
        # are not keys of the mapping; give them a rank so the comparison
        # below cannot fail. 'str' stops the scan, hence it is the strongest;
        # ranking 'empty' as the weakest is a fallback choice.
        strongest_rank = len(type_rank)
        type_rank.setdefault('empty', -1)
        type_rank.setdefault('str', strongest_rank)
        # Analyze unique values
        for unique_row_index, current_value in enumerate(field_unique_values):
            # determine the field type by current content
            crt_field_type = self.fn_type_determination(current_value,
                                                        data_type_and_their_formats_to_evaluate)
            if unique_row_index == 0:
                # the first value always defines the initial type
                field_structure['type'] = crt_field_type
                ClassBN.fn_optional_print(verbose, f'Column {field_idx} having the name [{field_name}] '
                                          + f'has the value <{current_value}> '
                                          + f'which mean is of type "{crt_field_type}"')
            else:
                prv_type = field_structure['type']
                # is the current type more important than the one known so far?
                if type_rank[crt_field_type] > type_rank[prv_type]:
                    ClassBN.fn_optional_print(verbose, f' column {field_idx} having the name [{field_name}] '
                                              + f'has the value <{current_value}> '
                                              + f'which means is of type "{crt_field_type}" '
                                              + 'and this is stronger than previously thought to be '
                                              + f'as "{prv_type}"')
                    field_structure['type'] = crt_field_type
            # If the currently determined type is string it makes no sense to scan any further
            if crt_field_type == 'str':
                return field_structure
        return field_structure

    @staticmethod
    def fn_detect_csv_structure(self, input_csv_data_frame, formats_to_evaluate, verbose):
        """Build one structure entry per supported column of a data frame.

        Columns whose inferred dtype equals ``'float64'`` or ``'object'`` are
        classified from (at most the first 200 of) their non-null unique
        values; ``'int64'`` columns are reported as ``'int'`` directly.
        Columns of any other dtype produce no entry.

        Args:
            self: the class itself, passed explicitly by the caller.
            input_csv_data_frame: object whose ``items()`` yields
                ``(label, column)`` pairs.
            formats_to_evaluate: mapping of type name to regular expression.
            verbose: forwarded to ``ClassBN.fn_optional_print``.

        Returns:
            list of dicts with keys ``order``, ``name``, ``nulls``,
            ``panda_type`` and ``type``; ``order`` is the column position in
            the input, even when earlier columns were skipped.
        """
        csv_structure = []
        # Cycle through all found columns
        for col_idx, (label, content) in enumerate(input_csv_data_frame.items()):
            panda_determined_type = content.infer_objects().dtypes
            ClassBN.fn_optional_print(verbose, f'Field "{label}" according to Pandas package '
                                      + f'is of type "{panda_determined_type}"')
            counted_nulls = content.isnull().sum()
            if panda_determined_type in ('float64', 'object'):
                list_unique_values = content.dropna().unique()
                self.fn_optional_column_statistics(self, verbose, label, content, list_unique_values)
                # Append directly: indexing the result list by column position
                # raised IndexError as soon as an earlier column was skipped.
                csv_structure.append(
                    self.fn_analyze_field_content_to_establish_data_type(self,
                                                                         col_idx,
                                                                         label,
                                                                         counted_nulls,
                                                                         list_unique_values[0:200],
                                                                         panda_determined_type,
                                                                         formats_to_evaluate,
                                                                         verbose))
            elif panda_determined_type == 'int64':
                csv_structure.append({
                    'order': col_idx,
                    'name': label,
                    'nulls': counted_nulls,
                    'panda_type': panda_determined_type,
                    'type': 'int'
                })
        return csv_structure

    @staticmethod
    def fn_optional_column_statistics(self, verbose, field_name, field_content, field_unique_values):
        """Print null / not-null / unique counts of a column when verbose.

        Does nothing (and never touches ``field_content``) when ``verbose``
        is falsy.

        Args:
            self: the class itself, passed explicitly by the caller (unused).
            verbose: gate for the whole routine.
            field_name: label used in the printed message.
            field_content: column offering ``isnull``, ``notnull``, ``nunique``.
            field_unique_values: values listed at the end of the message.
        """
        if verbose:
            counted_values_null = field_content.isnull().sum()
            counted_values_not_null = field_content.notnull().sum()
            counted_values_unique = field_content.nunique()
            ClassBN.fn_optional_print(verbose, f'"{field_name}" has following characteristics: ' +
                                      f'count of null values: {counted_values_null}, ' +
                                      f'count of not-null values: {counted_values_not_null}, ' +
                                      f'count of unique values: {counted_values_unique}, ' +
                                      f'list of not-null and unique values is: <' +
                                      '>, <'.join(np.array(field_unique_values, dtype=str)) + '>')

    @staticmethod
    def fn_type_determination(intput_variable_to_assess, evaluation_formats):
        """Return the name of the first format matching the given value.

        The value is converted with ``str()`` first. An empty string yields
        ``'empty'``; otherwise the formats are tried in mapping order with
        ``re.match`` (anchored at the start only) and the first matching key
        is returned, with ``'str'`` as the fallback when nothing matches.

        Args:
            intput_variable_to_assess: value to classify.
            evaluation_formats: mapping of type name to regular expression.
        """
        # Website https://regex101.com/ was used to validate below code
        variable_to_assess = str(intput_variable_to_assess)
        if variable_to_assess == '':
            return 'empty'
        for current_dtype, current_format in evaluation_formats.items():
            if re.match(current_format, variable_to_assess):
                return current_dtype
        return 'str'
109
|
|
|
|