import re

import numpy as np

from . import BasicNeeds as ClassBN


class TypeDetermination:
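    """Determine a data type for every column of a Pandas DataFrame (typically
    one read from a CSV file) by matching the textual form of the column values
    against a caller-supplied, ordered map of regular expressions.
    """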
    def fn_analyze_field_content_to_establish_data_type(
            self, field_idx, field_name, field_counted_nulls, field_unique_values,
            field_panda_type, data_type_and_their_formats_to_evaluate, verbose):
        field_structure = []
        # Analyze the unique values of the field
        for unique_row_index, current_value in enumerate(field_unique_values):
            # determine the field type from the current content
            crt_field_type = self.fn_type_determination(current_value, data_type_and_their_formats_to_evaluate)
            # store the determined type
            if unique_row_index == 0:
                field_structure = {
                    'order': field_idx,
                    'name': field_name,
                    'nulls': field_counted_nulls,
                    'panda_type': field_panda_type,
                    'type': crt_field_type
                }
                ClassBN.fn_optional_print(verbose, f'Column {field_idx} having the name [{field_name}] '
                                          + f'has the value <{current_value}> '
                                          + f'which means it is of type "{crt_field_type}"')
            else:
                crt_type_index = list(data_type_and_their_formats_to_evaluate.keys()).index(crt_field_type)
                prv_type = field_structure['type']
                prv_type_index = list(data_type_and_their_formats_to_evaluate.keys()).index(prv_type)
                # a structure for the current field (column) already exists;
                # is the newly determined type stronger than the one recorded so far?
                if crt_type_index > prv_type_index:
                    ClassBN.fn_optional_print(verbose, f' column {field_idx} having the name [{field_name}] '
                                              + f'has the value <{current_value}> '
                                              + f'which means it is of type "{crt_field_type}", '
                                              + 'and this is stronger than the previously determined '
                                              + f'type "{prv_type}"')
                    field_structure['type'] = crt_field_type
            # If the currently determined field type is string, there is no point in scanning any further
            if crt_field_type == 'str':
                return field_structure
        return field_structure
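    # Note on the precedence rule above: the keys of
    # data_type_and_their_formats_to_evaluate are expected to be ordered from the
    # weakest to the strongest type, because a type whose key has a higher index
    # overrides one with a lower index. With a hypothetical map ordered
    # 'int' -> 'float' -> 'str', a column whose unique values are <7> and <7.5>
    # is first classified as "int" and then upgraded to "float".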

    @staticmethod
    def fn_detect_csv_structure(self, input_csv_data_frame, formats_to_evaluate, verbose):
        col_idx = 0
        csv_structure = []
        # Cycle through all the columns found
        for label, content in input_csv_data_frame.items():
            panda_determined_type = content.infer_objects().dtypes
            ClassBN.fn_optional_print(verbose, f'Field "{label}" according to the Pandas package '
                                      + f'is of type "{panda_determined_type}"')
            counted_nulls = content.isnull().sum()
            if panda_determined_type in ('float64', 'object'):
                list_unique_values = content.dropna().unique()
                self.fn_optional_column_statistics(self, verbose, label, content, list_unique_values)
                # only the first 200 unique values are analyzed, to keep the scan bounded
                csv_structure.append(self.fn_analyze_field_content_to_establish_data_type(
                    self, col_idx, label, counted_nulls, list_unique_values[0:200],
                    panda_determined_type, formats_to_evaluate, verbose))
            elif panda_determined_type == 'int64':
                csv_structure.append({
                    'order': col_idx,
                    'name': label,
                    'nulls': counted_nulls,
                    'panda_type': panda_determined_type,
                    'type': 'int'
                })
            col_idx += 1
        return csv_structure
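    # The csv_structure returned above is a list with one dictionary per analyzed
    # column, each carrying the keys 'order', 'name', 'nulls', 'panda_type' and
    # 'type'; columns whose Pandas dtype is neither float64/object nor int64
    # (for example bool or datetime64) are skipped.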

    @staticmethod
    def fn_optional_column_statistics(self, verbose, field_name, field_content, field_unique_values):
        if verbose:
            counted_values_null = field_content.isnull().sum()
            counted_values_not_null = field_content.notnull().sum()
            counted_values_unique = field_content.nunique()
            ClassBN.fn_optional_print(verbose, f'"{field_name}" has the following characteristics: ' +
                                      f'count of null values: {counted_values_null}, ' +
                                      f'count of not-null values: {counted_values_not_null}, ' +
                                      f'count of unique values: {counted_values_unique}, ' +
                                      'list of not-null and unique values is: <' +
                                      '>, <'.join(np.array(field_unique_values, dtype=str)) + '>')

    @staticmethod
    def fn_type_determination(input_variable_to_assess, evaluation_formats):
        # The website https://regex101.com/ was used to validate the regular expressions evaluated below
        variable_to_assess = str(input_variable_to_assess)
        if variable_to_assess == '':
            return 'empty'
        else:
            for current_dtype, current_format in evaluation_formats.items():
                if re.match(current_format, variable_to_assess):
                    return current_dtype
            return 'str'
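
# A minimal usage sketch (assumption: the surrounding package, including the
# BasicNeeds module imported above, is available and this module is started from
# within that package, e.g. `python -m <package>.<this_module>`, so the relative
# import resolves). The format map below is hypothetical and only illustrates the
# expected shape: an ordered dictionary of regular expressions, listed from the
# weakest to the strongest type.
if __name__ == '__main__':
    import pandas as pd

    hypothetical_formats_to_evaluate = {
        'empty': r'^$',
        'int': r'^-?\d+$',
        'float': r'^-?\d+\.\d+$',
        'date-iso8601': r'^\d{4}-\d{2}-\d{2}$',
        'str': r'[\s\S]*',
    }
    sample_frame = pd.DataFrame({
        'Amount': ['10', '10.5', '7'],
        'Comment': ['paid', 'pending', 'paid'],
    })
    # the class itself is passed as self, matching how the static methods
    # forward self to one another internally
    detected_structure = TypeDetermination.fn_detect_csv_structure(
        TypeDetermination, sample_frame, hypothetical_formats_to_evaluate, verbose=True)
    print(detected_structure)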