Test Failed
Push — master ( 206750...bcacd1 )
by Daniel
02:10
created

TypeDetermination   A

Complexity

Total Complexity 15

Size/Duplication

Total Lines 109
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 92
dl 0
loc 109
rs 10
c 0
b 0
f 0
wmc 15

4 Methods

Rating   Name   Duplication   Size   Complexity  
B tableau_hyper_management.TypeDetermination.TypeDetermination.fn_analyze_field_content_to_establish_data_type() 41 41 5
A tableau_hyper_management.TypeDetermination.TypeDetermination.fn_type_determination() 11 11 4
A tableau_hyper_management.TypeDetermination.TypeDetermination.fn_detect_csv_structure() 33 33 4
A tableau_hyper_management.TypeDetermination.TypeDetermination.fn_optional_column_statistics() 12 12 2
1
import numpy as np
2
import re
3
4
from . import BasicNeeds as ClassBN
5
6
7 View Code Duplication
class TypeDetermination:
    """Determine a data type for every column of a data frame read from a CSV file.

    Values are converted to text and matched against regular expressions
    supplied by the caller as an ordered mapping ``{type name: pattern}``.
    A type listed later in that mapping is treated as stronger than one
    listed earlier, so a column ends up with the strongest type seen among
    its inspected values.
    """

    def fn_analyze_field_content_to_establish_data_type(self,
                                                        field_idx,
                                                        field_name,
                                                        field_counted_nulls,
                                                        field_unique_values,
                                                        field_panda_type,
                                                        data_type_and_their_formats_to_evaluate,
                                                        verbose):
        """Scan the unique values of one column and settle on its data type.

        :param self: object exposing ``fn_type_determination`` (callers in this
            class pass the class itself)
        :param field_idx: position of the column, stored under ``'order'``
        :param field_name: column label, stored under ``'name'``
        :param field_counted_nulls: null count, stored under ``'nulls'``
        :param field_unique_values: iterable of values to inspect
        :param field_panda_type: dtype reported by pandas, stored under ``'panda_type'``
        :param data_type_and_their_formats_to_evaluate: ordered mapping
            ``{type name: regular expression}``
        :param verbose: forwarded to ``ClassBN.fn_optional_print``
        :return: dictionary with keys ``order``, ``name``, ``nulls``,
            ``panda_type`` and ``type``; an empty list when
            ``field_unique_values`` holds no values
        """
        # Position of every known type, computed once instead of on each iteration.
        type_positions = {
            type_name: position
            for position, type_name in enumerate(data_type_and_their_formats_to_evaluate)
        }
        field_structure = []
        for unique_row_index, current_value in enumerate(field_unique_values):
            # determine the field type by current content
            crt_field_type = self.fn_type_determination(current_value,
                                                        data_type_and_their_formats_to_evaluate)
            if unique_row_index == 0:
                # first value: its type becomes the initial guess for the column
                field_structure = {
                    'order': field_idx,
                    'name': field_name,
                    'nulls': field_counted_nulls,
                    'panda_type': field_panda_type,
                    'type': crt_field_type
                }
                ClassBN.fn_optional_print(verbose, f'Column {field_idx} having the name [{field_name}] '
                                          + f'has the value <{current_value}> '
                                          + f'which mean is of type "{crt_field_type}"')
            else:
                prv_type = field_structure['type']
                crt_strength = TypeDetermination._fn_type_strength(crt_field_type, type_positions)
                prv_strength = TypeDetermination._fn_type_strength(prv_type, type_positions)
                # keep the newly detected type only when it outranks the one retained so far
                if crt_strength > prv_strength:
                    ClassBN.fn_optional_print(verbose, f' column {field_idx} having the name [{field_name}] '
                                              + f'has the value <{current_value}> '
                                              + f'which means is of type "{crt_field_type}" '
                                              + 'and this is stronger than previously thought to be '
                                              + f'as "{prv_type}"')
                    field_structure['type'] = crt_field_type
            # once a value is plain text there is no point in scanning any further
            if crt_field_type == 'str':
                return field_structure
        return field_structure

    @staticmethod
    def _fn_type_strength(type_name, type_positions):
        """Rank a type name; a higher number means a stronger type.

        Listed types rank by their position in ``type_positions``. The names
        ``'empty'`` and ``'str'`` can be produced by ``fn_type_determination``
        without being listed: an unlisted ``'empty'`` ranks below every listed
        type, any other unlisted name (the ``'str'`` fallback) ranks above all.
        """
        if type_name in type_positions:
            return type_positions[type_name]
        if type_name == 'empty':
            return -1
        return len(type_positions)

    @staticmethod
    def fn_detect_csv_structure(self, input_csv_data_frame, formats_to_evaluate, verbose):
        """Build the list of column descriptions for a data frame.

        The method is declared static yet takes ``self`` explicitly, so the
        caller supplies the class (an instance is accepted as well).

        Columns whose inferred dtype is ``float64`` or ``object`` are analysed
        value by value (first 200 unique non-null values); ``int64`` columns
        are recorded directly as ``'int'``. Columns of any other dtype are
        left out of the result, therefore the position of an entry in the
        returned list may differ from its ``'order'`` value.

        :param self: the ``TypeDetermination`` class or one of its instances
        :param input_csv_data_frame: object whose ``items()`` yields
            ``(label, column)`` pairs, e.g. a pandas ``DataFrame``
        :param formats_to_evaluate: ordered mapping ``{type name: regular expression}``
        :param verbose: forwarded to ``ClassBN.fn_optional_print``
        :return: list with one description per handled column
        """
        # Resolve the class so the plain analyser function is reached whether
        # ``self`` is the class itself or an instance of it.
        owner = self if isinstance(self, type) else type(self)
        csv_structure = []
        # Cycle through all found columns
        for col_idx, (label, content) in enumerate(input_csv_data_frame.items()):
            panda_determined_type = content.infer_objects().dtypes
            ClassBN.fn_optional_print(verbose, f'Field "{label}" according to Pandas package '
                                      + f'is of type "{panda_determined_type}"')
            counted_nulls = content.isnull().sum()
            if panda_determined_type in ('float64', 'object'):
                list_unique_values = content.dropna().unique()
                self.fn_optional_column_statistics(self, verbose, label, content, list_unique_values)
                # Append directly: indexing the list by col_idx fails as soon as
                # an earlier column of an unhandled dtype has been skipped.
                csv_structure.append(
                    owner.fn_analyze_field_content_to_establish_data_type(self,
                                                                          col_idx,
                                                                          label,
                                                                          counted_nulls,
                                                                          list_unique_values[0:200],
                                                                          panda_determined_type,
                                                                          formats_to_evaluate,
                                                                          verbose))
            elif panda_determined_type == 'int64':
                csv_structure.append({
                    'order': col_idx,
                    'name': label,
                    'nulls': counted_nulls,
                    'panda_type': panda_determined_type,
                    'type': 'int'
                })
        return csv_structure

    @staticmethod
    def fn_optional_column_statistics(self, verbose, field_name, field_content, field_unique_values):
        """Report null, not-null and unique counts of a column when verbose.

        :param self: unused; kept because callers pass it explicitly
        :param verbose: nothing is computed or reported when falsy
        :param field_name: column label used in the message
        :param field_content: column object providing ``isnull``, ``notnull``
            and ``nunique``
        :param field_unique_values: values listed at the end of the message
        :return: None
        """
        if not verbose:
            return
        counted_values_null = field_content.isnull().sum()
        counted_values_not_null = field_content.notnull().sum()
        counted_values_unique = field_content.nunique()
        # every value is rendered as text so that join accepts it
        listed_values = '>, <'.join(np.array(field_unique_values, dtype=str))
        ClassBN.fn_optional_print(verbose, f'"{field_name}" has following characteristics: ' +
                                  f'count of null values: {counted_values_null}, ' +
                                  f'count of not-null values: {counted_values_not_null}, ' +
                                  f'count of unique values: {counted_values_unique}, ' +
                                  f'list of not-null and unique values is: <' +
                                  listed_values + '>')

    @staticmethod
    def fn_type_determination(intput_variable_to_assess, evaluation_formats):
        """Return the name of the first format matching the given value.

        The value is converted with ``str`` before matching.

        :param intput_variable_to_assess: value to classify
        :param evaluation_formats: ordered mapping ``{type name: regular expression}``
        :return: ``'empty'`` for an empty text, the key of the first pattern
            that ``re.match`` accepts, otherwise ``'str'``
        """
        # Website https://regex101.com/ was used to validate below code
        variable_to_assess = str(intput_variable_to_assess)
        if variable_to_assess == '':
            return 'empty'
        for current_dtype, current_format in evaluation_formats.items():
            if re.match(current_format, variable_to_assess):
                return current_dtype
        return 'str'
109