import re

import numpy as np

from . import BasicNeeds as ClassBN


class TypeDetermination:
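    """Determine a data type for every column of a Pandas DataFrame (typically
    one read from a CSV file) by matching the textual form of the column values
    against a caller-supplied, ordered map of regular expressions.
    """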
    def fn_analyze_field_content_to_establish_data_type(
            self, field_idx, field_name, field_counted_nulls, field_unique_values,
            field_panda_type, data_type_and_their_formats_to_evaluate, verbose):
        field_structure = []
        # Analyze the unique values of the field
        for unique_row_index, current_value in enumerate(field_unique_values):
            # determine the field type from the current content
            crt_field_type = self.fn_type_determination(current_value, data_type_and_their_formats_to_evaluate)
            # store the determined type
            if unique_row_index == 0:
                field_structure = {
                    'order': field_idx,
                    'name': field_name,
                    'nulls': field_counted_nulls,
                    'panda_type': field_panda_type,
                    'type': crt_field_type
                }
                ClassBN.fn_optional_print(verbose, f'Column {field_idx} having the name [{field_name}] '
                                          + f'has the value <{current_value}> '
                                          + f'which means it is of type "{crt_field_type}"')
            else:
                crt_type_index = list(data_type_and_their_formats_to_evaluate.keys()).index(crt_field_type)
                prv_type = field_structure['type']
                prv_type_index = list(data_type_and_their_formats_to_evaluate.keys()).index(prv_type)
                # a structure for the current field (column) already exists;
                # is the newly determined type stronger than the one recorded so far?
                if crt_type_index > prv_type_index:
                    ClassBN.fn_optional_print(verbose, f' column {field_idx} having the name [{field_name}] '
                                              + f'has the value <{current_value}> '
                                              + f'which means it is of type "{crt_field_type}", '
                                              + 'and this is stronger than the previously determined '
                                              + f'type "{prv_type}"')
                    field_structure['type'] = crt_field_type
            # If the currently determined field type is string, there is no point in scanning any further
            if crt_field_type == 'str':
                return field_structure
        return field_structure
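    # Note on the precedence rule above: the keys of
    # data_type_and_their_formats_to_evaluate are expected to be ordered from the
    # weakest to the strongest type, because a type whose key has a higher index
    # overrides one with a lower index. With a hypothetical map ordered
    # 'int' -> 'float' -> 'str', a column whose unique values are <7> and <7.5>
    # is first classified as "int" and then upgraded to "float".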

    @staticmethod
    def fn_detect_csv_structure(self, input_csv_data_frame, formats_to_evaluate, verbose):
        col_idx = 0
        csv_structure = []
        # Cycle through all the columns found
        for label, content in input_csv_data_frame.items():
            panda_determined_type = content.infer_objects().dtypes
            ClassBN.fn_optional_print(verbose, f'Field "{label}" according to the Pandas package '
                                      + f'is of type "{panda_determined_type}"')
            counted_nulls = content.isnull().sum()
            if panda_determined_type in ('float64', 'object'):
                list_unique_values = content.dropna().unique()
                self.fn_optional_column_statistics(self, verbose, label, content, list_unique_values)
                # only the first 200 unique values are analyzed, to keep the scan bounded
                csv_structure.append(self.fn_analyze_field_content_to_establish_data_type(
                    self, col_idx, label, counted_nulls, list_unique_values[0:200],
                    panda_determined_type, formats_to_evaluate, verbose))
            elif panda_determined_type == 'int64':
                csv_structure.append({
                    'order': col_idx,
                    'name': label,
                    'nulls': counted_nulls,
                    'panda_type': panda_determined_type,
                    'type': 'int'
                })
            col_idx += 1
        return csv_structure
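    # The csv_structure returned above is a list with one dictionary per analyzed
    # column, each carrying the keys 'order', 'name', 'nulls', 'panda_type' and
    # 'type'; columns whose Pandas dtype is neither float64/object nor int64
    # (for example bool or datetime64) are skipped.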

    @staticmethod
    def fn_optional_column_statistics(self, verbose, field_name, field_content, field_unique_values):
        if verbose:
            counted_values_null = field_content.isnull().sum()
            counted_values_not_null = field_content.notnull().sum()
            counted_values_unique = field_content.nunique()
            ClassBN.fn_optional_print(verbose, f'"{field_name}" has the following characteristics: ' +
                                      f'count of null values: {counted_values_null}, ' +
                                      f'count of not-null values: {counted_values_not_null}, ' +
                                      f'count of unique values: {counted_values_unique}, ' +
                                      'list of not-null and unique values is: <' +
                                      '>, <'.join(np.array(field_unique_values, dtype=str)) + '>')

    @staticmethod
    def fn_type_determination(input_variable_to_assess, evaluation_formats):
        # The website https://regex101.com/ was used to validate the regular expressions evaluated below
        variable_to_assess = str(input_variable_to_assess)
        if variable_to_assess == '':
            return 'empty'
        else:
            for current_dtype, current_format in evaluation_formats.items():
                if re.match(current_format, variable_to_assess):
                    return current_dtype
            return 'str'
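
# A minimal usage sketch (assumption: the surrounding package, including the
# BasicNeeds module imported above, is available and this module is started from
# within that package, e.g. `python -m <package>.<this_module>`, so the relative
# import resolves). The format map below is hypothetical and only illustrates the
# expected shape: an ordered dictionary of regular expressions, listed from the
# weakest to the strongest type.
if __name__ == '__main__':
    import pandas as pd

    hypothetical_formats_to_evaluate = {
        'empty': r'^$',
        'int': r'^-?\d+$',
        'float': r'^-?\d+\.\d+$',
        'date-iso8601': r'^\d{4}-\d{2}-\d{2}$',
        'str': r'[\s\S]*',
    }
    sample_frame = pd.DataFrame({
        'Amount': ['10', '10.5', '7'],
        'Comment': ['paid', 'pending', 'paid'],
    })
    # the class itself is passed as self, matching how the static methods
    # forward self to one another internally
    detected_structure = TypeDetermination.fn_detect_csv_structure(
        TypeDetermination, sample_frame, hypothetical_formats_to_evaluate, verbose=True)
    print(detected_structure)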