'''
Utilities and auxiliary functions.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd


def _corr_selector(corr, split=None, threshold=0):
    '''
    Select correlations based on the provided parameters.

    Parameters
    ----------
    corr: pd.Series or pd.DataFrame of correlations.

    split: {None, 'pos', 'neg', 'above', 'below'}, default None
        Type of split to be performed.

    threshold: float, default 0
        Value between 0 and 1 (0 <= threshold <= 1).

    Returns
    -------
    corr: pd.Series or pd.DataFrame of (filtered) correlations.
    '''

    if split == 'pos':
        corr = corr.where((corr >= threshold) & (corr > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == 'neg':
        corr = corr.where((corr <= threshold) & (corr < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == 'above':
        corr = corr.where(np.abs(corr) >= threshold)
        print(f'Displaying absolute correlations above the threshold ({threshold}).')
    elif split == 'below':
        corr = corr.where(np.abs(corr) <= threshold)
        print(f'Displaying absolute correlations below the threshold ({threshold}).')

    return corr
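

# Illustrative usage sketch for _corr_selector (hypothetical data, kept as a
# comment so nothing runs on import):
#
#     corr = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 4, 5], 'c': [3, 2, 1]}).corr()
#     _corr_selector(corr, split='above', threshold=0.9)
#
# Cells whose absolute correlation falls below 0.9 are masked with NaN by
# pd.DataFrame.where(); the shape of the matrix is preserved.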


def _diff_report(data, data_cleaned, dupl_rows=None, single_val_cols=None, show='changes'):
    '''
    Provides information about changes between two datasets, such as dropped rows and columns, memory usage and \
missing values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.
        Input the initial dataset here.

    data_cleaned: 2D dataset that can be coerced into Pandas DataFrame.
        Input the cleaned / updated dataset here.

    dupl_rows: list, default None
        List of duplicate row indices.

    single_val_cols: list, default None
        List of single-valued column indices, i.e. columns where all cells contain the same value. \
NaNs count as a separate value.

    show: {'all', 'changes', None}, default 'changes'
        Specify the verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about the changes.
        * 'changes': Print only the differences in the data before and after cleaning.
        * None: Print no information about the data or the data cleaning.

    Returns
    -------
    None. Prints a summary highlighting the datasets or the changes between the two datasets.
    '''

    if show in ['changes', 'all']:
        dupl_rows = [] if dupl_rows is None else dupl_rows.copy()
        single_val_cols = [] if single_val_cols is None else single_val_cols.copy()
        data_mem = _memory_usage(data)
        data_cl_mem = _memory_usage(data_cleaned)
        data_mv_tot = _missing_vals(data)['mv_total']
        data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']

        if show == 'all':
            print('Before data cleaning:\n')
            print(f'dtypes:\n{data.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data.shape[0]}')
            print(f'Number of cols: {data.shape[1]}')
            print(f'Missing values: {data_mv_tot}')
            print(f'Memory usage: {data_mem} KB')
            print('_______________________________________________________\n')
            print('After data cleaning:\n')
            print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}')
            print(f'\nNumber of rows: {data_cleaned.shape[0]}')
            print(f'Number of cols: {data_cleaned.shape[1]}')
            print(f'Missing values: {data_cl_mv_tot}')
            print(f'Memory usage: {data_cl_mem} KB')
            print('_______________________________________________________\n')

        print(f'Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}')
        print('\nChanges:')
        print(f'Dropped rows: {data.shape[0] - data_cleaned.shape[0]}')
        print(f'     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})')
        print(f'Dropped columns: {data.shape[1] - data_cleaned.shape[1]}')
        print(f'     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})')
        print(f'Dropped missing values: {data_mv_tot - data_cl_mv_tot}')
        mem_change = data_mem - data_cl_mem
        print(f'Reduced memory by: {round(mem_change, 2)} KB (-{round(100 * mem_change / data_mem, 1)}%)')
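

# Illustrative usage sketch for _diff_report (hypothetical data, kept as a
# comment so nothing runs on import):
#
#     df = pd.DataFrame({'a': [1, 1, None], 'b': [2, 2, 3]})
#     df_cleaned, dupl_rows = _drop_duplicates(df)
#     _diff_report(df, df_cleaned, dupl_rows=dupl_rows, show='changes')
#
# With show='changes' only the summary of dropped rows, columns, missing
# values and memory is printed; show='all' also prints both dataset profiles.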


def _drop_duplicates(data):
    '''
    Drops duplicate rows and collects their indices.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    data: Deduplicated Pandas DataFrame.
    dupl_rows: list of the indices of the dropped duplicate rows.
    '''

    data = pd.DataFrame(data).copy()
    dupl_rows = data[data.duplicated()].index.tolist()
    data = data.drop(dupl_rows, axis='index')

    return data, dupl_rows
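

# Illustrative usage sketch for _drop_duplicates (hypothetical data, kept as a
# comment so nothing runs on import):
#
#     df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
#     deduped, dupl_rows = _drop_duplicates(df)   # dupl_rows == [1]
#
# pd.DataFrame.duplicated() marks all repeats after the first occurrence, so
# the first row of each group of identical rows is kept.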


def _memory_usage(data):
    '''
    Gives the total memory usage in kilobytes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    memory_usage: float
    '''

    data = pd.DataFrame(data).copy()
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)

    return memory_usage
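

# Illustrative usage sketch for _memory_usage (hypothetical data, kept as a
# comment so nothing runs on import):
#
#     df = pd.DataFrame(np.zeros((1000, 4)))
#     _memory_usage(df)   # e.g. ~31.38 KB: 4 float64 columns plus the index
#
# deep=True makes pandas inspect object dtypes, so string columns are measured
# by their actual contents rather than by pointer size.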


def _missing_vals(data):
    '''
    Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    Dict with the following keys:
    mv_total: int, number of missing values in the entire dataset
    mv_rows: pd.Series, number of missing values in each row
    mv_cols: pd.Series, number of missing values in each column
    mv_rows_ratio: pd.Series, ratio of missing values for each row
    mv_cols_ratio: pd.Series, ratio of missing values for each column
    '''

    data = pd.DataFrame(data).copy()
    mv_rows = data.isna().sum(axis=1)
    mv_cols = data.isna().sum(axis=0)
    mv_total = data.isna().sum().sum()
    mv_rows_ratio = mv_rows / data.shape[1]
    mv_cols_ratio = mv_cols / data.shape[0]

    return {'mv_total': mv_total,
            'mv_rows': mv_rows,
            'mv_cols': mv_cols,
            'mv_rows_ratio': mv_rows_ratio,
            'mv_cols_ratio': mv_cols_ratio}
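

# Illustrative usage sketch for _missing_vals (hypothetical data, kept as a
# comment so nothing runs on import):
#
#     df = pd.DataFrame({'a': [1, None], 'b': [None, None]})
#     mv = _missing_vals(df)
#     mv['mv_total']        # 3
#     mv['mv_cols_ratio']   # a: 0.5, b: 1.0
#
# Note the direction of each ratio: a row's ratio divides by the number of
# columns, a column's ratio by the number of rows.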


def _validate_input_bool(value, desc):
    if not isinstance(value, bool):
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be a boolean.")


def _validate_input_int(value, desc):
    if type(value) is not int:  # deliberately excludes bool, which is a subclass of int
        raise TypeError(f"Input value for '{desc}' is {type(value)} but should be an integer.")


def _validate_input_range(value, desc, lower, upper):
    if value < lower or value > upper:
        raise ValueError(
            f"'{desc}' = {value} but should be within the range {lower} <= '{desc}' <= {upper}.")


def _validate_input_smaller(value1, value2, desc):
    if value1 > value2:
        raise ValueError(f"The first input for '{desc}' should be smaller than or equal to the second input.")


def _validate_input_sum(limit, desc, *args):
    if sum(args) > limit:
        raise ValueError(f"The sum of input values provided for '{desc}' should be less than or equal to {limit}.")
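

# Illustrative usage sketch for the validators (hypothetical values, kept as a
# comment so nothing runs on import):
#
#     _validate_input_bool(True, 'show')             # passes silently
#     _validate_input_range(0.5, 'threshold', 0, 1)  # passes; 1.5 would raise
#     _validate_input_sum(1, 'ratios', 0.4, 0.5)     # passes; the sum must stay <= 1
#
# Each helper raises TypeError or ValueError early, so callers fail with a
# descriptive message instead of producing silently wrong results.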