klib.utils._corr_selector() - Code Metrics - Inspection of "apply corr_selector()" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 4f98db...92a4ba )

by Andreas

created 2020-04-16 12:01 UTC

klib.utils._corr_selector() A

↳ Parent: klib.utils

Complexity

Conditions

Size

Total Lines	32
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	5
eloc	15
nop	3
dl	0
loc	32
rs	9.1832
c	0
b	0
f	0

'''
Utilities and auxiliary functions.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd


def _corr_selector(corr, split=None, threshold=0):
    '''
    Masks correlations according to the requested split and prints a short note.

    Values that do not meet the selected condition are masked through
    ``corr.where``. For any "split" other than the four listed below, the
    input object is returned as is and nothing is printed.

    Parameters
    ----------
    corr: Correlations exposing a ``where`` method, e.g. a Pandas DataFrame or Series.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * 'pos': keep values that are > 0 and >= threshold
        * 'neg': keep values that are < 0 and <= threshold
        * 'high': keep values whose absolute value is >= threshold
        * 'low': keep values whose absolute value is <= threshold

    threshold: float, default 0
        Cut-off used by the selected split. The value is not validated here.
        Note that with a non-negative threshold the 'neg' split keeps every
        negative value.

    Returns
    -------
    corr: The (filtered) correlations.
    '''
    if split == 'pos':
        selected = corr.where((corr > 0) & (corr >= threshold))
        note = 'Displaying positive correlations. Use "threshold" to further limit the results.'
    elif split == 'neg':
        selected = corr.where((corr < 0) & (corr <= threshold))
        note = 'Displaying negative correlations. Use "threshold" to further limit the results.'
    elif split == 'high':
        selected = corr.where(np.abs(corr) >= threshold)
        note = 'Displaying absolute correlations above a chosen threshold.'
    elif split == 'low':
        selected = corr.where(np.abs(corr) <= threshold)
        note = 'Displaying absolute correlations below a chosen threshold.'
    else:
        # No (known) split requested: hand the input back untouched.
        return corr

    print(note)
    return selected


def _drop_duplicates(data):
    '''
    Removes duplicate rows and reports which rows were removed.

    The input is coerced into a Pandas DataFrame and copied first, so the
    object passed in is not modified. The first occurrence of each row is
    kept; later repetitions are dropped.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    data: Deduplicated Pandas DataFrame
    rows_dropped: Index Object of rows dropped.
    '''

    frame = pd.DataFrame(data).copy()

    # Flag every repeated row once and reuse the mask for both results.
    is_duplicate = frame.duplicated()
    rows_dropped = frame[is_duplicate].index

    return frame[~is_duplicate], rows_dropped


def _memory_usage(data):
    '''
    Reports the total memory footprint of a dataset in kilobytes.

    The input is coerced into a Pandas DataFrame; the index and a deep
    inspection of the column contents are included in the measurement.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    memory_usage: float, total size in kilobytes rounded to two decimals
    '''

    frame = pd.DataFrame(data).copy()
    total_bytes = frame.memory_usage(index=True, deep=True).sum()

    # 1024 bytes per kilobyte.
    return round(total_bytes/1024, 2)


def _missing_vals(data):
    '''
    Collects missing-value statistics for a dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    Dictionary with the following entries:

    mv_total: number of missing values in the entire dataset
    mv_rows: Pandas Series, number of missing values in each row
    mv_cols: Pandas Series, number of missing values in each column
    mv_rows_ratio: Pandas Series, mv_rows divided by the number of columns
    mv_cols_ratio: Pandas Series, mv_cols divided by the number of rows
    '''

    # Build the boolean "is missing" mask once and derive every metric from it.
    na_mask = pd.DataFrame(data).copy().isna()
    n_rows, n_cols = na_mask.shape

    per_row = na_mask.sum(axis=1)
    per_col = na_mask.sum(axis=0)

    return dict(mv_total=per_col.sum(),
                mv_rows=per_row,
                mv_cols=per_col,
                mv_rows_ratio=per_row/n_cols,
                mv_cols_ratio=per_col/n_rows)


def _validate_input_0_1(value, desc):
    '''
    Validates that a numeric input lies in the closed interval [0, 1].

    Parameters
    ----------
    value: Number to be checked.

    desc: str, name of the checked parameter, used in the error message.

    Returns
    -------
    None

    Raises
    ------
    ValueError: If value is smaller than 0, larger than 1 or NaN.
    '''
    # The chained comparison evaluates to False for NaN, so NaN is rejected
    # too. Two separate "< 0" / "> 1" checks would let it slip through.
    if not (0 <= value <= 1):
        raise ValueError(f'Input value for {desc} is {value} but should be a float in the range 0 <= {desc} <=1.')


def _validate_input_bool(value, desc):
    '''
    Validates that an input is a Python bool.

    Parameters
    ----------
    value: Object to be checked.

    desc: str, name of the checked parameter, used in the error message.

    Returns
    -------
    None

    Raises
    ------
    ValueError: If value is not an instance of bool.
    '''
    if isinstance(value, bool):
        return

    raise ValueError(f'Input value for {desc} is {value} but should be boolean.')


1			'''
2			Utilities and auxiliary functions.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import numpy as np
10			import pandas as pd
11
12
13			def _corr_selector(corr, split=None, threshold=0):
14			'''
15			Parameters
16			----------
17			corr: List or matrix of correlations.
18
19			split: {None, 'pos', 'neg', 'high', 'low'}, default None
20			Type of split to be performed.
21
22			threshold: float, default 0
23			Value between 0 <= threshold <= 1
24
25			Returns:
26			-------
27			corr: List or matrix of (filtered) correlations.
28			'''
29			if split == 'pos':
30			corr = corr.where((corr >= threshold) & (corr > 0))
31			print('Displaying positive correlations. Use "threshold" to further limit the results.')
32			elif split == 'neg':
33			corr = corr.where((corr <= threshold) & (corr < 0))
34			print('Displaying negative correlations. Use "threshold" to further limit the results.')
35			elif split == 'high':
36			corr = corr.where(np.abs(corr) >= threshold)
37			print('Displaying absolute correlations above a chosen threshold.')
38			elif split == 'low':
39			corr = corr.where(np.abs(corr) <= threshold)
40			print('Displaying absolute correlations below a chosen threshold.')
41			else:
42			corr = corr
43
44			return corr
45
46
47			def _drop_duplicates(data):
48			'''
49			Provides information and drops duplicate rows.
50
51			Parameters
52			----------
53			data: 2D dataset that can be coerced into Pandas DataFrame.
54
55			Returns
56			-------
57			data: Deduplicated Pandas DataFrame
58			rows_dropped: Index Object of rows dropped.
59			'''
60
61			data = pd.DataFrame(data).copy()
62			rows_dropped = data[data.duplicated()].index
63			data = data.drop_duplicates()
64
65			return data, rows_dropped
66
67
68			def _memory_usage(data):
69			'''
70			Gives the total memory usage in kilobytes.
71
72			Parameters
73			----------
74			data: 2D dataset that can be coerced into Pandas DataFrame.
75
76			Returns
77			-------
78			memory_usage: float
79			'''
80
81			data = pd.DataFrame(data).copy()
82			memory_usage = round(data.memory_usage(index=True, deep=True).sum()/1024, 2)
83
84			return memory_usage
85
86
87			def _missing_vals(data):
88			'''
89			Gives metrics of missing values in the dataset.
90
91			Parameters
92			----------
93			data: 2D dataset that can be coerced into Pandas DataFrame.
94
95			Returns
96			-------
97			mv_total: float, number of missing values in the entire dataset
98			mv_rows: float, number of missing values in each row
99			mv_cols: float, number of missing values in each column
100			mv_rows_ratio: float, ratio of missing values for each row
101			mv_cols_ratio: float, ratio of missing values for each column
102			'''
103
104			data = pd.DataFrame(data).copy()
105			mv_rows = data.isna().sum(axis=1)
106			mv_cols = data.isna().sum(axis=0)
107			mv_total = data.isna().sum().sum()
108			mv_rows_ratio = mv_rows/data.shape[1]
109			mv_cols_ratio = mv_cols/data.shape[0]
110
111			return {'mv_total': mv_total,
112			'mv_rows': mv_rows,
113			'mv_cols': mv_cols,
114			'mv_rows_ratio': mv_rows_ratio,
115			'mv_cols_ratio': mv_cols_ratio}
116
117
118			def _validate_input_0_1(value, desc):
119			if value < 0 or value > 1:
120			raise ValueError(f'Input value for {desc} is {value} but should be a float in the range 0 <= {desc} <=1.')
121
122
123			def _validate_input_bool(value, desc):
124			if not(isinstance(value, bool)):
125			raise ValueError(f'Input value for {desc} is {value} but should be boolean.')
126

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 4f98db...92a4ba )

klib.utils._corr_selector() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like