klib.preprocess - Code Metrics - Inspection of "add mv_col_handler" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( ac014b...fe8305 )

by Andreas

created 2020-04-26 09:57 UTC

klib.preprocess A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	74
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	29
dl	0
loc	74
rs	10
c	0
b	0
f	0
wmc	8

1 Function

Rating	Name	Duplication	Size	Complexity
B	mv_col_handler()	0	58	8

'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import pandas as pd

from .describe import corr_mat
from .utils import _missing_vals
from .utils import _validate_input_range


def mv_col_handler(data, target=None, mv_threshold=0.25, corr_thresh_features=0.65, corr_thresh_target=0.2):
    '''
    Drops columns with a high ratio of missing values based on correlation with other features and the target variable.

    For every column whose missing-value ratio exceeds mv_threshold, a binary "value present" indicator is built.
    A column becomes a drop candidate if its indicator is strongly correlated with any other column. Candidates are
    dropped unless their indicator is sufficiently correlated with the target.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. A string is interpreted as the name of a column in data. A list or array
        must have one entry per row of data. If None, every candidate column is dropped since there is no target
        that could justify keeping it.

    mv_threshold: float, default 0.25
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.65
        Value between 0 <= threshold <= 1. Previously identified features with a high mv-ratio with a correlation
        larger than corr_thresh_features with any other feature undergo further analysis.

    corr_thresh_target: float, default 0.2
        Value between 0 <= threshold <= 1. The remaining features (with a high mv-ratio and high correlation to an
        existing feature) are dropped unless their correlation with the target is larger than corr_thresh_target.

    Returns
    -------
    data: Updated Pandas DataFrame. Columns that are kept retain their original values.
    drop_cols: List of dropped columns
    '''

    # Validate Inputs: all three thresholds are documented as lying in [0, 1].
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()

    # Normalise the target into a Series (or None) so it can be correlated by index alignment.
    if target is None:
        target_values = None
    elif isinstance(target, str):
        target_values = data[target]
    elif isinstance(target, pd.Series):
        target_values = target
    else:
        target_values = pd.Series(target, index=data.index)

    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # 1 where a value is present, 0 where it is missing.
    data_mv_binary = data[cols_mv].notna().astype(int)

    # Work on a separate frame so the caller gets back the original values of every column that is kept.
    data_local = data.copy()
    for col in cols_mv:
        data_local[col] = data_mv_binary[col]

    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        # The largest absolute correlation is the column with itself, so the second entry is the strongest
        # correlation with another column. Positional access keeps this correct for integer column labels.
        strongest = corrmat[col].abs().nlargest(2)
        if len(strongest) > 1 and strongest.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            # Remove the candidate so later candidates are not matched against it.
            data_temp = data_temp.drop(columns=[col])

    if target_values is None:
        drop_cols = list(high_corr_features)
    else:
        # NOTE: the signed correlation is compared here (no abs), matching the previous behaviour.
        # An undefined (NaN) correlation compares False and therefore keeps the column.
        drop_cols = [
            col for col in high_corr_features
            if data_mv_binary[col].corr(target_values) < corr_thresh_target
        ]

    data = data.drop(columns=drop_cols)

    return data, drop_cols


1			'''
2			Functions for data preprocessing.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import pandas as pd
10
11			from .describe import corr_mat
12			from .utils import _missing_vals
13			from .utils import _validate_input_range
14
15
16			def mv_col_handler(data, target=None, mv_threshold=0.25, corr_thresh_features=0.65, corr_thresh_target=0.2):
17			'''
18			Drops columns with a high ratio of missing values based on correlation with other features and the target variable.
19
20			Parameters
21			----------
22			data: 2D dataset that can be coerced into Pandas DataFrame.
23
24			target: string, list, np.array or pd.Series, default None
25			Specify target for correlation. E.g. label column to generate only the correlations between each feature \
26			and the label.
27
28			mv_threshold: float, default 0.25
29			Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
30			for dropping and undergo further analysis.
31
32			corr_thresh_features: float, default 0.65
33			Value between 0 <= threshold <= 1. Previously identified features with a high mv-ratio with a correlation \
34			larger than corr_thresh_features with any other feature undergo further analysis.
35
36			corr_thresh_target: float, default 0.2
37			Value between 0 <= threshold <= 1. The remaining features (with a high mv-ratio and high correlation to an \
38			existing feature) are dropped unless their correlation with the target is larger than corr_thresh_target.
39
40			Returns
41			-------
42			data: Updated Pandas DataFrame
43			drop_cols: List of dropped columns
44			'''
45
46			# Validate Inputs
47			_validate_input_range(mv_threshold, 'mv_threshold', -1, 1)
48			_validate_input_range(corr_thresh_features, 'corr_thresh_features', -1, 1)
49			_validate_input_range(corr_thresh_target, 'corr_thresh_target', -1, 1)
50
51			data = pd.DataFrame(data).copy()
52			mv_ratios = _missing_vals(data)['mv_cols_ratio']
53			cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
54			data_mv_binary = data[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)
55
56			for col in cols_mv:
57			data[col] = data_mv_binary[col]
58
59			high_corr_features = []
60			data_temp = data.copy()
61			for col in cols_mv:
62			corrmat = corr_mat(data_temp, colored=False)
63			if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
64			high_corr_features.append(col)
65			data_temp = data_temp.drop(columns=[col])
66
67			drop_cols = []
68			for col in high_corr_features:
69			if pd.DataFrame(data_mv_binary[col]).corrwith(target)[0] < corr_thresh_target:
70			drop_cols.append(col)
71			data = data.drop(columns=[col])
72
73			return data, drop_cols
74

akanz1 / klib

GitHub Access Token became invalid

Push — master ( ac014b...fe8305 )

klib.preprocess A

Complexity

Size/Duplication

Importance

1 Function

Duplication Side-by-Side

Filter issues like