GitHub Access Token became invalid

It seems that the GitHub access token used to retrieve details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( ac014b...fe8305 )
by Andreas
01:18
created

klib.preprocess   A

Complexity

Total Complexity 8

Size/Duplication

Total Lines 74
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 29
dl 0
loc 74
rs 10
c 0
b 0
f 0
wmc 8

1 Function

Rating   Name   Duplication   Size   Complexity  
B mv_col_handler() 0 58 8
1
'''
2
Functions for data preprocessing.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import pandas as pd
10
11
from .describe import corr_mat
12
from .utils import _missing_vals
13
from .utils import _validate_input_range
14
15
16
def mv_col_handler(data, target=None, mv_threshold=0.25, corr_thresh_features=0.65, corr_thresh_target=0.2):
    '''
    Drops columns with a high ratio of missing values based on correlation with other features and the target variable.

    The analysis is carried out on missingness indicators (1 = value present, 0 = value missing) of the candidate
    columns. The indicators are only used internally; columns that are kept are returned with their original values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. A string is interpreted as the name of a column in data (this column is
        never a candidate for dropping). Lists and arrays must have one entry per row of data. If no target is
        given, every feature that passes the first two checks is dropped.

    mv_threshold: float, default 0.25
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.65
        Value between 0 <= threshold <= 1. Previously identified features with a high mv-ratio with an absolute
        correlation larger than corr_thresh_features with any other feature undergo further analysis.

    corr_thresh_target: float, default 0.2
        Value between 0 <= threshold <= 1. The remaining features (with a high mv-ratio and high correlation to an
        existing feature) are dropped unless their correlation with the target is at least corr_thresh_target.

    Returns
    -------
    data: Updated Pandas DataFrame
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    # NOTE(review): the docstring documents 0 <= threshold <= 1, but -1 is passed as the lower bound here.
    # Confirm against _validate_input_range before tightening the bounds.
    _validate_input_range(mv_threshold, 'mv_threshold', -1, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', -1, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', -1, 1)

    data = pd.DataFrame(data).copy()
    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()

    # A target passed by name lives inside data; it must never be dropped or replaced by an indicator.
    if isinstance(target, str):
        cols_mv = [col for col in cols_mv if col != target]

    # Missingness indicators of the candidate columns: 1 where a value is present, 0 where it is missing.
    data_mv_binary = data[cols_mv].notna().astype(int)

    # Work on a separate frame so that the indicators never leak into the returned data.
    data_temp = data.copy()
    for col in cols_mv:
        data_temp[col] = data_mv_binary[col]

    high_corr_features = []
    for col in cols_mv:
        # Recomputed on every pass because flagged columns are removed from data_temp; of two mutually correlated
        # candidates only the first one is flagged.
        corrmat = corr_mat(data_temp, colored=False)
        # The largest absolute value is expected to be the self-correlation, the runner-up is the strongest
        # correlation with another column. Positional access keeps this correct for integer column labels.
        top_two = abs(corrmat[col]).nlargest(2)
        if len(top_two) > 1 and top_two.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    if target is None:
        # Without a target no correlation can justify keeping a redundant feature.
        drop_cols = list(high_corr_features)
    else:
        if isinstance(target, str):
            target_values = data[target]
        elif isinstance(target, pd.Series):
            target_values = target
        else:
            # Lists / arrays carry no index; align them row by row with data.
            target_values = pd.Series(target, index=data.index)

        # A NaN correlation compares as False and therefore keeps the column.
        drop_cols = [
            col for col in high_corr_features
            if data_mv_binary[[col]].corrwith(target_values).iloc[0] < corr_thresh_target
        ]

    data = data.drop(columns=drop_cols)

    return data, drop_cols
74