Passed: Push to master (b2514e...e55ee5) by Andreas, 05:22
klib.preprocess.preprocessing_pipe()   Rating: A

Complexity:   Conditions 5
Size:         Total Lines 22, Code Lines 8
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric    Value
cc        5
eloc      8
nop       2
dl        0
loc       22
rs        9.3333
c         0
b         0
f         0
'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, RobustScaler

from .describe import corr_mat
from .utils import _missing_vals, _validate_input_int, _validate_input_range

def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and a high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops the columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify a target for correlation, e.g. a label column, to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value in the range 0 <= mv_threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value in the range 0 <= corr_thresh_features <= 1. Maximum correlation a previously identified feature with \
        a high mv-ratio is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value in the range 0 <= corr_thresh_target <= 1. Minimum required correlation of a remaining feature (i.e. a \
        feature with a high mv-ratio and a high correlation with another existing feature) with the target. If this \
        threshold is not met, the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()

    # Step 1: flag columns whose missing-value ratio exceeds the threshold and
    # replace them with binary indicators (1 = value present, 0 = missing).
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: collect flagged columns whose strongest correlation with any other
    # feature exceeds corr_thresh_features.
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: drop the collected columns unless they correlate reasonably well
    # with the target. Without a target, all collected columns are dropped.
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
        drop_cols = high_corr_features
    else:
        for col in high_corr_features:
            if pd.DataFrame(data_local[col]).corrwith(target)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols

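A minimal usage sketch (the toy DataFrame, column names and data below are illustrative, not part of the library): a column with many missing values is flagged and, depending on its correlations, dropped or kept.

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(408)
    df = pd.DataFrame({'a': rng.normal(size=200), 'c': rng.normal(size=200)})
    df['b'] = df['a'].where(rng.random(200) < 0.3)  # roughly 70 % missing values
    label = pd.Series(2 * df['a'] + rng.normal(size=200), name='label')

    cleaned, cols_mv, dropped = mv_col_handler(df, target=label)
    print(cols_mv)  # columns whose mv-ratio exceeded mv_threshold, e.g. ['b']
    print(dropped)  # the subset of those columns that was ultimately dropped
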
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series
        Specify the label column. If a string is provided, the column of that name is used as the label and dropped \
        from the data; otherwise the provided values are used as labels directly.

    dev_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev split.

    test_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels.

    random_state: integer, default 408
        Seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing the train-dev-test split of the inputs.
    '''

    # Validate inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        # Use the named column as labels and drop it from the features.
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)
        target = target_data.name  # lists and arrays carry no .name attribute, so read it off the Series

    # First split: separate the combined dev+test share from the training data.
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size+test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    else:
        # Second split: divide the held-out share into dev and test sets, keeping
        # the stratification only if it was requested in the first place.
        X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                        test_size=test_size/(dev_size+test_size),
                                                        random_state=random_state,
                                                        stratify=y_dev_test if stratify is not None else None)
        return X_train, X_dev, X_test, y_train, y_dev, y_test

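A minimal usage sketch (hypothetical toy data; the column names are illustrative): an 80/10/10 split with stratification on the label.

    import pandas as pd

    df = pd.DataFrame({'x1': range(100), 'x2': range(100, 200), 'label': [0, 1] * 50})
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df, 'label', dev_size=0.1, test_size=0.1, stratify=df['label'])
    print(len(X_train), len(X_dev), len(X_test))  # 80 10 10
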
class ColumnSelector(BaseEstimator, TransformerMixin):
    '''Selects either the numerical or the categorical columns of a DataFrame.'''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Temporarily impute with the column modes so convert_dtypes() can infer
        # each column's dtype, then select the matching columns from the original data.
        temp = X.fillna(X.mode().iloc[0]).convert_dtypes()

        if self.num:
            return X[temp.select_dtypes(include=['number']).columns.tolist()]
        else:
            return X[temp.select_dtypes(exclude=['number']).columns.tolist()]

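A quick sketch of the selector on a toy frame (illustrative data only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'num': [1.0, 2.0, np.nan], 'cat': ['a', 'b', None]})
    print(ColumnSelector(num=True).fit_transform(df).columns.tolist())   # ['num']
    print(ColumnSelector(num=False).fit_transform(df).columns.tolist())  # ['cat']
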
def cat_pipe(imputer=SimpleImputer(strategy='most_frequent')):
    '''Set of standard preprocessing operations on categorical data.'''

    cat_pipe = make_pipeline(ColumnSelector(num=False),
                             imputer,
                             OneHotEncoder(handle_unknown='ignore'))
    return cat_pipe

def num_pipe(imputer=IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), random_state=408),
        scaler=RobustScaler()):
    '''Set of standard preprocessing operations on numerical data.'''

    num_pipe = make_pipeline(ColumnSelector(),
                             imputer,
                             scaler)
    return num_pipe

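A brief sketch of the numerical pipeline on toy data with missing values (illustrative data; by default, values are imputed with an IterativeImputer backed by an ExtraTreesRegressor, then robust-scaled):

    import numpy as np
    import pandas as pd

    df_num = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0], 'b': [10.0, np.nan, 30.0, 40.0]})
    X = num_pipe().fit_transform(df_num)  # missing values imputed, then scaled
    print(X.shape)  # (4, 2)
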
def preprocessing_pipe(num=True, cat=True):
    '''Set of standard preprocessing operations on numerical and categorical data.

    Parameters
    ----------
    num: bool, default True
        Set to False if no numerical data is in the dataset.

    cat: bool, default True
        Set to False if no categorical data is in the dataset.
    '''

    if num and cat:
        pipe = make_union(num_pipe(), cat_pipe(), n_jobs=4)

    elif num:
        pipe = num_pipe()

    elif cat:
        pipe = cat_pipe()

    else:
        # Guard: with num=False and cat=False, pipe would otherwise be undefined
        # (flagged by inspection: 'The variable pipe does not seem to be defined
        # for all execution paths.').
        raise ValueError('At least one of num and cat must be True.')

    return pipe

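A minimal end-to-end sketch on a mixed toy frame (hypothetical data; the result is a feature union of the imputed-and-scaled numerical block and the one-hot-encoded categorical block):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'age': [25, 32, np.nan, 41],
                       'income': [50000.0, np.nan, 72000.0, 61000.0],
                       'city': ['Berlin', 'Hamburg', np.nan, 'Berlin']})
    pipe = preprocessing_pipe()
    X = pipe.fit_transform(df)
    print(X.shape)  # (4, numerical columns + one-hot columns)
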
class MVColHandler(BaseEstimator, TransformerMixin):
    '''Transformer wrapper around mv_col_handler(); a possible component of a cleaning pipeline that typically \
    follows DataCleaning.'''

    def __init__(self, target=None, mch_mv_thresh=0.1, mch_feature_thresh=0.6, mch_target_thresh=0.3):
        self.target = target
        self.mch_mv_thresh = mch_mv_thresh
        self.mch_feature_thresh = mch_feature_thresh
        self.mch_target_thresh = mch_target_thresh

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Delegate to mv_col_handler() with the thresholds set on the instance.
        data, cols_mv, dropped_cols = mv_col_handler(data, target=self.target, mv_threshold=self.mch_mv_thresh,
                                                     corr_thresh_features=self.mch_feature_thresh,
                                                     corr_thresh_target=self.mch_target_thresh)

        print(f'\nFeatures with MV-ratio > {self.mch_mv_thresh}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data
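
A minimal sketch of the transformer inside a scikit-learn pipeline (toy data; in practice a preceding cleaning step such as DataCleaning could come first):

    import numpy as np
    import pandas as pd
    from sklearn.pipeline import make_pipeline

    rng = np.random.default_rng(408)
    df = pd.DataFrame({'a': rng.normal(size=100), 'b': rng.normal(size=100)})
    df.loc[rng.random(100) < 0.5, 'b'] = np.nan  # make 'b' roughly 50 % missing

    cleaner = make_pipeline(MVColHandler(mch_mv_thresh=0.3))
    df_clean = cleaner.fit_transform(df)  # prints a short summary and returns the updated frame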