Passed: Push to master (9934a3...be544d) by Andreas, created 01:15

klib.preprocess.PipeInfo.transform() (rating: A)

Complexity
    Conditions    1

Size
    Total Lines   3
    Code Lines    3

Duplication
    Lines         0
    Ratio         0 %

Importance
    Changes       0

Metric   Value
cc       1
eloc     3
nop      3
dl       0
loc      3
rs       10
c        0
b        0
f        0
'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import (SimpleImputer, IterativeImputer)
from sklearn.feature_selection import (f_classif,
                                       SelectFromModel,
                                       SelectPercentile,
                                       VarianceThreshold)
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (OneHotEncoder,
                                   RobustScaler,
                                   MaxAbsScaler)

from .utils import (_validate_input_int,
                    _validate_input_range,
                    _validate_input_sum)


__all__ = ['feature_selection_pipe',
           'num_pipe',
           'cat_pipe',
           'train_dev_test_split']


class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Determines and selects numerical or categorical columns from a dataset based on their inferred dtype. Unlike \
    sklearn's make_column_selector(), missing values are temporarily filled in to allow convert_dtypes() to determine \
    the dtype of each column.

    Parameter:
    ---------
    num: bool, default True
        Select only numerical columns. If num=False, only categorical columns are selected.

    Returns:
    -------
    Dataset containing only numerical or categorical data.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Fill missing values with each column's mode so convert_dtypes() can
        # infer a meaningful dtype even for incomplete columns
        temp = X.fillna(X.mode().iloc[0]).convert_dtypes()

        if self.num:
            return X[temp.select_dtypes(include=['number']).columns.tolist()]
        else:
            return X[temp.select_dtypes(exclude=['number']).columns.tolist()]
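
# A minimal usage sketch (hypothetical, not part of this module): splitting a
# mixed-dtype DataFrame into its numerical and categorical parts. The helper
# is only defined here, so it does not run at import time.
def _demo_column_selector():
    df = pd.DataFrame({'age': [22.0, np.nan, 35.0],
                       'city': ['Berlin', 'Paris', np.nan]})
    nums = ColumnSelector(num=True).fit_transform(df)   # keeps 'age'
    cats = ColumnSelector(num=False).fit_transform(df)  # keeps 'city'
    return nums, cats
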
class PipeInfo(BaseEstimator, TransformerMixin):
    '''
    Prints intermediary information about the dataset from within a pipeline. Include it at any point in a Pipeline
    to print out the shape of the dataset at that step.

    Parameter:
    ---------
    name: string, default None
        Provide a name for the current step.

    Returns:
    -------
    X: The data is passed through unchanged.
    '''

    def __init__(self, name=None):
        self.name = name

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print(f'Step: {self.name} --- Shape: {X.shape}')
        return X
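
# A minimal usage sketch (hypothetical): PipeInfo is a pure pass-through step,
# so it can be dropped between any two steps of a pipeline to log the shape of
# the data at that point.
def _demo_pipe_info():
    pipe = make_pipeline(VarianceThreshold(threshold=0.0),
                         PipeInfo(name='after variance threshold'))
    X = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    # Prints: Step: after variance threshold --- Shape: (3, 2)
    return pipe.fit_transform(X)
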
def cat_pipe(imputer=SimpleImputer(strategy='most_frequent'),
             encoder=OneHotEncoder(handle_unknown='ignore'),
             scaler=MaxAbsScaler(),
             encoder_info=PipeInfo(name='after encoding')):
    '''
    Standard preprocessing operations on categorical data.

    Parameters:
    ----------
    imputer: default, SimpleImputer(strategy='most_frequent')
        Fill missing values with the most frequent value of each column.

    encoder: default, OneHotEncoder(handle_unknown='ignore')
        Encode categorical features as a one-hot numeric array.

    scaler: default, MaxAbsScaler()
        Scale each feature by its maximum absolute value. MaxAbsScaler() does not shift/center the data, and thus does \
        not destroy any sparsity. It is recommended to check for outliers before applying MaxAbsScaler().

    Returns:
    -------
    Pipeline
    '''

    cat_pipe = make_pipeline(ColumnSelector(num=False),
                             imputer,
                             encoder,
                             encoder_info,
                             scaler)
    return cat_pipe
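
# A minimal usage sketch (hypothetical): the default pipeline keeps only the
# categorical column, imputes the missing value with the mode, one-hot encodes
# and scales the result; the PipeInfo step prints the shape after encoding.
def _demo_cat_pipe():
    df = pd.DataFrame({'color': ['red', 'blue', np.nan, 'red'],
                       'size': [1.0, 2.0, 3.0, np.nan]})
    # Returns a sparse matrix built from 'color' only; 'size' is numerical
    return cat_pipe().fit_transform(df)
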
def feature_selection_pipe(
        var_thresh=VarianceThreshold(threshold=0.1),
        select_from_model=SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median"),
        select_percentile=SelectPercentile(f_classif, percentile=95),
        var_thresh_info=PipeInfo(name='after var_thresh'),
        select_from_model_info=PipeInfo(name='after select_from_model'),
        select_percentile_info=PipeInfo(name='after select_percentile')):
    '''
    Preprocessing operations for feature selection.

    Parameters:
    ----------
    var_thresh: default, VarianceThreshold(threshold=0.1)
        Specify a threshold to drop low-variance features.

    select_from_model: default, SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median")
        Specify an estimator which is used for selecting features based on importance weights.

    select_percentile: default, SelectPercentile(f_classif, percentile=95)
        Specify a score function and the percentile of features to keep.

    Returns:
    -------
    Pipeline
    '''

    feature_selection_pipe = make_pipeline(var_thresh, var_thresh_info,
                                           select_from_model, select_from_model_info,
                                           select_percentile, select_percentile_info)
    return feature_selection_pipe
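
# A minimal usage sketch (hypothetical): applying the default selection steps
# to a toy classification dataset. Note that the default f_classif score
# function expects a categorical target, so y is binary here.
def _demo_feature_selection_pipe():
    rng = np.random.RandomState(408)
    X = pd.DataFrame(rng.rand(100, 10) * 2)  # variance above the 0.1 threshold
    y = (X[0] > 1).astype(int)               # binary label driven by feature 0
    return feature_selection_pipe().fit_transform(X, y)
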
def num_pipe(imputer=IterativeImputer(estimator=ExtraTreesRegressor(
        n_estimators=25, n_jobs=4, random_state=408), random_state=408),
        scaler=RobustScaler()):
    '''
    Standard preprocessing operations on numerical data.

    Parameters:
    ----------
    imputer: default, IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), \
                                       random_state=408)
        Impute missing values by modelling each feature as a function of the remaining features.

    scaler: default, RobustScaler()
        Scale features using statistics that are robust to outliers.

    Returns:
    -------
    Pipeline
    '''

    num_pipe = make_pipeline(ColumnSelector(),
                             imputer,
                             scaler)
    return num_pipe
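
# A minimal usage sketch (hypothetical): the numerical pipeline drops the
# categorical column, imputes the missing entries model-based and applies
# robust scaling.
def _demo_num_pipe():
    df = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0],
                       'b': [10.0, np.nan, 30.0, 40.0],
                       'c': ['x', 'y', 'z', 'x']})
    return num_pipe().fit_transform(df)  # 4x2 array without missing values
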
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series
        Specify the target. If a string is provided, it is interpreted as the name of the label column in data, \
        which is then separated from the features. Otherwise, the input is used as the target values directly.

    dev_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev split.

    test_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels.

    random_state: integer, default 408
        Seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing the train-dev-test split of inputs. If dev_size or test_size is 0, only a single \
    train/remainder split is returned.
    '''

    # Validate inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')
    _validate_input_sum(1, 'Dev and test', dev_size, test_size)

    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    # First split: separate the training set from the combined dev/test data
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size + test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    else:
        # Second split: divide the remainder into dev and test sets. Stratify
        # by the labels only if stratification was requested for the first
        # split; stratifying unconditionally would fail for continuous targets.
        X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                        test_size=test_size / (dev_size + test_size),
                                                        random_state=random_state,
                                                        stratify=y_dev_test if stratify is not None else None)
        return X_train, X_dev, X_test, y_train, y_dev, y_test
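
# A minimal usage sketch (hypothetical): an 80/10/10 split with the label
# passed as a column name, so it is separated from the features automatically.
def _demo_train_dev_test_split():
    df = pd.DataFrame({'f1': range(100),
                       'f2': range(100, 200),
                       'label': [0, 1] * 50})
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df, 'label', dev_size=0.1, test_size=0.1)
    return X_train.shape, X_dev.shape, X_test.shape  # (80, 2), (10, 2), (10, 2)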