Passed
Branch master (5deb01), created by Andreas at 02:32

klib.preprocess.train_dev_test_split()   Rating: B

Complexity:   Conditions 5
Size:         Total Lines 61, Code Lines 22
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0
Metric    Value
cc        5
eloc      22
nop       6
dl        0
loc       61
rs        8.8853
c         0
b         0
f         0

How to fix: Long Method

Small methods make your code easier to understand, especially in combination with a good name. And when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments inside a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment as a starting point for the new method's name.

Commonly applied refactorings include:

- Extract Method (see the sketch below)
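As an illustration (hypothetical code, not from this repository), Extract Method turns a commented block inside a long function into a small helper whose name is derived from the comment:

import pandas as pd

# Before: a long function whose body needs comments to stay readable.
def clean(df):
    # drop columns that are entirely missing
    df = df.dropna(axis=1, how='all')
    # scale numeric columns to zero mean and unit variance
    num_cols = df.select_dtypes('number').columns
    df[num_cols] = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std()
    return df

# After: each commented block becomes a well-named helper (Extract Method).
def drop_empty_columns(df):
    return df.dropna(axis=1, how='all')

def standardize_numeric_columns(df):
    num_cols = df.select_dtypes('number').columns
    df[num_cols] = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std()
    return df

def clean_refactored(df):
    return standardize_numeric_columns(drop_empty_columns(df))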

The inspected source, klib/preprocess.py:

'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import (SimpleImputer, IterativeImputer)
from sklearn.feature_selection import (f_classif,
                                       SelectFromModel,
                                       SelectPercentile,
                                       VarianceThreshold)
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (OneHotEncoder,
                                   RobustScaler,
                                   MaxAbsScaler)

from .utils import (_validate_input_int,
                    _validate_input_range,
                    _validate_input_sum)


__all__ = ['feature_selection_pipe',
           'num_pipe',
           'cat_pipe',
           'train_dev_test_split']


class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Determines and selects numerical or categorical columns from a dataset based on their inferred dtype. Unlike \
    sklearn's make_column_selector(), missing values are temporarily filled in so that convert_dtypes() can \
    determine the dtype of a column.

    Parameters
    ----------
    num: bool, default True
        Select only numerical columns. If num=False, only categorical columns are selected.

    Returns
    -------
    Dataset containing only numerical or categorical data.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Temporarily impute missing values so convert_dtypes() can infer each column's dtype.
        temp = X.fillna(X.mode().iloc[0]).convert_dtypes()

        if self.num:
            return X[temp.select_dtypes(include=['number']).columns.tolist()]
        else:
            return X[temp.select_dtypes(exclude=['number']).columns.tolist()]


class PipeInfo(BaseEstimator, TransformerMixin):
    '''
    Prints intermediary information about the dataset from within a pipeline. Include at any point in a Pipeline to \
    print out the shape of the dataset at that point and to get an indication of the progress within the pipeline. \
    Set the step to None to avoid printing the shape of the dataset. This parameter can also be set as a \
    hyperparameter, e.g. 'pipeline__pipeinfo-1': [None] or 'pipeline__pipeinfo-1__name': ['my_custom_name'].

    Parameters
    ----------
    name: string, default None
        Provide a name for the current step.

    Returns
    -------
    Data: Data is passed through unchanged.
    '''

    def __init__(self, name=None):
        self.name = name

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print(f'Step: {self.name} --- Shape: {X.shape}')
        return X


def cat_pipe(imputer=SimpleImputer(strategy='most_frequent'),
             encoder=OneHotEncoder(handle_unknown='ignore'),
             scaler=MaxAbsScaler(),
             encoder_info=PipeInfo(name='after encoding')):
    '''
    Standard preprocessing operations on categorical data.

    Parameters
    ----------
    imputer: default SimpleImputer(strategy='most_frequent')

    encoder: default OneHotEncoder(handle_unknown='ignore')
        Encode categorical features as a one-hot numeric array.

    scaler: default MaxAbsScaler()
        Scale each feature by its maximum absolute value. MaxAbsScaler() does not shift/center the data and thus \
        does not destroy any sparsity. It is recommended to check for outliers before applying MaxAbsScaler().

    encoder_info:
        Prints the shape of the dataset at the end of 'cat_pipe'. Set to None to avoid printing the shape of the \
        dataset. This parameter can also be set as a hyperparameter, e.g. 'pipeline__pipeinfo-1': [None] or \
        'pipeline__pipeinfo-1__name': ['my_custom_name'].

    Returns
    -------
    Pipeline
    '''

    cat_pipe = make_pipeline(ColumnSelector(num=False),
                             imputer,
                             encoder, encoder_info,
                             scaler)
    return cat_pipe


def feature_selection_pipe(
        var_thresh=VarianceThreshold(threshold=0.1),
        select_from_model=SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median"),
        select_percentile=SelectPercentile(f_classif, percentile=95),
        var_thresh_info=PipeInfo(name='after var_thresh'),
        select_from_model_info=PipeInfo(name='after select_from_model'),
        select_percentile_info=PipeInfo(name='after select_percentile')):
    '''
    Preprocessing operations for feature selection.

    Parameters
    ----------
    var_thresh: default VarianceThreshold(threshold=0.1)
        Specify a threshold to drop low-variance features.

    select_from_model: default SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median")
        Specify an estimator which is used for selecting features based on importance weights.

    select_percentile: default SelectPercentile(f_classif, percentile=95)
        Specify a score function and a percentile of features to keep.

    var_thresh_info, select_from_model_info, select_percentile_info
        Prints the shape of the dataset after applying the respective function. Set to None to avoid printing the \
        shape of the dataset. This parameter can also be set as a hyperparameter, e.g. 'pipeline__pipeinfo-1': \
        [None] or 'pipeline__pipeinfo-1__name': ['my_custom_name'].

    Returns
    -------
    Pipeline
    '''

    feature_selection_pipe = make_pipeline(var_thresh, var_thresh_info,
                                           select_from_model, select_from_model_info,
                                           select_percentile, select_percentile_info)
    return feature_selection_pipe


def num_pipe(imputer=IterativeImputer(estimator=ExtraTreesRegressor(
        n_estimators=25, n_jobs=4, random_state=408), random_state=408),
        scaler=RobustScaler()):
    '''
    Standard preprocessing operations on numerical data.

    Parameters
    ----------
    imputer: default IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), \
                                      random_state=408)

    scaler: default RobustScaler()

    Returns
    -------
    Pipeline
    '''

    num_pipe = make_pipeline(ColumnSelector(),
                             imputer,
                             scaler)
    return num_pipe


def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series
        Name of the label column in 'data', or the label data itself as a list, np.array or pd.Series.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
        split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels.

    random_state: integer, default 408
        The seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing the train-dev-test split of inputs.
    '''

    # Validate inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')
    _validate_input_sum(1, 'Dev and test', dev_size, test_size)

    # Resolve the target: either a column name in 'data' or the label data itself.
    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    # First split: separate the training set from the combined dev+test portion.
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size+test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    else:
        # Second split: divide the dev+test portion into dev and test sets,
        # stratified by the intermediate labels.
        X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                        test_size=test_size/(dev_size+test_size),
                                                        random_state=random_state,
                                                        stratify=y_dev_test)
        return X_train, X_dev, X_test, y_train, y_dev, y_test
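To round out the picture, a minimal usage sketch of the module's public API, assuming klib is installed; the DataFrame and its 'label' column are illustrative assumptions, not part of the repository:

import pandas as pd
from klib.preprocess import num_pipe, train_dev_test_split

# Illustrative data; the column names and the 'label' target are assumptions.
df = pd.DataFrame({'a': [1.0, 2.0, None, 4.0] * 25,
                   'b': ['x', 'y', 'x', 'y'] * 25,
                   'label': [0, 1, 0, 1] * 25})

# 80/10/10 split using the defaults documented above (dev_size=0.1, test_size=0.1).
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    df, 'label', random_state=408)

# num_pipe() returns an sklearn Pipeline: select numeric columns, impute, scale.
X_train_num = num_pipe().fit_transform(X_train)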