Passed: Push to main (476ed9...09d8d6) by Andreas, created 02:41

klib.preprocess.train_dev_test_split() (rating: B)

Complexity
  Conditions: 5

Size
  Total Lines: 66
  Code Lines: 27

Duplication
  Duplicated Lines: 0
  Ratio: 0 %

Code Coverage
  Tests: 16
  CRAP Score: 5

Importance
  Changes: 0
Metric    Value
cc        5
eloc      27
nop       6
dl        0
loc       66
ccs       16
cts       16
cp        1
crap      5
rs        8.7653
c         0
b         0
f         0
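
The CRAP score ties cyclomatic complexity to test coverage: CRAP = cc^2 * (1 - cp)^3 + cc (the standard CRAP1 formula, with cp as the coverage fraction). With full coverage (cp = 1) the penalty term vanishes and the score reduces to the complexity itself: 5^2 * 0 + 5 = 5, consistent with the table above.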

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Conversely, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for its name.

The most commonly applied refactoring for this is Extract Method, sketched below.
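
A minimal before/after illustration (hypothetical code, not taken from klib) of extracting a commented block into its own method:

# Before: a comment describes what the block does.
def report(orders):
    # compute total revenue
    total = 0
    for price, quantity in orders:
        total += price * quantity
    print(f"Revenue: {total}")

# After: the comment has become the name of the extracted method.
def total_revenue(orders):
    return sum(price * quantity for price, quantity in orders)

def report(orders):
    print(f"Revenue: {total_revenue(orders)}")

report([(10, 3), (5, 2)])  # prints: Revenue: 40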

Inspected source (klib/preprocess.py):

"""
Functions for data preprocessing.

:author: Andreas Kanz

"""

# Imports
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.feature_selection import (
    SelectFromModel,
    SelectPercentile,
    VarianceThreshold,
    f_classif,
)
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder, RobustScaler

from klib.utils import (
    _validate_input_int,
    _validate_input_range,
    _validate_input_sum_smaller,
)

__all__ = ["feature_selection_pipe", "num_pipe", "cat_pipe", "train_dev_test_split"]


class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Determines and selects numerical or categorical columns from a dataset based on \
    their inferred dtype. Unlike sklearn's make_column_selector(), missing values are \
    temporarily filled in so that convert_dtypes() can determine the dtype of a column.

    Parameters
    ----------
    num: bool, default True
        Select only numerical columns. If num = False, only categorical columns are \
        selected.

    Returns
    -------
    Dataset containing only numerical or categorical data.
    """

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Temporarily fill missing values so convert_dtypes() can infer each
        # column's dtype, then select the matching columns from the original data.
        temp = X.fillna(X.mode().iloc[0]).convert_dtypes()

        if self.num:
            return X[temp.select_dtypes(include=["number"]).columns.tolist()]
        return X[temp.select_dtypes(exclude=["number"]).columns.tolist()]


class PipeInfo(BaseEstimator, TransformerMixin):
    """
    Prints intermediary information about the dataset from within a pipeline.

    Include at any point in a Pipeline to print out the shape of the dataset at this \
    point and to receive an indication of the progress within the pipeline.

    Set to 'None' to avoid printing the shape of the dataset. This parameter can also \
    be set as a hyperparameter, e.g. 'pipeline__pipeinfo-1': [None] or \
    'pipeline__pipeinfo-1__name': ['my_custom_name'].

    Parameters
    ----------
    name: string, default None
        Provide a name for the current step.

    Returns
    -------
    X: The input data, passed through unchanged.
    """

    def __init__(self, name=None):
        self.name = name

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print(f"Step: {self.name} --- Shape: {X.shape}")
        return X


def cat_pipe(
    imputer=SimpleImputer(strategy="most_frequent"),
    encoder=OneHotEncoder(handle_unknown="ignore"),
    scaler=MaxAbsScaler(),
    encoder_info=PipeInfo(name="after encoding categorical data"),
):
    """
    Standard preprocessing operations on categorical data.

    Parameters
    ----------
    imputer: default, SimpleImputer(strategy='most_frequent')
        Impute missing values with the most frequent value of each column.

    encoder: default, OneHotEncoder(handle_unknown='ignore')
        Encode categorical features as a one-hot numeric array.

    scaler: default, MaxAbsScaler()
        Scale each feature by its maximum absolute value. MaxAbsScaler() does not \
        shift/center the data, and thus does not destroy any sparsity. It is \
        recommended to check for outliers before applying MaxAbsScaler().

    encoder_info:
        Prints the shape of the dataset at the end of 'cat_pipe'. Set to 'None' to \
        avoid printing the shape of the dataset. This parameter can also be set as a \
        hyperparameter, e.g. 'pipeline__pipeinfo-1': [None] or \
        'pipeline__pipeinfo-1__name': ['my_custom_name'].

    Returns
    -------
    Pipeline
    """
    return make_pipeline(
        ColumnSelector(num=False), imputer, encoder, encoder_info, scaler
    )


def feature_selection_pipe(
    var_thresh=VarianceThreshold(threshold=0.1),
    select_from_model=SelectFromModel(
        LassoCV(cv=4, random_state=408), threshold="0.1*median"
    ),
    select_percentile=SelectPercentile(f_classif, percentile=95),
    var_thresh_info=PipeInfo(name="after var_thresh"),
    select_from_model_info=PipeInfo(name="after select_from_model"),
    select_percentile_info=PipeInfo(name="after select_percentile"),
):
    """
    Preprocessing operations for feature selection.

    Parameters
    ----------
    var_thresh: default, VarianceThreshold(threshold=0.1)
        Specify a threshold to drop low-variance features.

    select_from_model: default, SelectFromModel(LassoCV(cv=4, random_state=408), \
    threshold="0.1*median")
        Specify an estimator which is used for selecting features based on importance \
        weights.

    select_percentile: default, SelectPercentile(f_classif, percentile=95)
        Specify a score function and the percentile of features to keep.

    var_thresh_info, select_from_model_info, select_percentile_info:
        Print the shape of the dataset after applying the respective step. Set to \
        'None' to avoid printing the shape of the dataset. These parameters can also \
        be set as hyperparameters, e.g. 'pipeline__pipeinfo-1': [None] \
        or 'pipeline__pipeinfo-1__name': ['my_custom_name'].

    Returns
    -------
    Pipeline
    """
    return make_pipeline(
        var_thresh,
        var_thresh_info,
        select_from_model,
        select_from_model_info,
        select_percentile,
        select_percentile_info,
    )


def num_pipe(
    imputer=IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408),
        random_state=408,
    ),
    scaler=RobustScaler(),
):
    """
    Standard preprocessing operations on numerical data.

    Parameters
    ----------
    imputer: default, IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=25, \
    n_jobs=4, random_state=408), random_state=408)
        Impute missing values by modeling each feature as a function of the others.

    scaler: default, RobustScaler()
        Scale features using statistics that are robust to outliers.

    Returns
    -------
    Pipeline
    """
    return make_pipeline(ColumnSelector(), imputer, scaler)


def train_dev_test_split(
    data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408
):
    """
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas \
    DataFrame is provided, the index/column information is preserved.

    target: string, list, np.array or pd.Series
        Specify the target (label) column. If a string is provided, the column of \
        that name is dropped from data and used as the label.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the \
        dataset to include in the dev split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the \
        dataset to include in the test split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the \
        class labels.

    random_state: integer, default 408
        Seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing the train-dev-test split of inputs.
    """
    # Validate inputs
    _validate_input_range(dev_size, "dev_size", 0, 1)
    _validate_input_range(test_size, "test_size", 0, 1)
    _validate_input_int(random_state, "random_state")
    _validate_input_sum_smaller(1, "Dev and test", dev_size, test_size)

    # Separate the label from the features.
    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    # First split: train vs. the combined dev+test share.
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(
        data,
        target_data,
        test_size=dev_size + test_size,
        random_state=random_state,
        stratify=stratify,
    )

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Second split: divide the held-out share into dev and test.
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_dev_test,
        y_dev_test,
        test_size=test_size / (dev_size + test_size),
        random_state=random_state,
        stratify=y_dev_test,
    )
    return X_train, X_dev, X_test, y_train, y_dev, y_test
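
A minimal usage sketch of the inspected function (hypothetical column names and values; assumes klib is installed):

import pandas as pd

from klib.preprocess import train_dev_test_split

# Hypothetical toy dataset: two features and a binary label.
df = pd.DataFrame(
    {
        "age": [23, 45, 31, 35, 52, 46, 28, 33, 61, 40],
        "income": [30, 80, 52, 61, 95, 77, 42, 50, 120, 65],
        "label": [0, 1, 0, 1, 1, 1, 0, 0, 1, 1],
    }
)

# An 80/10/10 split; passing the column name drops "label" from the features.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    df, "label", dev_size=0.1, test_size=0.1, random_state=408
)
print(X_train.shape, X_dev.shape, X_test.shape)  # (8, 2) (1, 2) (1, 2)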