GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 1ffeda...94eb5d )
by Andreas
01:28
created

klib.preprocess.preprocessing_pipe()   A

Complexity

Conditions 5

Size

Total Lines 23
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 9
nop 2
dl 0
loc 23
rs 9.3333
c 0
b 0
f 0
1
'''
2
Functions for data preprocessing.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import numpy as np
10
import pandas as pd
11
12
from sklearn.base import BaseEstimator, TransformerMixin
13
from sklearn.ensemble import ExtraTreesRegressor
14
from sklearn.experimental import enable_iterative_imputer  # noqa
15
from sklearn.impute import (SimpleImputer, IterativeImputer)
16
from sklearn.feature_selection import (f_classif,
17
                                       SelectPercentile,
18
                                       VarianceThreshold)
19
from sklearn.model_selection import train_test_split
20
from sklearn.pipeline import make_pipeline
21
from sklearn.preprocessing import OneHotEncoder, RobustScaler
22
23
from .utils import (_validate_input_int,
24
                    _validate_input_range,
25
                    _validate_input_sum)
26
27
28
# Names exported by ``from <module> import *``.
__all__ = [
    'feature_selection_pipe',
    'num_pipe',
    'cat_pipe',
    'train_dev_test_split',
]
32
33
34
class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that keeps either the numerical or the non-numerical columns of a DataFrame.

    Parameters
    ----------
    num: bool, default True
        If True, transform() returns the columns detected as numerical. If False, it returns all remaining \
        columns.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''
        Return the subset of columns of X matching the requested kind.

        The dtype detection runs on a temporary copy in which missing values are filled with the first mode of \
        each column before calling convert_dtypes(). The returned frame is sliced from the original X, so its \
        missing values are left untouched.
        '''
        modes = X.mode()

        # X.mode() has no rows when X is empty or consists solely of missing values; indexing its first row
        # would raise an IndexError, so the fill step is skipped in that case.
        filled = X.fillna(modes.iloc[0]) if len(modes) > 0 else X
        temp = filled.convert_dtypes()

        if self.num:
            selected = temp.select_dtypes(include=['number'])
        else:
            selected = temp.select_dtypes(exclude=['number'])

        return X[selected.columns.tolist()]
50
51
52
def feature_selection_pipe(var_thresh=None, select_percentile=None):
    '''
    Preprocessing operations for feature selection.

    Parameters
    ----------
    var_thresh: transformer, default None
        First step of the pipeline. If None, a new VarianceThreshold(threshold=0.1) is created.

    select_percentile: transformer, default None
        Second step of the pipeline. If None, a new SelectPercentile(f_classif, percentile=95) is created.

    Returns
    -------
    Pipeline: Pipeline built with make_pipeline from the two steps, in that order.
    '''

    # The default steps are created per call. Estimator instances used as default argument values would be
    # shared by every pipeline returned from this function, so fitting one pipeline would alter the others.
    if var_thresh is None:
        var_thresh = VarianceThreshold(threshold=0.1)
    if select_percentile is None:
        select_percentile = SelectPercentile(f_classif, percentile=95)

    return make_pipeline(var_thresh, select_percentile)
60
61
62
def num_pipe(imputer=None, scaler=None):
    '''
    Standard preprocessing operations on numerical data.

    Parameters
    ----------
    imputer: transformer, default None
        Imputation step. If None, a new IterativeImputer using an ExtraTreesRegressor (n_estimators=25, \
        n_jobs=4, random_state=408) as estimator and random_state=408 is created.

    scaler: transformer, default None
        Scaling step. If None, a new RobustScaler() is created.

    Returns
    -------
    Pipeline: Pipeline consisting of ColumnSelector() (numerical columns), the imputer and the scaler.
    '''

    # The default steps are created per call. Estimator instances used as default argument values would be
    # shared by every pipeline returned from this function, so fitting one pipeline would alter the others.
    if imputer is None:
        imputer = IterativeImputer(
            estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408),
            random_state=408)
    if scaler is None:
        scaler = RobustScaler()

    return make_pipeline(ColumnSelector(), imputer, scaler)
71
72
73
def cat_pipe(imputer=None, scaler=None):
    '''
    Set of standard preprocessing operations on categorical data.

    Parameters
    ----------
    imputer: transformer, default None
        Imputation step. If None, a new SimpleImputer(strategy='most_frequent') is created.

    scaler: transformer, default None
        Final step of the pipeline. If None, a new OneHotEncoder(handle_unknown='ignore') is created.

    Returns
    -------
    Pipeline: Pipeline consisting of ColumnSelector(num=False) (non-numerical columns), the imputer and the \
    scaler.
    '''

    # The default steps are created per call. Estimator instances used as default argument values would be
    # shared by every pipeline returned from this function, so fitting one pipeline would alter the others.
    if imputer is None:
        imputer = SimpleImputer(strategy='most_frequent')
    if scaler is None:
        scaler = OneHotEncoder(handle_unknown='ignore')

    return make_pipeline(ColumnSelector(num=False), imputer, scaler)
81
82
83
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1,
                         stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------

    data: 2D dataset to be split. Must be a Pandas DataFrame if target is given as a column name.

    target: string, list, np.array or pd.Series
        Labels belonging to data. If a string is provided, the column of that name is used as label and is \
        removed from the features. Otherwise the values are used as labels directly and must match data in \
        length.

    dev_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
        split.

    test_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
        split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels. The second \
        split (dev vs. test) is then stratified on the labels of the held-out part.

    random_state: integer, default 408
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs.
        (X_train, X_dev, X_test, y_train, y_dev, y_test) if both dev_size and test_size are non-zero, \
        otherwise the result of a single split: (X_train, X_holdout, y_train, y_holdout).
    '''

    # Validate Inputs
    # NOTE(review): the validators live in .utils; presumably they raise on out-of-range values, non-integer
    # seeds and a dev_size + test_size sum above 1 - confirm against their implementation.
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')
    _validate_input_sum(1, 'Sum of args', dev_size, test_size)

    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        # Lists and arrays carry no 'name' attribute, so nothing beyond the values is read from target.
        target_data = pd.Series(target)

    else:
        # Unsupported target types are passed on as empty labels, as before; train_test_split is expected to
        # reject the resulting length mismatch.
        target_data = []

    holdout_size = dev_size + test_size

    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=holdout_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Only stratify the second split if stratification was requested. Stratifying unconditionally fails for
    # continuous or sparsely populated labels although stratify=None was passed.
    second_stratify = y_dev_test if stratify is not None else None

    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size / holdout_size,
                                                    random_state=random_state,
                                                    stratify=second_stratify)
    return X_train, X_dev, X_test, y_train, y_dev, y_test
145