GitHub Access Token became invalid

It seems that the GitHub access token used to retrieve details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( bfbbb5...96b70c )
by Andreas
01:50
created

klib.preprocess   A

Complexity

Total Complexity 16

Size/Duplication

Total Lines 154
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 70
dl 0
loc 154
rs 10
c 0
b 0
f 0
wmc 16

4 Functions

Rating   Name   Duplication   Size   Complexity  
A preprocessing_pipe() 0 23 5
A cat_pipe() 0 7 1
B train_dev_test_split() 0 61 5
A num_pipe() 0 9 1

3 Methods

Rating   Name   Duplication   Size   Complexity  
A ColumnSelector.transform() 0 7 2
A ColumnSelector.__init__() 0 2 1
A ColumnSelector.fit() 0 2 1
1
'''
2
Functions for data preprocessing.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import numpy as np
10
import pandas as pd
11
12
from sklearn.base import BaseEstimator, TransformerMixin
13
from sklearn.ensemble import ExtraTreesRegressor
14
from sklearn.experimental import enable_iterative_imputer  # noqa
15
from sklearn.impute import SimpleImputer, IterativeImputer
16
from sklearn.model_selection import train_test_split
17
from sklearn.pipeline import make_pipeline, make_union
18
from sklearn.preprocessing import OneHotEncoder, RobustScaler
19
20
from .utils import _validate_input_int
21
from .utils import _validate_input_range
22
23
24
# Names exported by ``from <module> import *``.
__all__ = [
    'train_dev_test_split',
    'cat_pipe',
    'num_pipe',
    'preprocessing_pipe',
]
28
29
30
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: pd.DataFrame
        Feature data to split. When ``target`` is a column name, that column is read from ``data`` as the labels \
        and dropped from the returned feature sets.

    target: string, list, np.array or pd.Series
        If a string, the name of the label column inside ``data``. Otherwise the label values themselves, with \
        one entry per row of ``data``.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
        split.

    stratify: array-like, default None
        If not None, data is split in a stratified fashion, using the input as the class labels. Must have one \
        entry per row of ``data``. If None, neither split is stratified.

    random_state: integer, default 408
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: ``(X_train, X_dev, X_test, y_train, y_dev, y_test)``. If ``dev_size`` or ``test_size`` equals 0, only \
    one split is performed and ``(X_train, X_dev_test, y_train, y_dev_test)`` is returned instead.
    '''

    # Validate Inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        # Only the values are needed. Lists and arrays have no ``.name`` attribute, so it must not be accessed.
        target_data = pd.Series(target)

    holdout_size = dev_size + test_size

    if stratify is None:
        X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                    test_size=holdout_size,
                                                                    random_state=random_state)
        stratify_dev_test = None

    else:
        # Split the stratification labels along with the data so that the second split can be stratified on the
        # matching subset of the labels the caller supplied (which need not be the target).
        X_train, X_dev_test, y_train, y_dev_test, _, stratify_dev_test = train_test_split(
            data, target_data, stratify,
            test_size=holdout_size,
            random_state=random_state,
            stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Divide the held-out part into dev and test; test_size is rescaled relative to the held-out share.
    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size / holdout_size,
                                                    random_state=random_state,
                                                    stratify=stratify_dev_test)
    return X_train, X_dev, X_test, y_train, y_dev, y_test
91
92
93
class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that keeps either the numerical or the non-numerical columns of a DataFrame.

    Parameters:
    ----------
    num: bool, default True
        If True, ``transform`` returns the columns detected as numeric. If False, it returns all other columns.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        '''Nothing is learned from the data; return the transformer itself.'''
        return self

    def transform(self, X, y=None):
        '''Return the selected columns of ``X`` with their original (unfilled) values.'''
        # Column types are inferred on a temporary copy whose missing values are filled with the column mode,
        # presumably so that missing values do not distort the dtype inference of ``convert_dtypes``.
        modes = X.mode()

        # An empty mode frame (e.g. no rows, or only missing values) has no first row to fill with; skip filling
        # in that case rather than failing on ``iloc[0]``.
        filled = X if modes.empty else X.fillna(modes.iloc[0])
        temp = filled.convert_dtypes()

        if self.num:
            selected = temp.select_dtypes(include=['number'])
        else:
            selected = temp.select_dtypes(exclude=['number'])

        return X[selected.columns.tolist()]
109
110
111
# Sentinel marking "no imputer supplied". None is not used because a caller may pass None on purpose.
_CAT_PIPE_DEFAULT = object()


def cat_pipe(imputer=_CAT_PIPE_DEFAULT):
    '''Set of standard preprocessing operations on categorical data.

    Parameters:
    ----------
    imputer: estimator, default SimpleImputer(strategy='most_frequent')
        Imputation step placed between the column selection and the one-hot encoding. When omitted, a new \
        SimpleImputer is created on every call so that separate pipelines do not share one imputer instance.

    Returns
    -------
    Pipeline: ColumnSelector(num=False), the imputer and OneHotEncoder(handle_unknown='ignore').
    '''

    if imputer is _CAT_PIPE_DEFAULT:
        # Build the default per call: an instance created in the signature would be shared by every pipeline
        # returned from this function.
        imputer = SimpleImputer(strategy='most_frequent')

    return make_pipeline(ColumnSelector(num=False),
                         imputer,
                         OneHotEncoder(handle_unknown='ignore'))
118
119
120
# Sentinel marking "argument not supplied". None is not used because a caller may pass None on purpose.
_NUM_PIPE_DEFAULT = object()


def num_pipe(imputer=_NUM_PIPE_DEFAULT, scaler=_NUM_PIPE_DEFAULT):
    '''Set of standard preprocessing operations on numerical data.

    Parameters:
    ----------
    imputer: estimator, default IterativeImputer with an ExtraTreesRegressor estimator
        Imputation step applied after the column selection. When omitted, a new \
        IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), \
        random_state=408) is created on every call.

    scaler: estimator, default RobustScaler()
        Scaling step applied after imputation. When omitted, a new RobustScaler is created on every call.

    Returns
    -------
    Pipeline: ColumnSelector(), the imputer and the scaler.
    '''

    # Defaults are built per call: instances created in the signature would be shared by every pipeline
    # returned from this function.
    if imputer is _NUM_PIPE_DEFAULT:
        imputer = IterativeImputer(
            estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), random_state=408)

    if scaler is _NUM_PIPE_DEFAULT:
        scaler = RobustScaler()

    return make_pipeline(ColumnSelector(),
                         imputer,
                         scaler)
129
130
131
def preprocessing_pipe(num=True, cat=True):
    '''Set of standard preprocessing operations on numerical and categorical data.

    Parameters:
    ----------
    num: bool, default True
        Set to false if no numerical data is in the dataset.

    cat: bool, default True
        Set to false if no categorical data is in the dataset.

    Returns
    -------
    A union of the numerical and categorical pipelines if both flags are set, the single matching pipeline if \
    only one flag is set, or None if neither is set.
    '''

    # Both kinds of data: run the two pipelines side by side and concatenate their outputs.
    if num and cat:
        return make_union(num_pipe(), cat_pipe(), n_jobs=4)

    if num:
        return num_pipe()

    if cat:
        return cat_pipe()

    # Neither kind requested.
    return None
154