klib.preprocess.PipeInfo.transform() - Code Metrics - Inspection of "introduce pipe_info" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 9934a3...be544d )

by Andreas

created 2020-05-10 12:58 UTC

klib.preprocess.PipeInfo.transform() A

↳ Parent: klib.preprocess

Complexity

Conditions

Size

Total Lines	3
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	3
nop	3
dl	0
loc	3
rs	10
c	0
b	0
f	0

'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import (SimpleImputer, IterativeImputer)
from sklearn.feature_selection import (f_classif,
                                       SelectFromModel,
                                       SelectPercentile,
                                       VarianceThreshold)
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (OneHotEncoder,
                                   RobustScaler,
                                   MaxAbsScaler)

from .utils import (_validate_input_int,
                    _validate_input_range,
                    _validate_input_sum)


# Names exported on a wildcard import of this module. The helper transformers ColumnSelector and PipeInfo defined
# below are intentionally not part of this list.
__all__ = ['feature_selection_pipe',
           'num_pipe',
           'cat_pipe',
           'train_dev_test_split']


class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer which keeps either the numerical or the non-numerical (categorical) columns of a dataset.

    The dtype of every column is inferred on a temporary copy of the data: missing values are filled with the first
    mode of the respective column and convert_dtypes() is applied afterwards. The columns which are returned are
    taken from the unmodified input, i.e. missing values are still present in the result.

    Parameter:
    ---------
    num: bool, default True
        Select only numerical columns. If num = False, only the remaining (categorical) columns are selected.

    Returns:
    -------
    Dataset containing only numerical or categorical data.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        # Nothing is learned from the data, the selection happens in transform().
        return self

    def transform(self, X, y=None):
        # Fill the gaps on a temporary copy only, so that convert_dtypes() is able to infer a dtype per column.
        most_frequent = X.mode().iloc[0]
        inferred = X.fillna(most_frequent).convert_dtypes()

        if self.num:
            selected = inferred.select_dtypes(include=['number'])
        else:
            selected = inferred.select_dtypes(exclude=['number'])

        # Index the original data so the temporarily filled values do not leak into the output.
        return X[selected.columns.tolist()]


class PipeInfo(BaseEstimator, TransformerMixin):
    '''
    Pass-through step which reports the shape of the data at its position inside a pipeline. The step name and the
    shape of the data are printed each time transform() is called; the data itself is returned unchanged.

    Parameter:
    ---------
    name: string, default None
        Label of the current step which is included in the printed message.

    Returns:
    -------
    Data: The input is handed on without modification.
    '''

    def __init__(self, name=None):
        self.name = name

    def fit(self, X, y=None):
        # Nothing to learn, this step only reports.
        return self

    def transform(self, X, y=None):
        message = f'Step: {self.name} --- Shape: {X.shape}'
        print(message)
        return X


def cat_pipe(imputer=SimpleImputer(strategy='most_frequent'),
             encoder=OneHotEncoder(handle_unknown='ignore'),
             scaler=MaxAbsScaler(),
             encoder_info=PipeInfo(name='after encoding')):
    '''
    Build the standard preprocessing pipeline for categorical data. The steps are, in this order: selection of the
    categorical columns, imputation, encoding, a report of the shape after encoding, and scaling.

    Note: the default step objects are created once, when this function is defined, and the very same objects are
    handed to every pipeline which relies on the defaults.

    Parameters:
    ----------
    imputer: default, SimpleImputer(strategy='most_frequent')
        Step used to fill in missing values.

    encoder: default, OneHotEncoder(handle_unknown='ignore')
        Encode categorical features as a one-hot numeric array.

    scaler: default, MaxAbsScaler()
        Scale each feature by its maximum absolute value. MaxAbsScaler() does not shift/center the data, and thus
        does not destroy any sparsity. It is recommended to check for outliers before applying MaxAbsScaler().

    encoder_info: default, PipeInfo(name='after encoding')
        Step placed directly after the encoder which prints the shape of the encoded data.

    Returns:
    -------
    Pipeline
    '''

    steps = (ColumnSelector(num=False), imputer, encoder, encoder_info, scaler)
    return make_pipeline(*steps)


def feature_selection_pipe(
        var_thresh=VarianceThreshold(threshold=0.1),
        select_from_model=SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median"),
        select_percentile=SelectPercentile(f_classif, percentile=95),
        var_thresh_info=PipeInfo(name='after var_thresh'),
        select_from_model_info=PipeInfo(name='after select_from_model'),
        select_percentile_info=PipeInfo(name='after select_percentile')):
    '''
    Build the feature selection pipeline. Three selection steps are chained and each of them is directly followed
    by a reporting step which prints the shape of the remaining data.

    Note: the default step objects are created once, when this function is defined, and the very same objects are
    handed to every pipeline which relies on the defaults.

    Parameters:
    ----------
    var_thresh: default, VarianceThreshold(threshold=0.1)
        Specify a threshold to drop low variance features.

    select_from_model: default, SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median")
        Specify an estimator which is used for selecting features based on importance weights.

    select_percentile: default, SelectPercentile(f_classif, percentile=95)
        Specify a score-function and a percentile value of features to keep.

    var_thresh_info: default, PipeInfo(name='after var_thresh')
        Reporting step placed after var_thresh.

    select_from_model_info: default, PipeInfo(name='after select_from_model')
        Reporting step placed after select_from_model.

    select_percentile_info: default, PipeInfo(name='after select_percentile')
        Reporting step placed after select_percentile.

    Returns:
    -------
    Pipeline
    '''

    # Every selector is immediately followed by its reporting step.
    steps = (
        var_thresh, var_thresh_info,
        select_from_model, select_from_model_info,
        select_percentile, select_percentile_info,
    )
    return make_pipeline(*steps)


def num_pipe(imputer=IterativeImputer(estimator=ExtraTreesRegressor(
        n_estimators=25, n_jobs=4, random_state=408), random_state=408),
        scaler=RobustScaler()):
    '''
    Build the standard preprocessing pipeline for numerical data. The steps are, in this order: selection of the
    numerical columns, imputation and scaling.

    Note: the default step objects are created once, when this function is defined, and the very same objects are
    handed to every pipeline which relies on the defaults.

    Parameters:
    ----------
    imputer: default, IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408),
                                       random_state=408)
        Step used to fill in missing values.

    scaler: default, RobustScaler()
        Step used to scale the imputed features.

    Returns:
    -------
    Pipeline
    '''

    steps = (ColumnSelector(), imputer, scaler)
    return make_pipeline(*steps)


def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: 2D dataset which can be split by sklearn's train_test_split(), e.g. a Pandas DataFrame. If target is a
    string, data has to support data[target] and data.drop(target, axis=1), as a Pandas DataFrame does.

    target: string, list, np.array or pd.Series
        Specify the labels. A string is used as the name of the label column in data; this column is removed from
        the features. A list, np.array or pd.Series is converted to a pd.Series and used as the labels.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test
        split.

    stratify: array-like of class labels with one entry per sample, default None
        If not None, both splits are performed in a stratified fashion, using the input as the class labels. If
        None, no stratification takes place.

    random_state: integer, default 408
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: (X_train, X_dev, X_test, y_train, y_dev, y_test). If dev_size or test_size is 0, only a single hold-out
    set is created and (X_train, X_holdout, y_train, y_holdout) is returned instead.

    Raises
    ------
    ValueError: If target is neither a string, list, np.array nor pd.Series.
    '''

    # Validate Inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')
    _validate_input_sum(1, 'Dev and test', dev_size, test_size)

    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    else:
        # Fail early with a clear message instead of handing an empty placeholder on to train_test_split().
        raise ValueError(
            f'target has to be a string, list, np.array or pd.Series, got {type(target).__name__}.')

    holdout_size = dev_size + test_size

    # The stratification labels are split alongside the data so that the matching subset is available for the
    # second split. Adding an array does not influence which rows end up in which part.
    arrays = [data, target_data]
    if stratify is not None:
        arrays.append(stratify)

    parts = train_test_split(*arrays,
                             test_size=holdout_size,
                             random_state=random_state,
                             stratify=stratify)
    X_train, X_dev_test, y_train, y_dev_test = parts[:4]

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Only stratify the second split if stratification was requested. Stratifying unconditionally on the labels
    # breaks for continuous targets and for classes with a single member in the hold-out part.
    stratify_dev_test = parts[5] if stratify is not None else None

    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size / holdout_size,
                                                    random_state=random_state,
                                                    stratify=stratify_dev_test)
    return X_train, X_dev, X_test, y_train, y_dev, y_test


1			'''
2			Functions for data preprocessing.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import numpy as np
10			import pandas as pd
11
12			from sklearn.base import BaseEstimator, TransformerMixin
13			from sklearn.ensemble import ExtraTreesRegressor
14			from sklearn.experimental import enable_iterative_imputer # noqa
15			from sklearn.impute import (SimpleImputer, IterativeImputer)
16			from sklearn.feature_selection import (f_classif,
17			SelectFromModel,
18			SelectPercentile,
19			VarianceThreshold)
20			from sklearn.linear_model import LassoCV
21			from sklearn.model_selection import train_test_split
22			from sklearn.pipeline import make_pipeline
23			from sklearn.preprocessing import (OneHotEncoder,
24			RobustScaler,
25			MaxAbsScaler)
26
27			from .utils import (_validate_input_int,
28			_validate_input_range,
29			_validate_input_sum)
30
31
32			__all__ = ['feature_selection_pipe',
33			'num_pipe',
34			'cat_pipe',
35			'train_dev_test_split']
36
37
38			class ColumnSelector(BaseEstimator, TransformerMixin):
39			'''
40			Determines and selects numerical and categorical columns from a dataset based on their supposed dtype. Unlike \
41			sklearn's make_column_selector() missing values are temporarily filled in to allow convert_dtypes() to determine \
42			the dtype of a column.
43
44			Parameter:
45			---------
46			num: default, True
47			Select only numerica Columns. If num = False, only categorical columns are selected.
48
49			Returns:
50			-------
51			Dataset containing only numerical or categorical data.
52			'''
53
54			def __init__(self, num=True):
55			self.num = num
56
57			def fit(self, X, y=None):
58			return self
59
60			def transform(self, X, y=None):
61			temp = X.fillna(X.mode().iloc[0]).convert_dtypes()
62
63			if self.num:
64			return X[temp.select_dtypes(include=['number']).columns.tolist()]
65			else:
66			return X[temp.select_dtypes(exclude=['number']).columns.tolist()]
67
68
69			class PipeInfo(BaseEstimator, TransformerMixin):
70			'''
71			Prints intermediary information about the dataset from within a pipeline. Include at any point in a Pipeline to
72			print out the shape of the dataset at this point.
73
74			Parameter:
75			---------
76			name: string, default None
77			Provide a name for the current step.
78
79			Returns:
80			-------
81			Data: Data is being passed through.
82			'''
83
84			def __init__(self, name=None):
85			self.name = name
86
87			def fit(self, X, y=None):
88			return self
89
90			def transform(self, X, y=None):
91			print(f'Step: {self.name} --- Shape: {X.shape}')
92			return X
93
94
95			def cat_pipe(imputer=SimpleImputer(strategy='most_frequent'),
96			encoder=OneHotEncoder(handle_unknown='ignore'),
97			scaler=MaxAbsScaler(),
98			encoder_info=PipeInfo(name='after encoding')):
99			'''
100			Standard preprocessing operations on categorical data.
101
102			Parameters:
103			----------
104			imputer: default, SimpleImputer(strategy='most_frequent')
105
106			encoder: default, OneHotEncoder(handle_unknown='ignore')
107			Encode categorical features as a one-hot numeric array.
108
109			scaler: default, MaxAbsScaler()
110			Scale each feature by its maximum absolute value. MaxAbsScaler() does not shift/center the data, and thus does \
111			not destroy any sparsity. It is recommended to check for outliers before applying MaxAbsScaler().
112
113
114			Returns:
115			-------
116			Pipeline
117			'''
118
119			cat_pipe = make_pipeline(ColumnSelector(num=False),
120			imputer,
121			encoder, encoder_info,
122			scaler)
123			return cat_pipe
124
125
126			def feature_selection_pipe(
127			var_thresh=VarianceThreshold(threshold=0.1),
128			select_from_model=SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median"),
129			select_percentile=SelectPercentile(f_classif, percentile=95),
130			var_thresh_info=PipeInfo(name='after var_thresh'),
131			select_from_model_info=PipeInfo(name='after select_from_model'),
132			select_percentile_info=PipeInfo(name='after select_percentile')):
133			'''
134			Preprocessing operations for feature selection.
135
136			Parameters:
137			----------
138			var_thresh: default, VarianceThreshold(threshold=0.1)
139			Specify a threshold to drop low variance features.
140
141			select_from_model: default, SelectFromModel(LassoCV(cv=4, random_state=408), threshold="0.1*median")
142			Specify an estimator which is used for selecting features based on importance weights.
143
144			select_percentile: default, SelectPercentile(f_classif, percentile=95)
145			Specify a score-function and a percentile value of features to keep.
146
147			Returns:
148			-------
149			Pipeline
150			'''
151
152			feature_selection_pipe = make_pipeline(var_thresh, var_thresh_info,
153			select_from_model, select_from_model_info,
154			select_percentile, select_percentile_info)
155			return feature_selection_pipe
156
157
158			def num_pipe(imputer=IterativeImputer(estimator=ExtraTreesRegressor(
159			n_estimators=25, n_jobs=4, random_state=408), random_state=408),
160			scaler=RobustScaler()):
161			'''
162			Standard preprocessing operations on numerical data.
163
164			Parameters:
165			----------
166			imputer: default, IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), \
167			random_state=408)
168
169			scaler: default, RobustScaler()
170
171			Returns:
172			-------
173			Pipeline
174			'''
175
176			num_pipe = make_pipeline(ColumnSelector(),
177			imputer,
178			scaler)
179			return num_pipe
180
181
182			def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
183			'''
184			Split a dataset and a label column into train, dev and test sets.
185
186			Parameters:
187			----------
188
189			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
190			information is used to label the plots.
191
192			target: string, list, np.array or pd.Series, default None
193			Specify target for correlation. E.g. label column to generate only the correlations between each feature \
194			and the label.
195
196			dev_size: float, default 0.1
197			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
198			split.
199
200			test_size: float, default 0.1
201			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
202			split.
203
204			stratify: target column, default None
205			If not None, data is split in a stratified fashion, using the input as the class labels.
206
207			random_state: integer, default 408
208			Random_state is the seed used by the random number generator.
209
210			Returns
211			-------
212			tuple: Tuple containing train-dev-test split of inputs.
213			'''
214
215			# Validate Inputs
216			_validate_input_range(dev_size, 'dev_size', 0, 1)
217			_validate_input_range(test_size, 'test_size', 0, 1)
218			_validate_input_int(random_state, 'random_state')
219			_validate_input_sum(1, 'Dev and test', dev_size, test_size)
220
221			target_data = []
222			if isinstance(target, str):
223			target_data = data[target]
224			data = data.drop(target, axis=1)
225
226			elif isinstance(target, (list, pd.Series, np.ndarray)):
227			target_data = pd.Series(target)
228
229			X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
230			test_size=dev_size+test_size,
231			random_state=random_state,
232			stratify=stratify)
233
234			if (dev_size == 0) or (test_size == 0):
235			return X_train, X_dev_test, y_train, y_dev_test
236
237			else:
238			X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
239			test_size=test_size/(dev_size+test_size),
240			random_state=random_state,
241			stratify=y_dev_test)
242			return X_train, X_dev, X_test, y_train, y_dev, y_test
243

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 9934a3...be544d )

klib.preprocess.PipeInfo.transform() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like