Passed: Push to master (b2514e...e55ee5) by Andreas, 05:22
klib.preprocess.preprocessing_pipe()   Rating: A

Complexity:   Conditions 5
Size:         Total Lines 22, Code Lines 8
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric    Value
cc        5
eloc      8
nop       2
dl        0
loc       22
rs        9.3333
c         0
b         0
f         0
'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, RobustScaler

from .describe import corr_mat
from .utils import _missing_vals, _validate_input_int, _validate_input_range

def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
    their correlation with other features and the target variable. This function follows a three-step process:
    - 1) Identify features with a high ratio of missing values.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and a high correlation among each other are dropped unless \
         they correlate reasonably well with the target variable.

    Note: If no target is provided, the process exits after step two and drops the columns identified up to this point.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Specify a target for correlation, e.g. a label column, to generate only the correlations between each \
        feature and the label.

    mv_threshold: float, default 0.1
        Value in the range 0 <= mv_threshold <= 1. Features with a missing-value ratio larger than mv_threshold are \
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value in the range 0 <= corr_thresh_features <= 1. Maximum correlation a previously identified feature with \
        a high mv-ratio is allowed to have with another feature. If this threshold is exceeded, the feature undergoes \
        further analysis.

    corr_thresh_target: float, default 0.3
        Value in the range 0 <= corr_thresh_target <= 1. Minimum required correlation of a remaining feature (i.e. a \
        feature with a high mv-ratio and a high correlation with another existing feature) with the target. If this \
        threshold is not met, the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    cols_mv: Columns with missing values included in the analysis
    drop_cols: List of dropped columns
    '''

    # Validate inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()
    data_local = data.copy()

    # Step 1: flag columns whose missing-value ratio exceeds the threshold and
    # replace them with binary indicators (1 = value present, 0 = missing).
    mv_ratios = _missing_vals(data_local)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)

    # Step 2: collect flagged columns whose strongest correlation with any other
    # feature exceeds corr_thresh_features.
    high_corr_features = []
    data_temp = data_local.copy()
    for col in cols_mv:
        corrmat = corr_mat(data_temp, colored=False)
        if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    # Step 3: drop the collected columns unless they correlate reasonably well
    # with the target. Without a target, all collected columns are dropped.
    drop_cols = []
    if target is None:
        data = data.drop(columns=high_corr_features)
        drop_cols = high_corr_features
    else:
        for col in high_corr_features:
            if pd.DataFrame(data_local[col]).corrwith(target)[0] < corr_thresh_target:
                drop_cols.append(col)
                data = data.drop(columns=[col])

    return data, cols_mv, drop_cols

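A minimal usage sketch (the toy DataFrame, column names and data below are illustrative, not part of the library): a column with many missing values is flagged and, depending on its correlations, dropped or kept.

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(408)
    df = pd.DataFrame({'a': rng.normal(size=200), 'c': rng.normal(size=200)})
    df['b'] = df['a'].where(rng.random(200) < 0.3)  # roughly 70 % missing values
    label = pd.Series(2 * df['a'] + rng.normal(size=200), name='label')

    cleaned, cols_mv, dropped = mv_col_handler(df, target=label)
    print(cols_mv)  # columns whose mv-ratio exceeded mv_threshold, e.g. ['b']
    print(dropped)  # the subset of those columns that was ultimately dropped
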
def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame.

    target: string, list, np.array or pd.Series
        Specify the label column. If a string is provided, the column of that name is used as the label and dropped \
        from the data; otherwise the provided values are used as labels directly.

    dev_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev split.

    test_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels.

    random_state: integer, default 408
        Seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing the train-dev-test split of the inputs.
    '''

    # Validate inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        # Use the named column as labels and drop it from the features.
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)
        target = target_data.name  # lists and arrays carry no .name attribute, so read it off the Series

    # First split: separate the combined dev+test share from the training data.
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size+test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    else:
        # Second split: divide the held-out share into dev and test sets, keeping
        # the stratification only if it was requested in the first place.
        X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                        test_size=test_size/(dev_size+test_size),
                                                        random_state=random_state,
                                                        stratify=y_dev_test if stratify is not None else None)
        return X_train, X_dev, X_test, y_train, y_dev, y_test

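A minimal usage sketch (hypothetical toy data; the column names are illustrative): an 80/10/10 split with stratification on the label.

    import pandas as pd

    df = pd.DataFrame({'x1': range(100), 'x2': range(100, 200), 'label': [0, 1] * 50})
    X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
        df, 'label', dev_size=0.1, test_size=0.1, stratify=df['label'])
    print(len(X_train), len(X_dev), len(X_test))  # 80 10 10
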
class ColumnSelector(BaseEstimator, TransformerMixin):
    '''Selects either the numerical or the categorical columns of a DataFrame.'''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Temporarily impute with the column modes so convert_dtypes() can infer
        # each column's dtype, then select the matching columns from the original data.
        temp = X.fillna(X.mode().iloc[0]).convert_dtypes()

        if self.num:
            return X[temp.select_dtypes(include=['number']).columns.tolist()]
        else:
            return X[temp.select_dtypes(exclude=['number']).columns.tolist()]

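A quick sketch of the selector on a toy frame (illustrative data only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'num': [1.0, 2.0, np.nan], 'cat': ['a', 'b', None]})
    print(ColumnSelector(num=True).fit_transform(df).columns.tolist())   # ['num']
    print(ColumnSelector(num=False).fit_transform(df).columns.tolist())  # ['cat']
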
def cat_pipe(imputer=SimpleImputer(strategy='most_frequent')):
    '''Set of standard preprocessing operations on categorical data.'''

    cat_pipe = make_pipeline(ColumnSelector(num=False),
                             imputer,
                             OneHotEncoder(handle_unknown='ignore'))
    return cat_pipe

def num_pipe(imputer=IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), random_state=408),
        scaler=RobustScaler()):
    '''Set of standard preprocessing operations on numerical data.'''

    num_pipe = make_pipeline(ColumnSelector(),
                             imputer,
                             scaler)
    return num_pipe

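A brief sketch of the numerical pipeline on toy data with missing values (illustrative data; by default, values are imputed with an IterativeImputer backed by an ExtraTreesRegressor, then robust-scaled):

    import numpy as np
    import pandas as pd

    df_num = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0], 'b': [10.0, np.nan, 30.0, 40.0]})
    X = num_pipe().fit_transform(df_num)  # missing values imputed, then scaled
    print(X.shape)  # (4, 2)
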
def preprocessing_pipe(num=True, cat=True):
    '''Set of standard preprocessing operations on numerical and categorical data.

    Parameters
    ----------
    num: bool, default True
        Set to False if no numerical data is in the dataset.

    cat: bool, default True
        Set to False if no categorical data is in the dataset.
    '''

    if num and cat:
        pipe = make_union(num_pipe(), cat_pipe(), n_jobs=4)

    elif num:
        pipe = num_pipe()

    elif cat:
        pipe = cat_pipe()

    else:
        # Guard: with num=False and cat=False, pipe would otherwise be undefined
        # (flagged by inspection: 'The variable pipe does not seem to be defined
        # for all execution paths.').
        raise ValueError('At least one of num and cat must be True.')

    return pipe

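A minimal end-to-end sketch on a mixed toy frame (hypothetical data; the result is a feature union of the imputed-and-scaled numerical block and the one-hot-encoded categorical block):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'age': [25, 32, np.nan, 41],
                       'income': [50000.0, np.nan, 72000.0, 61000.0],
                       'city': ['Berlin', 'Hamburg', np.nan, 'Berlin']})
    pipe = preprocessing_pipe()
    X = pipe.fit_transform(df)
    print(X.shape)  # (4, numerical columns + one-hot columns)
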
class MVColHandler(BaseEstimator, TransformerMixin):
    '''Transformer wrapper around mv_col_handler(); a possible component of a cleaning pipeline that typically \
    follows DataCleaning.'''

    def __init__(self, target=None, mch_mv_thresh=0.1, mch_feature_thresh=0.6, mch_target_thresh=0.3):
        self.target = target
        self.mch_mv_thresh = mch_mv_thresh
        self.mch_feature_thresh = mch_feature_thresh
        self.mch_target_thresh = mch_target_thresh

    def fit(self, data, target=None):
        return self

    def transform(self, data, target=None):
        # Delegate to mv_col_handler() with the thresholds set on the instance.
        data, cols_mv, dropped_cols = mv_col_handler(data, target=self.target, mv_threshold=self.mch_mv_thresh,
                                                     corr_thresh_features=self.mch_feature_thresh,
                                                     corr_thresh_target=self.mch_target_thresh)

        print(f'\nFeatures with MV-ratio > {self.mch_mv_thresh}: {len(cols_mv)}')
        print('Features dropped:', len(dropped_cols), dropped_cols)

        return data
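
A minimal sketch of the transformer inside a scikit-learn pipeline (toy data; in practice a preceding cleaning step such as DataCleaning could come first):

    import numpy as np
    import pandas as pd
    from sklearn.pipeline import make_pipeline

    rng = np.random.default_rng(408)
    df = pd.DataFrame({'a': rng.normal(size=100), 'b': rng.normal(size=100)})
    df.loc[rng.random(100) < 0.5, 'b'] = np.nan  # make 'b' roughly 50 % missing

    cleaner = make_pipeline(MVColHandler(mch_mv_thresh=0.3))
    df_clean = cleaner.fit_transform(df)  # prints a short summary and returns the updated frame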