klib.preprocess.train_dev_test_split() - Code Metrics - Inspection of "test" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 1ffeda...94eb5d )

by Andreas

created 2020-04-29 09:13 UTC

klib.preprocess.train_dev_test_split() B

↳ Parent: klib.preprocess

Complexity

Conditions

Size

Total Lines	62
Code Lines	23

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	5
eloc	23
nop	6
dl	0
loc	62
rs	8.8613
c	0
b	0
f	0

How to fix Long Method

'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import (SimpleImputer, IterativeImputer)
from sklearn.feature_selection import (f_classif,
                                       SelectPercentile,
                                       VarianceThreshold)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

from .utils import (_validate_input_int,
                    _validate_input_range,
                    _validate_input_sum)


# Names exported by `from <module> import *`. The ColumnSelector class defined
# in this module is not part of this list.
__all__ = ['feature_selection_pipe',
           'num_pipe',
           'cat_pipe',
           'train_dev_test_split']


class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that keeps either the numerical or the non-numerical columns of a DataFrame.

    Parameters
    ----------
    num: bool, default True
        If True, transform() returns the columns detected as numerical. If False, it returns all other columns.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        '''Return the selector itself; nothing is learned from X or y.'''
        return self

    def transform(self, X, y=None):
        '''
        Return the columns of X that match the kind requested via `num`.

        Parameters
        ----------
        X: pd.DataFrame
            Data to select columns from. It is not modified; missing values are kept in the returned frame.

        y: ignored

        Returns
        -------
        pd.DataFrame: X restricted to the selected columns.
        '''

        # Missing values are filled with each column's most frequent value on a temporary copy only, so that
        # convert_dtypes() decides on the column kind from complete data.
        modes = X.mode()

        # mode() has no rows when there is nothing to count (e.g. zero rows or only missing values); indexing
        # its first row would raise IndexError, so the fill is skipped in that case.
        filled = X.fillna(modes.iloc[0]) if len(modes) else X
        temp = filled.convert_dtypes()

        if self.num:
            return X[temp.select_dtypes(include=['number']).columns.tolist()]
        return X[temp.select_dtypes(exclude=['number']).columns.tolist()]


def feature_selection_pipe(var_thresh=None, select_percentile=None):
    '''
    Build a pipeline of preprocessing operations for feature selection.

    Parameters
    ----------
    var_thresh: transformer, default None
        First step of the pipeline. If None, VarianceThreshold(threshold=0.1) is used.

    select_percentile: transformer, default None
        Second step of the pipeline. If None, SelectPercentile(f_classif, percentile=95) is used.

    Returns
    -------
    Pipeline: the result of make_pipeline(var_thresh, select_percentile).
    '''

    # The default estimators are created on every call. Building them in the signature would create them once
    # at import time and share the very same (stateful) objects between all pipelines returned by this function.
    if var_thresh is None:
        var_thresh = VarianceThreshold(threshold=0.1)
    if select_percentile is None:
        select_percentile = SelectPercentile(f_classif, percentile=95)

    return make_pipeline(var_thresh, select_percentile)


def num_pipe(imputer=None, scaler=None):
    '''
    Build a pipeline of standard preprocessing operations on numerical data.

    The pipeline consists of ColumnSelector() followed by the imputer and the scaler.

    Parameters
    ----------
    imputer: transformer, default None
        Imputation step. If None, an IterativeImputer(random_state=408) based on
        ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408) is used.

    scaler: transformer, default None
        Scaling step. If None, RobustScaler() is used.

    Returns
    -------
    Pipeline: the result of make_pipeline(ColumnSelector(), imputer, scaler).
    '''

    # The default estimators are created on every call. Building them in the signature would create them once
    # at import time and share the very same (stateful) objects between all pipelines returned by this function.
    if imputer is None:
        imputer = IterativeImputer(
            estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408),
            random_state=408)
    if scaler is None:
        scaler = RobustScaler()

    return make_pipeline(ColumnSelector(), imputer, scaler)


def cat_pipe(imputer=None, scaler=None):
    '''
    Build a pipeline of standard preprocessing operations on categorical data.

    The pipeline consists of ColumnSelector(num=False) followed by the imputer and the encoder.

    Parameters
    ----------
    imputer: transformer, default None
        Imputation step. If None, SimpleImputer(strategy='most_frequent') is used.

    scaler: transformer, default None
        Final step of the pipeline. If None, OneHotEncoder(handle_unknown='ignore') is used. The parameter is
        named `scaler` to mirror num_pipe().

    Returns
    -------
    Pipeline: the result of make_pipeline(ColumnSelector(num=False), imputer, scaler).
    '''

    # The default estimators are created on every call. Building them in the signature would create them once
    # at import time and share the very same (stateful) objects between all pipelines returned by this function.
    if imputer is None:
        imputer = SimpleImputer(strategy='most_frequent')
    if scaler is None:
        scaler = OneHotEncoder(handle_unknown='ignore')

    return make_pipeline(ColumnSelector(num=False), imputer, scaler)


def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: 2D dataset holding the features.
        If target is given as a string, data must support data[target] and data.drop(target, axis=1), e.g. a
        Pandas DataFrame.

    target: string, list, np.array or pd.Series
        The labels. A string is interpreted as the name of the label column in data; that column is used as
        the labels and removed from the features. A list, np.array or pd.Series is used as the labels directly.

    dev_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev split.

    test_size: float, default 0.1
        Should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.

    stratify: array-like, default None
        If not None, both splitting stages are stratified using these values as the class labels. If None,
        neither stage is stratified.

    random_state: integer, default 408
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: (X_train, X_dev, X_test, y_train, y_dev, y_test). If dev_size or test_size is 0, only a single split
    is performed and a 4-tuple (X_train, X_holdout, y_train, y_holdout) is returned instead.

    Raises
    ------
    ValueError: if target is not a string, list, np.array or pd.Series.
    '''

    # Validate Inputs (helpers come from .utils; presumably they raise on invalid values - see their definitions)
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')
    _validate_input_sum(1, 'Sum of args', dev_size, test_size)

    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        # Lists and arrays carry no `.name`, so nothing but the values is taken from the provided labels.
        target_data = pd.Series(target)

    else:
        # Fail early with a clear message instead of passing empty labels on to train_test_split.
        raise ValueError(
            "target must be a column name (str), list, np.ndarray or pd.Series, "
            f"got {type(target).__name__}.")

    # Stage 1: separate the training data from the combined dev+test hold-out. The stratification labels are
    # split alongside the data so that the subset matching the hold-out is available for stage 2.
    arrays = [data, target_data]
    if stratify is not None:
        arrays.append(stratify)

    parts = train_test_split(*arrays,
                             test_size=dev_size+test_size,
                             random_state=random_state,
                             stratify=stratify)
    X_train, X_dev_test, y_train, y_dev_test = parts[:4]

    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Stage 2: divide the hold-out into dev and test. Stratify only if the caller asked for it, using the
    # hold-out part of the stratification labels (parts[5]).
    stratify_dev_test = parts[5] if stratify is not None else None

    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size/(dev_size+test_size),
                                                    random_state=random_state,
                                                    stratify=stratify_dev_test)
    return X_train, X_dev, X_test, y_train, y_dev, y_test


1			'''
2			Functions for data preprocessing.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import numpy as np
10			import pandas as pd
11
12			from sklearn.base import BaseEstimator, TransformerMixin
13			from sklearn.ensemble import ExtraTreesRegressor
14			from sklearn.experimental import enable_iterative_imputer # noqa
15			from sklearn.impute import (SimpleImputer, IterativeImputer)
16			from sklearn.feature_selection import (f_classif,
17			SelectPercentile,
18			VarianceThreshold)
19			from sklearn.model_selection import train_test_split
20			from sklearn.pipeline import make_pipeline
21			from sklearn.preprocessing import OneHotEncoder, RobustScaler
22
23			from .utils import (_validate_input_int,
24			_validate_input_range,
25			_validate_input_sum)
26
27
28			__all__ = ['feature_selection_pipe',
29			'num_pipe',
30			'cat_pipe',
31			'train_dev_test_split']
32
33
34			class ColumnSelector(BaseEstimator, TransformerMixin):
35			''''''
36
37			def __init__(self, num=True):
38			self.num = num
39
40			def fit(self, X, y=None):
41			return self
42
43			def transform(self, X, y=None):
44			temp = X.fillna(X.mode().iloc[0]).convert_dtypes()
45
46			if self.num:
47			return X[temp.select_dtypes(include=['number']).columns.tolist()]
48			else:
49			return X[temp.select_dtypes(exclude=['number']).columns.tolist()]
50
51
52			def feature_selection_pipe(
53			var_thresh=VarianceThreshold(threshold=0.1),
54			select_percentile=SelectPercentile(f_classif, percentile=95)):
55			'''Preprocessing operations for feature selection.'''
56
57			feature_selection_pipe = make_pipeline(var_thresh,
58			select_percentile)
59			return feature_selection_pipe
60
61
62			def num_pipe(imputer=IterativeImputer(estimator=ExtraTreesRegressor(
63			n_estimators=25, n_jobs=4, random_state=408), random_state=408),
64			scaler=RobustScaler()):
65			'''Standard preprocessing operations on numerical data.'''
66
67			num_pipe = make_pipeline(ColumnSelector(),
68			imputer,
69			scaler)
70			return num_pipe
71
72
73			def cat_pipe(imputer=SimpleImputer(strategy='most_frequent'),
74			scaler=OneHotEncoder(handle_unknown='ignore')):
75			'''Set of standard preprocessing operations on categorical data.'''
76
77			cat_pipe = make_pipeline(ColumnSelector(num=False),
78			imputer,
79			scaler)
80			return cat_pipe
81
82
83			def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
84			'''
85			Split a dataset and a label column into train, dev and test sets.
86
87			Parameters:
88			----------
89
90			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
91			information is used to label the plots.
92
93			target: string, list, np.array or pd.Series, default None
94			Specify target for correlation. E.g. label column to generate only the correlations between each feature \
95			and the label.
96
97			dev_size: float, default 0.1
98			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
99			split.
100
101			test_size: float, default 0.1
102			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
103			split.
104
105			stratify: target column, default None
106			If not None, data is split in a stratified fashion, using the input as the class labels.
107
108			random_state: integer, default 408
109			Random_state is the seed used by the random number generator.
110
111			Returns
112			-------
113			tuple: Tuple containing train-dev-test split of inputs.
114			'''
115
116			# Validate Inputs
117			_validate_input_range(dev_size, 'dev_size', 0, 1)
118			_validate_input_range(test_size, 'test_size', 0, 1)
119			_validate_input_int(random_state, 'random_state')
120			_validate_input_sum(1, 'Sum of args', dev_size, test_size)
121
122			target_data = []
123			if isinstance(target, str):
124			target_data = data[target]
125			data = data.drop(target, axis=1)
126
127			elif isinstance(target, (list, pd.Series, np.ndarray)):
128			target_data = pd.Series(target)
129			target = target.name
130
131			X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
132			test_size=dev_size+test_size,
133			random_state=random_state,
134			stratify=stratify)
135
136			if (dev_size == 0) or (test_size == 0):
137			return X_train, X_dev_test, y_train, y_dev_test
138
139			else:
140			X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
141			test_size=test_size/(dev_size+test_size),
142			random_state=random_state,
143			stratify=y_dev_test)
144			return X_train, X_dev, X_test, y_train, y_dev, y_test
145

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 1ffeda...94eb5d )

klib.preprocess.train_dev_test_split() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like