klib.preprocess.train_dev_test_split() - Code Metrics - Inspection of "remove circular dependency" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( bfbbb5...96b70c )

by Andreas

created 2020-04-28 14:32 UTC

klib.preprocess.train_dev_test_split() B

↳ Parent: klib.preprocess

Complexity

Conditions

Size

Total Lines	61
Code Lines	22

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	5
eloc	22
nop	6
dl	0
loc	61
rs	8.8853
c	0
b	0
f	0

How to fix Long Method

'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, RobustScaler

from .utils import _validate_input_int
from .utils import _validate_input_range


# Public API of this module: the names exported by a star import.
# ColumnSelector is defined below but is not listed here.
__all__ = ['train_dev_test_split',
           'cat_pipe',
           'num_pipe',
           'preprocessing_pipe']


def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: pd.DataFrame or other 2D dataset accepted by sklearn's train_test_split
        If target is given as a string, data must be a Pandas DataFrame, because the label column is selected
        and dropped by name.

    target: string, list, np.array or pd.Series
        If a string, the name of the label column in data; that column is removed from the returned feature
        sets. Otherwise the label values themselves, which are converted to a pd.Series.

    dev_size: float, default 0.1
        Proportion of the dataset to include in the dev split. Checked by _validate_input_range with the
        bounds 0 and 1.

    test_size: float, default 0.1
        Proportion of the dataset to include in the test split. Checked by _validate_input_range with the
        bounds 0 and 1.

    stratify: array-like, default None
        If not None, the train/hold-out split is stratified using the input as the class labels and the
        subsequent dev/test split is stratified on the held-out labels. If None, neither split is stratified.

    random_state: integer, default 408
        Seed passed to both calls of train_test_split.

    Returns
    -------
    tuple: If dev_size or test_size is 0, a 4-tuple (X_train, X_dev_test, y_train, y_dev_test) from a single
    split. Otherwise a 6-tuple (X_train, X_dev, X_test, y_train, y_dev, y_test).
    '''

    # Validate Inputs
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)
    _validate_input_int(random_state, 'random_state')

    target_data = []
    if isinstance(target, str):
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        # The former "target = target.name" lookup was removed: lists and arrays have no ".name" attribute
        # (AttributeError), and the value was never used afterwards.
        target_data = pd.Series(target)

    # First split: separate the training data from the combined dev + test hold-out.
    holdout_size = dev_size + test_size
    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=holdout_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    # Only one hold-out set was requested, so there is nothing left to divide.
    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Second split: divide the hold-out into dev and test. Stratify only if the caller asked for stratification;
    # forcing it unconditionally breaks on continuous targets and on classes with a single held-out member.
    holdout_stratify = y_dev_test if stratify is not None else None
    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size / holdout_size,
                                                    random_state=random_state,
                                                    stratify=holdout_stratify)
    return X_train, X_dev, X_test, y_train, y_dev, y_test


class ColumnSelector(BaseEstimator, TransformerMixin):
    '''
    Transformer that keeps either the numerical or the non-numerical columns of a DataFrame.

    Parameters:
    ----------
    num: bool, default True
        If truthy, transform() returns the columns detected as numerical, otherwise all remaining columns.
    '''

    def __init__(self, num=True):
        self.num = num

    def fit(self, X, y=None):
        '''Nothing is learned from the data; the transformer itself is returned.'''
        return self

    def transform(self, X, y=None):
        '''Return the subset of columns of X that matches the "num" setting.'''
        # The dtype detection runs on a helper copy: missing values are filled with the first row of X.mode()
        # and the dtypes are converted with convert_dtypes(). The returned data is taken from the untouched X.
        inferred = X.fillna(X.mode().iloc[0]).convert_dtypes()

        dtype_filter = {'include': ['number']} if self.num else {'exclude': ['number']}
        selected_columns = inferred.select_dtypes(**dtype_filter).columns.tolist()
        return X[selected_columns]


def cat_pipe(imputer=SimpleImputer(strategy='most_frequent')):
    '''
    Set of standard preprocessing operations on categorical data.

    Parameters:
    ----------
    imputer: default SimpleImputer(strategy='most_frequent')
        Step placed between the column selection and the one-hot encoding.

    Returns
    -------
    Pipeline built by make_pipeline from: ColumnSelector(num=False), imputer, OneHotEncoder.
    '''

    # NOTE(review): the default imputer is created once, when this function is defined, so every call relying
    # on the default hands the same instance to make_pipeline - confirm that this sharing is intended.
    steps = (ColumnSelector(num=False),
             imputer,
             OneHotEncoder(handle_unknown='ignore'))
    return make_pipeline(*steps)


def num_pipe(imputer=IterativeImputer(
        estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), random_state=408),
        scaler=RobustScaler()):
    '''
    Set of standard preprocessing operations on numerical data.

    Parameters:
    ----------
    imputer: default IterativeImputer using an ExtraTreesRegressor estimator
        Step placed directly after the column selection.

    scaler: default RobustScaler()
        Final step of the pipeline.

    Returns
    -------
    Pipeline built by make_pipeline from: ColumnSelector(), imputer, scaler.
    '''

    # NOTE(review): both default objects are created once, when this function is defined, so every call
    # relying on the defaults hands the same instances to make_pipeline - confirm that this is intended.
    steps = (ColumnSelector(), imputer, scaler)
    return make_pipeline(*steps)


def preprocessing_pipe(num=True, cat=True):
    '''
    Set of standard preprocessing operations on numerical and categorical data.

    Parameters:
    ----------
    num: bool, default True
        Set to false if no numerical data is in the dataset.

    cat: bool, default True
        Set to false if no categorical data is in the dataset.

    Returns
    -------
    The union of num_pipe() and cat_pipe() if both flags are set, the single matching pipeline if only one
    flag is set, and None if neither is set.
    '''

    # Both kinds of data: run the two pipelines side by side and concatenate their outputs.
    if num and cat:
        return make_union(num_pipe(), cat_pipe(), n_jobs=4)

    if num:
        return num_pipe()

    if cat:
        return cat_pipe()

    # Neither kind of data was requested.
    return None


1			'''
2			Functions for data preprocessing.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import numpy as np
10			import pandas as pd
11
12			from sklearn.base import BaseEstimator, TransformerMixin
13			from sklearn.ensemble import ExtraTreesRegressor
14			from sklearn.experimental import enable_iterative_imputer # noqa
15			from sklearn.impute import SimpleImputer, IterativeImputer
16			from sklearn.model_selection import train_test_split
17			from sklearn.pipeline import make_pipeline, make_union
18			from sklearn.preprocessing import OneHotEncoder, RobustScaler
19
20			from .utils import _validate_input_int
21			from .utils import _validate_input_range
22
23
24			__all__ = ['train_dev_test_split',
25			'cat_pipe',
26			'num_pipe',
27			'preprocessing_pipe']
28
29
30			def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=408):
31			'''
32			Split a dataset and a label column into train, dev and test sets.
33
34			Parameters:
35			----------
36
37			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
38			information is used to label the plots.
39
40			target: string, list, np.array or pd.Series, default None
41			Specify target for correlation. E.g. label column to generate only the correlations between each feature \
42			and the label.
43
44			dev_size: float, default 0.1
45			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
46			split.
47
48			test_size: float, default 0.1
49			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
50			split.
51
52			stratify: target column, default None
53			If not None, data is split in a stratified fashion, using the input as the class labels.
54
55			random_state: integer, default 408
56			Random_state is the seed used by the random number generator.
57
58			Returns
59			-------
60			tuple: Tuple containing train-dev-test split of inputs.
61			'''
62
63			# Validate Inputs
64			_validate_input_range(dev_size, 'dev_size', 0, 1)
65			_validate_input_range(test_size, 'test_size', 0, 1)
66			_validate_input_int(random_state, 'random_state')
67
68			target_data = []
69			if isinstance(target, str):
70			target_data = data[target]
71			data = data.drop(target, axis=1)
72
73			elif isinstance(target, (list, pd.Series, np.ndarray)):
74			target_data = pd.Series(target)
75			target = target.name
76
77			X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
78			test_size=dev_size+test_size,
79			random_state=random_state,
80			stratify=stratify)
81
82			if (dev_size == 0) or (test_size == 0):
83			return X_train, X_dev_test, y_train, y_dev_test
84
85			else:
86			X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
87			test_size=test_size/(dev_size+test_size),
88			random_state=random_state,
89			stratify=y_dev_test)
90			return X_train, X_dev, X_test, y_train, y_dev, y_test
91
92
93			class ColumnSelector(BaseEstimator, TransformerMixin):
94			''''''
95
96			def __init__(self, num=True):
97			self.num = num
98
99			def fit(self, X, y=None):
100			return self
101
102			def transform(self, X, y=None):
103			temp = X.fillna(X.mode().iloc[0]).convert_dtypes()
104
105			if self.num:
106			return X[temp.select_dtypes(include=['number']).columns.tolist()]
107			else:
108			return X[temp.select_dtypes(exclude=['number']).columns.tolist()]
109
110
111			def cat_pipe(imputer=SimpleImputer(strategy='most_frequent')):
112			'''Set of standard preprocessing operations on categorical data.'''
113
114			cat_pipe = make_pipeline(ColumnSelector(num=False),
115			imputer,
116			OneHotEncoder(handle_unknown='ignore'))
117			return cat_pipe
118
119
120			def num_pipe(imputer=IterativeImputer(
121			estimator=ExtraTreesRegressor(n_estimators=25, n_jobs=4, random_state=408), random_state=408),
122			scaler=RobustScaler()):
123			'''Set of standard preprocessing operations on numerical data.'''
124
125			num_pipe = make_pipeline(ColumnSelector(),
126			(imputer),
127			(scaler))
128			return num_pipe
129
130
131			def preprocessing_pipe(num=True, cat=True):
132			'''Set of standard preprocessing operations on numerical and categorical data.
133
134			Parameters:
135			----------
136			num: bool, default True
137			Set to false if no numerical data is in the dataset.
138
139			cat: bool, default True
140			Set to false if no categorical data is in the dataset.
141			'''
142
143			pipe = None
144			if num and cat:
145			pipe = make_union(num_pipe(), cat_pipe(), n_jobs=4)
146
147			elif num:
148			pipe = num_pipe()
149
150			elif cat:
151			pipe = cat_pipe()
152
153			return pipe
154

akanz1 / klib

GitHub Access Token became invalid

Push — master ( bfbbb5...96b70c )

klib.preprocess.train_dev_test_split() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like