klib.preprocess - Code Metrics - Inspection of "add train_dev_test_split and input checks" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 7c1057...9930c8 )

by Andreas

created 2020-04-26 16:25 UTC

klib.preprocess A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	150
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	56
dl	0
loc	150
rs	10
c	0
b	0
f	0
wmc	14

2 Functions

Rating	Name	Duplication	Size	Complexity
C	mv_col_handler()	0	67	9
B	train_dev_test_split()	0	61	5

'''
Functions for data preprocessing.

:author: Andreas Kanz

'''

# Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from .describe import corr_mat
from .utils import _missing_vals
from .utils import _validate_input_int
from .utils import _validate_input_range


def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
    '''
    Converts columns with a high ratio of missing values into binary features and eventually drops them based on
    their correlation with other features and the target variable. This function follows a three step process:
    - 1) Identify features with a high ratio of missing values and replace them by a binary "value present" flag.
    - 2) Identify high correlations of these features among themselves and with other features in the dataset.
    - 3) Features with a high ratio of missing values and a high correlation to other features are dropped unless
         they correlate reasonably well with the target variable.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    target: string, list, np.array or pd.Series, default None
        Target used in step 3. A string is interpreted as the name of a column in data (that column is never
        binarized or dropped). A list or np.array must have one entry per row of data. A pd.Series is aligned
        with data on its index. If None, step 3 is skipped and every feature identified in step 2 is dropped.

    mv_threshold: float, default 0.1
        Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are
        candidates for dropping and undergo further analysis.

    corr_thresh_features: float, default 0.6
        Value between 0 <= threshold <= 1. Maximum absolute correlation a previously identified feature with a
        high mv-ratio is allowed to have with another feature. If this threshold is overstepped, the feature
        undergoes further analysis.

    corr_thresh_target: float, default 0.3
        Value between 0 <= threshold <= 1. Minimum required absolute correlation of a remaining feature (i.e. a
        feature with a high mv-ratio and a high correlation to another existing feature) with the target. If this
        threshold is not met the feature is ultimately dropped.

    Returns
    -------
    data: Updated Pandas DataFrame
    drop_cols: List of dropped columns
    '''

    # Validate Inputs
    _validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
    _validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
    _validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)

    data = pd.DataFrame(data).copy()

    # Resolve the target into a Series up front, before any column is overwritten by its binary flag.
    if target is None:
        target_data = None
    elif isinstance(target, str):
        target_data = data[target]
    elif isinstance(target, pd.Series):
        target_data = target
    else:
        # Lists / arrays carry no index, so they are matched to the rows of data by position.
        target_data = pd.Series(np.asarray(target), index=data.index)

    mv_ratios = _missing_vals(data)['mv_cols_ratio']
    cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
    if isinstance(target, str):
        # The label column itself must not be turned into a flag or become a drop candidate.
        cols_mv = [col for col in cols_mv if col != target]

    # 1 where a value is present, 0 where it is missing. The result is always numeric, so the flag columns
    # take part in the correlation computations below regardless of the original column dtype.
    data_mv_binary = data[cols_mv].notna().astype(int)
    for col in cols_mv:
        data[col] = data_mv_binary[col]

    high_corr_features = []
    data_temp = data.copy()
    for col in cols_mv:
        # Recomputed on every pass because data_temp shrinks whenever a feature is flagged.
        corrmat = corr_mat(data_temp, colored=False)
        strongest = abs(corrmat[col]).nlargest(2)
        # The largest entry is expected to be the self-correlation, so the second one is the strongest link to
        # another feature. Positional access is required: "[1]" would be a label lookup on integer column names.
        if len(strongest) > 1 and strongest.iloc[1] > corr_thresh_features:
            high_corr_features.append(col)
            data_temp = data_temp.drop(columns=[col])

    if target_data is None:
        # Without a target every highly correlated mv-feature is removed; report them as dropped.
        drop_cols = high_corr_features
        data = data_temp
    else:
        # Keep features that are informative about the target in either direction (hence abs). A NaN
        # correlation compares False and therefore keeps the feature.
        drop_cols = [col for col in high_corr_features
                     if abs(data_mv_binary[col].corr(target_data)) < corr_thresh_target]
        data = data.drop(columns=drop_cols)

    return data, drop_cols


def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=1234):
    '''
    Split a dataset and a label column into train, dev and test sets.

    Parameters:
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If target is given as a string, data is coerced
    into a Pandas DataFrame and the target column is removed from the returned feature sets.

    target: string, list, np.array or pd.Series
        Labels belonging to data. A string is interpreted as the name of the label column in data. A list,
        np.array or pd.Series must contain one label per row of data.

    dev_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev
        split.

    test_size: float, default 0.1
        If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test
        split.

    stratify: target column, default None
        If not None, data is split in a stratified fashion, using the input as the class labels. The split of
        the held-out part into dev and test is then stratified on the held-out labels. If None, neither split
        is stratified.

    random_state: integer
        Random_state is the seed used by the random number generator.

    Returns
    -------
    tuple: Tuple containing train-dev-test split of inputs.
        (X_train, X_dev, X_test, y_train, y_dev, y_test) if both dev_size and test_size are non-zero,
        otherwise (X_train, X_dev_test, y_train, y_dev_test) with a single held-out set.

    Raises
    ------
    TypeError: If target is not a string, list, np.array or pd.Series.
    '''

    # Validate Inputs
    _validate_input_int(random_state, 'random_state')
    _validate_input_range(dev_size, 'dev_size', 0, 1)
    _validate_input_range(test_size, 'test_size', 0, 1)

    if isinstance(target, str):
        # Column lookup by name needs a DataFrame; other 2D inputs are coerced first.
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)
        target_data = data[target]
        data = data.drop(target, axis=1)

    elif isinstance(target, (list, pd.Series, np.ndarray)):
        target_data = pd.Series(target)

    else:
        # Fail early instead of passing an empty label set on to train_test_split.
        raise TypeError('target must be a string, list, np.array or pd.Series, got '
                        + type(target).__name__ + '.')

    X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
                                                                test_size=dev_size+test_size,
                                                                random_state=random_state,
                                                                stratify=stratify)

    # Only one held-out set was requested, so no second split is needed.
    if (dev_size == 0) or (test_size == 0):
        return X_train, X_dev_test, y_train, y_dev_test

    # Stratify the dev/test split only if stratification was requested for the first split; forcing it would
    # fail for continuous targets and for classes with a single held-out sample.
    stratify_dev_test = y_dev_test if stratify is not None else None

    X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
                                                    test_size=test_size/(dev_size+test_size),
                                                    random_state=random_state,
                                                    stratify=stratify_dev_test)
    return X_train, X_dev, X_test, y_train, y_dev, y_test


1			'''
2			Functions for data preprocessing.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import numpy as np
10			import pandas as pd
11
12			from sklearn.model_selection import train_test_split
13
14			from .describe import corr_mat
15			from .utils import _missing_vals
16			from .utils import _validate_input_int
17			from .utils import _validate_input_range
18
19
20			def mv_col_handler(data, target=None, mv_threshold=0.1, corr_thresh_features=0.6, corr_thresh_target=0.3):
21			'''
22			Converts columns with a high ratio of missing values into binary features and eventually drops them based on \
23			their correlation with other features and the target variable. This function follows a three step process:
24			- 1) Identify features with a high ratio of missing values
25			- 2) Identify high correlations of these features among themselves and with other features in the dataset.
26			- 3) Features with high ratio of missing values and high correlation among each other are dropped unless \
27			they correlate reasonably well with the target variable.
28
29			Parameters
30			----------
31			data: 2D dataset that can be coerced into Pandas DataFrame.
32
33			target: string, list, np.array or pd.Series, default None
34			Specify target for correlation. E.g. label column to generate only the correlations between each feature \
35			and the label.
36
37			mv_threshold: float, default 0.1
38			Value between 0 <= threshold <= 1. Features with a missing-value-ratio larger than mv_threshold are candidates \
39			for dropping and undergo further analysis.
40
41			corr_thresh_features: float, default 0.6
42			Value between 0 <= threshold <= 1. Maximum correlation a previously identified features with a high mv-ratio is\
43			allowed to have with another feature. If this threshold is overstepped, the feature undergoes further analysis.
44
45			corr_thresh_target: float, default 0.3
46			Value between 0 <= threshold <= 1. Minimum required correlation of a remaining feature (i.e. feature with a \
47			high mv-ratio and high correlation to another existing feature) with the target. If this threshold is not met \
48			the feature is ultimately dropped.
49
50			Returns
51			-------
52			data: Updated Pandas DataFrame
53			drop_cols: List of dropped columns
54			'''
55
56			# Validate Inputs
57			_validate_input_range(mv_threshold, 'mv_threshold', 0, 1)
58			_validate_input_range(corr_thresh_features, 'corr_thresh_features', 0, 1)
59			_validate_input_range(corr_thresh_target, 'corr_thresh_target', 0, 1)
60
61			data = pd.DataFrame(data).copy()
62			mv_ratios = _missing_vals(data)['mv_cols_ratio']
63			cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
64			data_mv_binary = data[cols_mv].applymap(lambda x: 1 if not pd.isnull(x) else x).fillna(0)
65
66			for col in cols_mv:
67			data[col] = data_mv_binary[col]
68
69			high_corr_features = []
70			data_temp = data.copy()
71			for col in cols_mv:
72			corrmat = corr_mat(data_temp, colored=False)
73			if abs(corrmat[col]).nlargest(2)[1] > corr_thresh_features:
74			high_corr_features.append(col)
75			data_temp = data_temp.drop(columns=[col])
76
77			drop_cols = []
78			if target is None:
79			data = data_temp
80			else:
81			for col in high_corr_features:
82			if pd.DataFrame(data_mv_binary[col]).corrwith(target)[0] < corr_thresh_target:
83			drop_cols.append(col)
84			data = data.drop(columns=[col])
85
86			return data, drop_cols
87
88
89			def train_dev_test_split(data, target, dev_size=0.1, test_size=0.1, stratify=None, random_state=1234):
90			'''
91			Split a dataset and a label column into train, dev and test sets.
92
93			Parameters:
94			----------
95
96			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
97			information is used to label the plots.
98
99			target: string, list, np.array or pd.Series, default None
100			Specify target for correlation. E.g. label column to generate only the correlations between each feature \
101			and the label.
102
103			dev_size: float, default 0.1
104			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the dev \
105			split.
106
107			test_size: float, default 0.1
108			If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test \
109			split.
110
111			stratify: target column, default None
112			If not None, data is split in a stratified fashion, using the input as the class labels.
113
114			random_state: integer
115			Random_state is the seed used by the random number generator.
116
117			Returns
118			-------
119			tuple: Tuple containing train-dev-test split of inputs.
120			'''
121
122			# Validate Inputs
123			_validate_input_int(random_state, 'random_state')
124			_validate_input_range(dev_size, 'dev_size', 0, 1)
125			_validate_input_range(test_size, 'test_size', 0, 1)
126
127			target_data = []
128			if isinstance(target, str):
129			target_data = data[target]
130			data = data.drop(target, axis=1)
131
132			elif isinstance(target, (list, pd.Series, np.ndarray)):
133			target_data = pd.Series(target)
134			target = target.name
135
136			X_train, X_dev_test, y_train, y_dev_test = train_test_split(data, target_data,
137			test_size=dev_size+test_size,
138			random_state=random_state,
139			stratify=stratify)
140
141			if (dev_size == 0) or (test_size == 0):
142			return X_train, X_dev_test, y_train, y_dev_test
143
144			else:
145			X_dev, X_test, y_dev, y_test = train_test_split(X_dev_test, y_dev_test,
146			test_size=test_size/(dev_size+test_size),
147			random_state=random_state,
148			stratify=y_dev_test)
149			return X_train, X_dev, X_test, y_train, y_dev, y_test
150

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 7c1057...9930c8 )

klib.preprocess A

Complexity

Size/Duplication

Importance

2 Functions

Duplication Side-by-Side

Filter issues like