import pandas as pd
import numpy as np  # needed by binned_indices (np.digitize)
from itertools import product
from functools import reduce

# __all__ = []


def split_attributes(dataframe):
    """Return the categorical and numerical columns/attributes of the given dataframe."""
    numerical = dataframe._get_numeric_data().columns.values
    return list(set(dataframe.columns) - set(numerical)), numerical
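
# Usage sketch (assumes a loaded dataframe such as a Titanic 'train_data'; names are illustrative):
# categorical, numerical = split_attributes(train_data)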


def missing_values(dataframe):
    """Return a dict mapping each column that has missing values to its count of missing values."""
    return {k: v for k, v in dataframe.isnull().sum().to_dict().items() if v != 0}
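
# Usage sketch (hypothetical 'train_data'; on the Titanic train set this returns something like
# {'Age': 177, 'Cabin': 687, 'Embarked': 2} - counts are illustrative):
# missing_values(train_data)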

# train_data.describe(include=['O'])


def drop_columns(dataframe, *columns):
    """Remove the given columns from the given dataframe and return a new dataframe reference."""
    return dataframe.drop([*columns], axis=1)


def add_column(dataframe, name, values):
    """Add a new column with the given values and return a new dataframe reference."""
    return dataframe.assign(**{name: values})

########

def bin_column(column_ref, nb_bins):
    """Bin the given column (Series) into nb_bins equal-width intervals."""
    return pd.cut(column_ref, nb_bins)


def qbin_column(column_ref, nb_bins):
    """Bin the given column (Series) into nb_bins quantile-based (roughly equal-sized) intervals."""
    return pd.qcut(column_ref, nb_bins)


def string_map(column_ref, strings, target_form):
    """Replace values found in the 'strings' list with the target form, given a column (i.e. Series) reference, and return the reference."""
    return column_ref.replace(strings, target_form)

####

def add_bin_for_continuous(dataframe, column, new_column, nb_bins):
    """Add a column of equal-width bins computed from a continuous column and return a new dataframe reference."""
    return add_column(dataframe, new_column, list(bin_column(dataframe[column], nb_bins)))


def add_reg(dataframe, name, regex, target):
    """Add a new column by applying a regex extractor to an existing column and return a new dataframe reference."""
    return add_column(dataframe, name, list(dataframe[target].str.extract(regex, expand=False)))
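
# Usage sketch (assumes a Titanic-style 'Name' column; the hypothetical regex captures the title word before a dot):
# train_data = add_reg(train_data, 'Title', r' ([A-Za-z]+)\.', 'Name')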


def map_replace_string(dataframe, column, norm):
    """Replace strings with their normalized form, given the input mapping.

    Example input:
        norm = {
            'Rare': ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
            'Miss': ['Mlle', 'Ms'],
            'Mrs': ['Mme']
        }
    """
    for k, v in norm.items():
        dataframe[column] = string_map(dataframe[column], v, k)
    return dataframe
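
# Usage sketch (assumes the 'Title' column added above; mapping trimmed for illustration):
# train_data = map_replace_string(train_data, 'Title', {'Miss': ['Mlle', 'Ms'], 'Mrs': ['Mme']})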


###############################
def df_map(a_callable, dataframes, *args, **kwargs):
    """Apply the callable to each dataframe in the given iterable and return the list of results."""
    return [a_callable(x, *args, **kwargs) for x in dataframes]
#########################

# DROP 'Ticket' and 'Cabin' columns
# train_data, test_data = df_map(drop_columns, [train_data, test_data], 'Ticket', 'Cabin')

def complete_categorical_with_most_freq(dataframe, column):
    """Fill missing values in a categorical column with its most frequent value and return a new dataframe reference."""
    return dataframe.assign(**{column: dataframe[column].fillna(dataframe[column].dropna().mode()[0])})


def complete_numerical_with_median(dataframe, column):
    """Fill missing values in a numerical column with its median and return a new dataframe reference."""
    return dataframe.assign(**{column: dataframe[column].fillna(dataframe[column].dropna().median())})
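
# Usage sketch (Titanic column names, assumed for illustration):
# train_data = complete_categorical_with_most_freq(train_data, 'Embarked')
# train_data = complete_numerical_with_median(train_data, 'Fare')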


class MedianFiller:
    def __call__(self, dataframe, column, columns):
        """
        Fill missing values in a dataframe's column according to the medians computed on groups defined by correlated columns.

        :param pandas.DataFrame dataframe: dataframe to complete
        :param str column: column with missing values
        :param list columns: correlated columns
        :return: a dataframe reference with the column completed
        """
        for vector in product(*[list(dataframe[c].unique()) for c in columns]):
            self._set_value(dataframe, column, self._condition(dataframe, columns, vector))
        return dataframe.assign(**{column: dataframe[column].astype(int)})

    def _set_value(self, dataframe, column, condition):
        dataframe.loc[(dataframe[column].isnull()) & condition, column] = self._convert(
            dataframe[condition][column].dropna().median())

    def _condition(self, dataframe, columns, values_vector):
        # build a boolean mask requiring each correlated column to equal its value in the vector
        return reduce(lambda i, j: i & j, [dataframe[c] == values_vector[e] for e, c in enumerate(columns)])

    def _convert(self, value):
        return value
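
# Usage sketch (assumes 'Age' is missing values and correlates with 'Sex' and 'Pclass', as in the Titanic data):
# train_data = MedianFiller()(train_data, 'Age', ['Sex', 'Pclass'])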


def create_column(dataframe, name, a_callable):
    """Add a column computed by applying the callable to the dataframe and return a new dataframe reference."""
    return dataframe.assign(**{name: a_callable(dataframe)})
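
# Usage sketch ('FamilySize' from 'SibSp' + 'Parch' appears in the feature lists below; assumed here for illustration):
# train_data = create_column(train_data, 'FamilySize', lambda df: df['SibSp'] + df['Parch'] + 1)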

# CREATE 5 BINS for the 'Age' column (discretize) and add a column to the 'train' data
# train_data = train_data.assign(**{'AgeBand': pd.cut(train_data.Age.astype(int), 5)})


def add_qbin(dataframe, target, nb_bins, destination):
    """Create a column (with the 'destination' name) of quantile-based bins of the continuous variable given in the target column."""
    return dataframe.assign(**{destination: pd.qcut(dataframe[target], nb_bins)})
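
# Usage sketch (four fare quantiles, mirroring the 'AgeBand' example above):
# train_data = add_qbin(train_data, 'Fare', 4, 'FareBand')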


def binned_indices(values, left_boundaries):
    """Return an array with the index of the bin each of the given values belongs to, based on the input boundaries.

    If values are beyond the bounds of `left_boundaries`, 0 or ``len(left_boundaries)`` is returned as appropriate."""
    return np.digitize(values, left_boundaries)
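
# Sketch of the boundary behaviour (values are illustrative):
# binned_indices([3, 11, 25], [0, 10, 20])  # -> array([1, 2, 3]); a value below 0 would map to 0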


def _map(intervals_list):
    """Return a dictionary mapping Interval objects to numerical codes (0, 1, ..).

    Assumes that the input intervals list is sorted."""
    return {interval_obj: index for index, interval_obj in enumerate(intervals_list)}


## operations for dfs with constructed bins/bands
def encode_bands(dataframe, target_column, intervals_list, destination_column):
    """Encode a column of Interval objects into numerical codes (0, 1, ..) in the destination column and return a new dataframe reference.

    Assumes that the input intervals list is sorted."""
    return dataframe.assign(**{destination_column: dataframe[target_column].map(_map(intervals_list)).astype(int)})
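
# Usage sketch (assumes an 'AgeBand' column of pd.cut Intervals, as in the 'AgeBand' example above):
# train_data = encode_bands(train_data, 'AgeBand', sorted(train_data['AgeBand'].unique()), 'Age_Code')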


def encode_bands_many(dataframe, targets, intervals_lists, destinations):
    """Encode multiple Interval columns into numerical codes, one destination column per target column."""
    return dataframe.assign(**{dest_c: dataframe[target_c].map(_map(intervals_list)).astype(int)
                               for target_c, intervals_list, dest_c in zip(targets, intervals_lists, destinations)})


def encode_continuous(dataframe, target_column, intervals_list, destination_column):
    """Encode a continuous column into the code of the interval each value falls into and return a new dataframe reference."""
    return dataframe.assign(
        **{destination_column: binned_indices(dataframe[target_column], [x.left for x in intervals_list]) - 1})


def encode_continuous_many(dataframe, targets, intervals_lists, destinations):
    """Encode multiple continuous columns against their interval lists, one destination column per target column."""
    return dataframe.assign(**{dest_c: binned_indices(dataframe[target_c], [x.left for x in intervals_list]) - 1
                               for target_c, intervals_list, dest_c in zip(targets, intervals_lists, destinations)})


def _op_gen(dataframe, columns, band_str='Band', post_str='_Code'):
    """Yield an encode_bands_many operation once, then encode_continuous_many operations forever."""
    interval_lists = [sorted(dataframe[c + band_str].unique()) for c in columns]
    coded = ['{}{}'.format(c, post_str) for c in columns]
    yield lambda x: encode_bands_many(x, [c + band_str for c in columns], interval_lists, coded)
    while True:
        yield lambda x: encode_continuous_many(x, columns, interval_lists, coded)


#### CONSTANTS #####
POST_STR = '_Code'  # postfix string for encoded variables
BAND_STR = 'Band'

#### SETTINGS ###
# PICK columns with categorical variables to encode with sklearn LabelEncoder
TO_ENCODE_WITH_SKLEARN = ['Embarked', 'Sex', 'Title']

TO_ENCODE_WITH_INTERVALS = ['Age', 'Fare']


def label_encode(dataframe, columns, encode_callback, code_str='_Code'):
    """Encode the given categorical columns with the callback, adding one '<column><code_str>' column per input column."""
    return dataframe.assign(**{c + code_str: encode_callback(dataframe[c]) for c in columns})

# from sklearn.preprocessing import LabelEncoder
# train_data, test_data = df_map(label_encode,
#                                [train_data, test_data],
#                                ['Embarked', 'Sex', 'Title'],
#                                LabelEncoder().fit_transform)  # encodes categorical objects into indices starting from 0

# ENCODE 'Age' and 'Fare' by creating the 'Age_Code' and 'Fare_Code' columns in 'train_data' and 'test_data':
# op_gen = _op_gen(train_data, ['Age', 'Fare'], band_str='Band', post_str='_Code')
# train_data, test_data = [next(op_gen)(df) for df in [train_data, test_data]]


#### SELECTION
#
# # define y variable aka target/outcome
# Target = ['Survived']
#
# # FEATURE SELECTION
# # define variables (original and encoded)
# feature_titles = ['Sex', 'Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']  # pretty names/values for charts
# feature_names = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'SibSp', 'Parch', 'Age', 'Fare']  # coded for algorithm calculation
# # data1_xy = Target + data1_x
# # print('Original X Y: ', data1_xy, '\n')
#
#
# # define x variables for original w/bin variables to remove continuous variables
# data1_x_bin = ['Sex_Code', 'Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'Age_Code', 'Fare_Code']
# # data1_xy_bin = Target + data1_x_bin
# # print('Bin X Y: ', data1_xy_bin, '\n')
#
#
# # define x and y variables for dummy variables original
# data1_dummy = pd.get_dummies(train_data[feature_titles])
# data1_x_dummy = data1_dummy.columns.tolist()
# # data1_xy_dummy = Target + data1_x_dummy
# # print('Dummy X Y: ', data1_xy_dummy, '\n')
#
#
# # SELECT variables
# numerical_feats = ['Pclass', 'Fare_Code', 'Age_Code', 'FamilySize']  # ordering makes sense: e.g. class_1 < class_2
# binary_feats = ['Sex_Code', 'IsAlone']
# categorical_feats = ['Embarked']
#
# assert all(all(x in train_data.columns for x in y) for y in [numerical_feats, binary_feats, categorical_feats])
#
# # convert e.g. column "color" that takes {'white', 'black'} as values to
# # 2 columns: 'color_white' and 'color_black' (that take 0 or 1)
# pd.get_dummies(train_data[categorical_feats]).head()
#
#
# X_train = pd.concat([train_data[numerical_feats + binary_feats], pd.get_dummies(train_data[categorical_feats])], axis=1)
# X_test = pd.concat([test_data[numerical_feats + binary_feats], pd.get_dummies(test_data[categorical_feats])], axis=1)
# X_train.head()


def get_df_from_json(json_path):
    """Read a dataframe from a json file."""
    return pd.read_json(path_or_buf=json_path)


def from_csv(file_path):
    """Read a dataframe from a csv file."""
    return pd.read_csv(file_path)