# function to drop empty rows and columns based on thresholds and reindex?
# setting for row-wise, column-wise or both (e.g. dropping rows might make little sense in a time series)

# setting for a "hard drop" (if NaN in this field, drop the row/column) --> field must exist --> consider imputation

# list all dropped columns and rows and provide a before/after summary of shape, memory usage, etc.
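# A rough, commented sketch of such a helper (the names drop_missing, drop_threshold_cols/_rows
# and hard_drop_cols are placeholders, not a final API); a column/row is dropped once its
# NaN-fraction reaches the respective threshold:
# def drop_missing(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9, hard_drop_cols=None):
#     data = data.copy()
#     if hard_drop_cols:
#         # "hard drop": remove rows with NaN in columns that must exist
#         data = data.dropna(subset=hard_drop_cols)
#     # drop columns whose NaN-fraction is at or above the threshold
#     data = data.loc[:, data.isna().mean() < drop_threshold_cols]
#     # drop rows whose NaN-fraction is at or above the threshold, then reindex
#     data = data.loc[data.isna().mean(axis=1) < drop_threshold_rows, :].reset_index(drop=True)
#     return data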

# drop further columns and rows based on criteria
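# A possible commented sketch (the column names and query string are made up for illustration):
# # drop columns by name
# data = data.drop(columns=['id', 'comments'], errors='ignore')
# # drop rows by a boolean condition or a query
# data = data[data['age'].between(0, 120)]
# data = data.query("status != 'cancelled'")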

# deal with outliers --> Outlier models? Possible options? Default values?
# list possible outliers based on standard deviation
# winsorize?
# quantile based

# Dropping the outlier rows with percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# Capping the outlier rows with percentiles (capping the data at a certain value instead of dropping)
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data.loc[data['column'] > upper_lim, 'column'] = upper_lim
# data.loc[data['column'] < lower_lim, 'column'] = lower_lim
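# A commented sketch of the standard-deviation approach and of winsorizing ('column' and the
# factor of 3 are illustrative defaults, not settled choices):
# import numpy as np
# from scipy.stats.mstats import winsorize

# # flag values that lie more than 3 standard deviations from the mean (z-score)
# z_scores = (data['column'] - data['column'].mean()) / data['column'].std()
# print(data.loc[z_scores.abs() > 3, 'column'])

# # winsorize: replace the lowest and highest 5% of values instead of dropping them
# data['column'] = np.asarray(winsorize(data['column'].to_numpy(), limits=(0.05, 0.05)))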

# imputation methods
# column-wise or row-wise?
# mean
# median - more robust to outliers
# ...
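# A commented sketch of simple column-wise imputation (sklearn's SimpleImputer is one option;
# the column selection and the strategy are only examples):
# from sklearn.impute import SimpleImputer

# num_cols = data.select_dtypes(include='number').columns
# imputer = SimpleImputer(strategy='median')  # or 'mean', 'most_frequent', 'constant'
# data[num_cols] = imputer.fit_transform(data[num_cols])

# # plain pandas alternative for a single column
# data['column'] = data['column'].fillna(data['column'].median())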

# further feature engineering

# extract information from datetimes
# create features for year, month, day, weekday, weekend, day of the week, holiday, ...

# import pandas as pd
# from datetime import date

# data = pd.DataFrame({'date':
#                      ['01-01-2017',
#                       '04-12-2008',
#                       '23-06-1988',
#                       '25-08-1999',
#                       '20-02-1993',
#                       ]})

# # Transform string to date
# data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# # Extract the year
# data['year'] = data['date'].dt.year

# # Extract the month
# data['month'] = data['date'].dt.month

# # Extract the number of years passed since the date
# data['passed_years'] = date.today().year - data['date'].dt.year

# # Extract the number of months passed since the date
# data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# # Extract the weekday name of the date
# data['day_name'] = data['date'].dt.day_name()

#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday
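# The weekend and holiday features mentioned above are still missing from the example; a possible
# commented sketch (the US federal holiday calendar is just one assumption, any calendar works):
# from pandas.tseries.holiday import USFederalHolidayCalendar

# # Saturday and Sunday have dayofweek values 5 and 6
# data['is_weekend'] = data['date'].dt.dayofweek >= 5

# # flag dates that fall on a (US federal) holiday
# holidays = USFederalHolidayCalendar().holidays(start=data['date'].min(), end=data['date'].max())
# data['is_holiday'] = data['date'].isin(holidays)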

# binning (give option to choose features to bin and how)
# binning of numerical variables
# binning of categorical data
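# A commented sketch of numerical binning with pandas (column names, bin edges and labels are
# placeholders); a categorical binning example follows below:
# # fixed-width bins with explicit edges and labels
# data['age_group'] = pd.cut(data['age'], bins=[0, 18, 35, 65, 120],
#                            labels=['child', 'young', 'adult', 'senior'])
# # quantile-based bins with (roughly) equal counts per bin
# data['income_quartile'] = pd.qcut(data['income'], q=4, labels=['q1', 'q2', 'q3', 'q4'])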

# Categorical binning example
# import numpy as np

#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil

# conditions = [
#     data['Country'].str.contains('Spain'),
#     data['Country'].str.contains('Italy'),
#     data['Country'].str.contains('Chile'),
#     data['Country'].str.contains('Brazil')]

# choices = ['Europe', 'Europe', 'South America', 'South America']

# data['Continent'] = np.select(conditions, choices, default='Other')

#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

# encode dummies from categorical features using sklearn one-hot encoding (check intercept, regularization, etc.) and describe why sklearn is preferable to pd.get_dummies
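# One reason to prefer sklearn's OneHotEncoder over pd.get_dummies: the fitted encoder stores the
# category levels and can be reapplied consistently to new/unseen data at prediction time.
# A commented sketch (cat_cols and drop='first' are illustrative choices, e.g. to avoid the dummy
# variable trap when a model includes an intercept):
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder

# cat_cols = data.select_dtypes(include=['object', 'category']).columns
# # sparse_output requires scikit-learn >= 1.2 (older versions use sparse=False)
# encoder = OneHotEncoder(drop='first', sparse_output=False)
# dummies = pd.DataFrame(encoder.fit_transform(data[cat_cols]),
#                        columns=encoder.get_feature_names_out(cat_cols),
#                        index=data.index)
# data = pd.concat([data.drop(columns=cat_cols), dummies], axis=1)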

# export / save the "clean" data?
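# A possible commented sketch (file names and formats are placeholders; parquet preserves dtypes,
# csv does not):
# data.to_parquet('data_clean.parquet')
# data.to_csv('data_clean.csv', index=False)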


def convert_datatypes(data, category=True, cat_threshold=0.05, exclude=[]):
    '''
    Convert columns to the best possible dtypes using dtypes supporting pd.NA.

    Columns whose ratio of unique values to rows is below cat_threshold are
    additionally converted to the 'category' dtype, unless listed in exclude.
    '''
    data = data.copy()
    for col in data.columns:
        data[col] = data[col].convert_dtypes()
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if category and unique_vals_ratio < cat_threshold and col not in exclude:
            data[col] = data[col].astype('category')
    return data


def memory_usage(data):
    '''
    Total memory usage of the DataFrame in kilobytes.
    '''
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)
    return memory_usage