'''
Utilities for data cleaning.

:author: Andreas Kanz

'''

# Imports
import pandas as pd
import statsmodels


# function to drop empty rows and columns based on thresholds and reindex?
# setting for row-wise or column-wise or both to drop (i.e. it might make little sense to drop rows in a time series)

# setting for "hard drop" (if NaN in this field, drop the row/column) --> Must exist. --> Consider imputation

# list all dropped columns and rows and provide a before/after summary of shape, memory usage, etc.

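# A minimal sketch of the threshold-based dropping described above; the helper
# name drop_missing and its parameters are illustrative assumptions, not part of
# the existing API. The thresholds give the minimum share of non-missing values a
# column/row must have in order to be kept.
# def drop_missing(data, drop_threshold_cols=0.9, drop_threshold_rows=0.9):
#     data = pd.DataFrame(data).copy()
#     # keep columns with at least drop_threshold_cols share of non-NA values
#     data = data.dropna(axis=1, thresh=int(drop_threshold_cols * data.shape[0]))
#     # keep rows with at least drop_threshold_rows share of non-NA values
#     data = data.dropna(axis=0, thresh=int(drop_threshold_rows * data.shape[1]))
#     return data.reset_index(drop=True)
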
# drop further columns and rows based on criteria


# deal with outliers --> Outlier models? Possible options? Default values?
# list possible outliers based on standard deviation
# winsorize?
# quantile based

# Dropping the outlier rows with percentiles
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# Capping the outlier rows with percentiles (capping the data at a certain value)
# upper_lim = data['column'].quantile(.95)
# lower_lim = data['column'].quantile(.05)
# data.loc[data['column'] > upper_lim, 'column'] = upper_lim
# data.loc[data['column'] < lower_lim, 'column'] = lower_lim

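# A minimal sketch of the standard-deviation check mentioned above: flag values
# more than n_std standard deviations away from the column mean. The helper name
# and parameters are illustrative assumptions.
# def outliers_by_std(series, n_std=3):
#     mean, std = series.mean(), series.std()
#     return series[(series - mean).abs() > n_std * std]
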
# imputation methods
# column-wise or row-wise?
# mean
# median - more outlier resistant
# ...

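# A minimal sketch of the simple imputation strategies listed above, assuming a
# numeric column; the helper name and the strategy parameter are illustrative
# assumptions rather than an existing API.
# def impute_simple(series, strategy='median'):
#     fill_value = series.median() if strategy == 'median' else series.mean()
#     return series.fillna(fill_value)
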
# further feature engineering

# extract information from datetimes
# create features for year, month, day, weekday, weekend, day of the week, holiday, ...

# from datetime import date

# data = pd.DataFrame({'date':
#                      ['01-01-2017',
#                       '04-12-2008',
#                       '23-06-1988',
#                       '25-08-1999',
#                       '20-02-1993',
#                       ]})

# # Transform string to date
# data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# # Extracting year
# data['year'] = data['date'].dt.year

# # Extracting month
# data['month'] = data['date'].dt.month

# # Extracting passed years since the date
# data['passed_years'] = date.today().year - data['date'].dt.year

# # Extracting passed months since the date
# data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# # Extracting the weekday name of the date
# data['day_name'] = data['date'].dt.day_name()

#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday

# binning (give option to choose features to bin and how)
# binning of numerical variables
# binning of categorical data

# Categorical binning example
#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil

# conditions = [
#     data['Country'].str.contains('Spain'),
#     data['Country'].str.contains('Italy'),
#     data['Country'].str.contains('Chile'),
#     data['Country'].str.contains('Brazil')]

# choices = ['Europe', 'Europe', 'South America', 'South America']

# data['Continent'] = np.select(conditions, choices, default='Other')

#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

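# A minimal sketch of the numerical binning mentioned above using pd.cut; the
# column name 'age' and the bin edges/labels are illustrative assumptions.
# data['age_group'] = pd.cut(data['age'], bins=[0, 18, 65, 120],
#                            labels=['minor', 'adult', 'senior'])
# pd.qcut(data['age'], q=4) would instead create four equally populated bins.
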
# encode dummies from categorical features using sklearn's OneHotEncoder (check intercept, regularization etc.);
# provide a description of why sklearn is preferred over pd.get_dummies

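# A minimal sketch of one-hot encoding with scikit-learn; the column selection
# and parameters are illustrative assumptions. Unlike pd.get_dummies, a fitted
# OneHotEncoder can be reapplied consistently to unseen data.
# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder(handle_unknown='ignore')
# dummies = encoder.fit_transform(data[['Country']]).toarray()
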
# export / save "clean data"?


def convert_datatypes(data, category=True, cat_threshold=0.05, exclude=[]):
    '''
    Convert columns to the best possible dtypes using dtypes supporting pd.NA.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    category: bool, default True
        Change dtypes of columns to "category". Set the threshold using cat_threshold.

    cat_threshold: float, default 0.05
        Ratio of unique values below which the column dtype is changed to categorical.

    exclude: list, default [] (empty list)
        List of columns to exclude from categorical conversion.

    Returns:
    -------
    Pandas DataFrame.

    '''

    data = pd.DataFrame(data).copy()
    for col in data.columns:
        data[col] = data[col].convert_dtypes()
        unique_vals_ratio = data[col].nunique(dropna=False) / data.shape[0]
        if category and unique_vals_ratio < cat_threshold and col not in exclude:
            data[col] = data[col].astype('category')
    return data

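# Usage sketch (illustrative; the frame and column names are assumptions):
# df_converted = convert_datatypes(df, cat_threshold=0.05, exclude=['id'])

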
def memory_usage(data):  # --> describe.py & setup imports
    '''
    Total memory usage in kilobytes.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    Returns:
    -------
    memory_usage: float
    '''

    data = pd.DataFrame(data)
    memory_usage = round(data.memory_usage(index=True, deep=True).sum() / 1024, 2)
    return memory_usage


def missing_vals(data):  # --> describe.py & setup imports
    '''
    Total number of missing values in the dataset.

    Parameters:
    ----------
    data: Pandas DataFrame or numpy ndarray.

    Returns:
    -------
    missing_vals: int
    '''

    data = pd.DataFrame(data)
    missing_vals = data.isna().sum().sum()
    return missing_vals


def summary_stats(data):
    '''
    Print the shape of the dataset and return rounded descriptive statistics.
    '''
    rows = data.shape[0]
    cols = data.shape[1]
    describe = data.describe().round(1)
    # look for possible statistics (statsmodels)
    # no visualizations here

    # Shape
    print('Summary statistics:\n___________________\n')
    print(f'Number of rows: {rows}')
    print(f'Number of columns: {cols}\n___________________\n')
    return describe


def data_cleaning(data, showall=True):
    '''
    Initial data cleaning: convert dtypes and report memory usage before and after.
    '''
    data_cleaned = convert_datatypes(data)

    if showall:
        print(f'Memory usage before data cleaning: {memory_usage(data)} kilobytes.')
        print(f'Memory usage after data cleaning: {memory_usage(data_cleaned)} kilobytes.')

    return data_cleaned


# before: number of columns, number of rows, memory usage, number of NAs
# cleaning empty columns etc.
# after: number of columns, number of rows, memory usage, number of NAs
# improvement / changes

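# A minimal sketch of the before/after report outlined above, reusing the helpers
# defined in this module; the helper name and the exact wording are assumptions.
# def cleaning_report(before, after):
#     print(f'Shape: {before.shape} --> {after.shape}')
#     print(f'Memory usage: {memory_usage(before)} KB --> {memory_usage(after)} KB')
#     print(f'Missing values: {missing_vals(before)} --> {missing_vals(after)}')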