Conditions | 3 |
Total Lines | 82 |
Code Lines | 31 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments inside a method's body, this is usually a good sign that the commented part should be extracted into a new method; the comment is then a useful starting point for choosing a good name for that method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
1 | ''' |
||
def _print_dataset_overview(title, data, mv_total, mem_usage):
    '''
    Print dtype counts, shape, total missing values and memory usage of a dataset under a title line.

    Parameters
    ----------
    title: str
        Heading printed above the overview (a colon and a blank line are appended).

    data: Pandas DataFrame
        Dataset whose dtypes and shape are reported.

    mv_total: total number of missing values in data, as precomputed by the caller.

    mem_usage: memory usage of data in KB, as precomputed by the caller.

    Returns
    -------
    None
    '''

    print(f'{title}:\n')
    print(f'dtypes:\n{data.dtypes.value_counts()}')
    print(f'\nNumber of rows: {data.shape[0]}')
    print(f'Number of cols: {data.shape[1]}')
    print(f"Missing values: {mv_total}")
    print(f'Memory usage: {mem_usage} KB')
    print('_______________________________________________________\n')


def data_cleaning(data, drop_threshold_cols=0.95, drop_threshold_rows=0.95, category=True,
                  cat_threshold=0.03, cat_exclude=None, show='changes'):
    '''
    Perform initial data cleaning tasks on a dataset, such as dropping empty rows and columns and optimizing the \
    datatypes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    drop_threshold_cols: float, default 0.95
        Drop columns with NA-ratio above the specified threshold.

    drop_threshold_rows: float, default 0.95
        Drop rows with NA-ratio above the specified threshold.

    category: bool, default True
        Change dtypes of columns to "category". Set threshold using cat_threshold.

    cat_threshold: float, default 0.03
        Ratio of unique values below which categories are inferred and column dtype is changed to categorical.

    cat_exclude: list, default None
        List of columns to exclude from categorical conversion. None is treated as an empty list.

    show: {'all', 'changes', None} default 'changes'
        Specify verbosity of the output.
        * 'all': Print information about the data before and after cleaning as well as information about changes.
        * 'changes': Print out differences in the data before and after cleaning.
        * None: no information about the data is printed.
        Any other value is treated like None.

    Returns
    -------
    Pandas DataFrame.

    See Also
    --------
    convert_datatypes: Converts columns to best possible dtypes.
    drop_missing : Flexibly drops columns and rows.
    _memory_usage: Gives the total memory usage in kilobytes.
    _missing_vals: Metrics about missing values in the dataset.

    Notes
    -----
    The category dtype is not grouped in the summary, unless it contains exactly the same categories.

    '''

    # A literal [] default would be one list object shared by every call; build a fresh one instead.
    cat_exclude = [] if cat_exclude is None else cat_exclude

    data = pd.DataFrame(data)
    data_cleaned = drop_missing(data, drop_threshold_cols, drop_threshold_rows)
    data_cleaned = convert_datatypes(data_cleaned, category=category, cat_threshold=cat_threshold,
                                     cat_exclude=cat_exclude)

    # Skip all metric computation when no report is requested.
    if show not in ['changes', 'all']:
        return data_cleaned

    # Compute each metric once; every figure is reported in more than one line below.
    mv_before = _missing_vals(data)['mv_total']
    mv_after = _missing_vals(data_cleaned)['mv_total']
    mem_before = _memory_usage(data)
    mem_after = _memory_usage(data_cleaned)

    if show == 'all':
        _print_dataset_overview('Before data cleaning', data, mv_before, mem_before)
        _print_dataset_overview('After data cleaning', data_cleaned, mv_after, mem_after)

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {mv_after}")
    print('\nChanges:')
    print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}')
    print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}')
    print(f"Dropped missing values: {mv_before-mv_after}")
    mem_change = mem_before-mem_after
    # Guard the ratio so a zero reported footprint cannot raise ZeroDivisionError.
    mem_percent = round(100*mem_change/mem_before, 1) if mem_before else 0.0
    print(f'Reduced memory by: {round(mem_change,2)} KB (-{mem_percent}%)')

    return data_cleaned
||
176 |