| Conditions | 5 | 
| Total Lines | 65 | 
| Code Lines | 32 | 
| Lines | 0 | 
| Ratio | 0 % | 
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method

If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
| 1 | '''  | 
            ||
| 50 | def _diff_report(data, data_cleaned, dupl_rows=None, single_val_cols=None, show='changes'):  | 
            ||
| 51 | '''  | 
            ||
| 52 | Perform initial data cleaning tasks on a dataset, such as dropping single valued and empty rows, empty \  | 
            ||
| 53 | columns as well as optimizing the datatypes.  | 
            ||
| 54 | |||
| 55 | Parameters  | 
            ||
| 56 | ----------  | 
            ||
| 57 | data: 2D dataset that can be coerced into Pandas DataFrame.  | 
            ||
| 58 | Input the initial dataset here.  | 
            ||
| 59 | |||
| 60 | data_cleaned: 2D dataset that can be coerced into Pandas DataFrame.  | 
            ||
| 61 | Input the cleaned / updated dataset here.  | 
            ||
| 62 | |||
| 63 | dupl_rows: list, default None  | 
            ||
| 64 | List of duplicate row indices.  | 
            ||
| 65 | |||
| 66 | single_val_cols: list, default None  | 
            ||
| 67 | List of single-valued column indices. I.e. columns where all cells contain the same value. \  | 
            ||
| 68 | NaNs count as a separate value.  | 
            ||
| 69 | |||
| 70 |     show: {'all', 'changes', None} default 'all' | 
            ||
| 71 | Specify verbosity of the output.  | 
            ||
| 72 | * 'all': Print information about the data before and after cleaning as well as information about changes.  | 
            ||
| 73 | * 'changes': Print out differences in the data before and after cleaning.  | 
            ||
| 74 | * None: No information about the data and the data cleaning is printed.  | 
            ||
| 75 | |||
| 76 | Returns:  | 
            ||
| 77 | -------  | 
            ||
| 78 | Print statement highlighting the datasets or changes between the two datasets.  | 
            ||
| 79 | |||
| 80 | '''  | 
            ||
| 81 | |||
| 82 | if show in ['changes', 'all']:  | 
            ||
| 83 | dupl_rows = [] if dupl_rows is None else dupl_rows.copy()  | 
            ||
| 84 | single_val_cols = [] if single_val_cols is None else single_val_cols.copy()  | 
            ||
| 85 | data_mem = _memory_usage(data)  | 
            ||
| 86 | data_cl_mem = _memory_usage(data_cleaned)  | 
            ||
| 87 | data_mv_tot = _missing_vals(data)['mv_total']  | 
            ||
| 88 | data_cl_mv_tot = _missing_vals(data_cleaned)['mv_total']  | 
            ||
| 89 | |||
| 90 | if show == 'all':  | 
            ||
| 91 |             print('Before data cleaning:\n') | 
            ||
| 92 |             print(f'dtypes:\n{data.dtypes.value_counts()}') | 
            ||
| 93 |             print(f'\nNumber of rows: {data.shape[0]}') | 
            ||
| 94 |             print(f'Number of cols: {data.shape[1]}') | 
            ||
| 95 |             print(f"Missing values: {data_mv_tot}") | 
            ||
| 96 |             print(f'Memory usage: {data_mem} KB') | 
            ||
| 97 |             print('_______________________________________________________\n') | 
            ||
| 98 |             print('After data cleaning:\n') | 
            ||
| 99 |             print(f'dtypes:\n{data_cleaned.dtypes.value_counts()}') | 
            ||
| 100 |             print(f'\nNumber of rows: {data_cleaned.shape[0]}') | 
            ||
| 101 |             print(f'Number of cols: {data_cleaned.shape[1]}') | 
            ||
| 102 |             print(f"Missing values: {data_cl_mv_tot}") | 
            ||
| 103 |             print(f'Memory usage: {data_cl_mem} KB') | 
            ||
| 104 |             print('_______________________________________________________\n') | 
            ||
| 105 | |||
| 106 |         print(f'Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}') | 
            ||
| 107 | print(f'\nChanges:')  | 
            ||
| 108 |         print(f'Dropped rows: {data.shape[0]-data_cleaned.shape[0]}') | 
            ||
| 109 |         print(f'     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})') | 
            ||
| 110 |         print(f'Dropped columns: {data.shape[1]-data_cleaned.shape[1]}') | 
            ||
| 111 |         print(f'     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})') | 
            ||
| 112 |         print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}") | 
            ||
| 113 | mem_change = data_mem-data_cl_mem  | 
            ||
| 114 |         print(f'Reduced memory by: {round(mem_change,2)} KB (-{round(100*mem_change/data_mem,1)}%)') | 
            ||
| 115 | |||
| 202 |