| Conditions | 5 |
| Total Lines | 75 |
| Code Lines | 42 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
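Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object, or Replace Method with Method Object.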
| 1 | """ |
||
def _diff_report(
    data: pd.DataFrame,
    data_cleaned: pd.DataFrame,
    dupl_rows: Optional[List[Union[str, int]]] = None,
    single_val_cols: Optional[List[str]] = None,
    show: Optional[str] = "changes",  # Optional[Literal["all", "changes"]] = "changes",
) -> None:
    """Print a report about the changes between a dataset and its cleaned version.

    The report covers dropped rows and columns, missing values and memory usage.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the initial dataset here
    data_cleaned : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / updated
        dataset here
    dupl_rows : Optional[List[Union[str, int]]], optional
        List of duplicate row indices, by default None
    single_val_cols : Optional[List[str]], optional
        List of single-valued column indices. I.e. columns where all cells contain the same
        value. NaNs count as a separate value, by default None
    show : str, optional
        {'all', 'changes', None}, by default "changes"
        Specify verbosity of the output:

            * 'all': Print information about the data before and after cleaning as well as
              information about changes and memory usage (deep). Please be aware, that this
              can slow down the function by quite a bit.
            * 'changes': Print out differences in the data before and after cleaning.
            * None: No information about the data and the data cleaning is printed.

        Any other value is treated like None.

    Returns
    -------
    None
        Print statement highlighting the datasets or changes between the two datasets.
    """
    # Guard clause: nothing is computed or printed unless a report was requested.
    if show not in ("changes", "all"):
        return

    # The sequences are only read (len / formatting), so no defensive copy is needed.
    dupl_rows = [] if dupl_rows is None else dupl_rows
    single_val_cols = [] if single_val_cols is None else single_val_cols

    # Deep introspection is only requested for the verbose report; measuring once with
    # the right flag avoids computing shallow usage that would be thrown away.
    deep = show == "all"
    data_mem = _memory_usage(data, deep=deep)
    data_cl_mem = _memory_usage(data_cleaned, deep=deep)
    data_mv_tot = _missing_vals(data)["mv_total"]
    data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"]

    if show == "all":
        separator = "_______________________________________________________\n"
        print("Before data cleaning:\n")
        print(f"dtypes:\n{data.dtypes.value_counts()}")
        print(f"\nNumber of rows: {data.shape[0]}")
        print(f"Number of cols: {data.shape[1]}")
        print(f"Missing values: {data_mv_tot}")
        print(f"Memory usage: {data_mem} MB")
        print(separator)
        print("After data cleaning:\n")
        print(f"dtypes:\n{data_cleaned.dtypes.value_counts()}")
        print(f"\nNumber of rows: {data_cleaned.shape[0]}")
        print(f"Number of cols: {data_cleaned.shape[1]}")
        print(f"Missing values: {data_cl_mv_tot}")
        print(f"Memory usage: {data_cl_mem} MB")
        print(separator)

    print(
        f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}"
    )
    print("\nChanges:")
    print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
    print(f"     of which {len(dupl_rows)} duplicates. (Rows: {dupl_rows})")
    print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
    print(
        f"     of which {len(single_val_cols)} single valued. (Columns: {single_val_cols})"
    )
    print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")

    mem_change = data_mem - data_cl_mem
    # An original dataset reporting 0 MB would otherwise divide by zero; report 0.0 %.
    mem_change_pct = round(100 * mem_change / data_mem, 1) if data_mem else 0.0
    print(
        f"Reduced memory by at least: {round(mem_change,2)} MB (-{mem_change_pct}%)"
    )
||
| 249 |