Conditions | 5 |
Total Lines | 72 |
Code Lines | 32 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for a good name for that method.
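Commonly applied refactorings include Extract Method.
If many parameters/temporary variables are present, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.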
1 | """ |
||
71 | def _diff_report( |
||
72 | data: pd.DataFrame, |
||
73 | data_cleaned: pd.DataFrame, |
||
74 | dupl_rows: Optional[list[str | int]] = None, |
||
75 | single_val_cols: Optional[list[str]] = None, |
||
76 | show: Optional[Literal["all", "changes"]] = "changes", |
||
77 | ) -> None: |
||
78 | """Provide information about changes between two datasets, such as dropped rows \ |
||
79 | and columns, memory usage and missing values. |
||
80 | |||
81 | Parameters |
||
82 | ---------- |
||
83 | data : pd.DataFrame |
||
84 | 2D dataset that can be coerced into Pandas DataFrame. Input the initial \ |
||
85 | dataset here |
||
86 | data_cleaned : pd.DataFrame |
||
87 | 2D dataset that can be coerced into Pandas DataFrame. Input the cleaned / \ |
||
88 | updated dataset here |
||
89 | dupl_rows : Optional[list[str | int]], optional |
||
90 | List of duplicate row indices, by default None |
||
91 | single_val_cols : Optional[List[str]], optional |
||
92 | List of single-valued column indices. I.e. columns where all cells contain \ |
||
93 | the same value. NaNs count as a separate value, by default None |
||
94 | show : str, optional |
||
95 | {"all", "changes", None}, by default "changes" |
||
96 | Specify verbosity of the output: |
||
97 | * "all": Print information about the data before and after cleaning as \ |
||
98 | well as information about changes and memory usage (deep). Please be \ |
||
99 | aware, that this can slow down the function by quite a bit. |
||
100 | * "changes": Print out differences in the data before and after cleaning. |
||
101 | * None: No information about the data and the data cleaning is printed. |
||
102 | |||
103 | Returns |
||
104 | ------- |
||
105 | None |
||
106 | Print statement highlighting the datasets or changes between the two datasets. |
||
107 | """ |
||
108 | if show not in ["changes", "all"]: |
||
109 | return |
||
110 | |||
111 | dupl_rows = [] if dupl_rows is None else dupl_rows.copy() |
||
112 | single_val_cols = [] if single_val_cols is None else single_val_cols.copy() |
||
113 | data_mem = _memory_usage(data, deep=False) |
||
114 | data_cl_mem = _memory_usage(data_cleaned, deep=False) |
||
115 | data_mv_tot = _missing_vals(data)["mv_total"] |
||
116 | data_cl_mv_tot = _missing_vals(data_cleaned)["mv_total"] |
||
117 | |||
118 | if show == "all": |
||
119 | data_mem = _memory_usage(data, deep=True) |
||
120 | data_cl_mem = _memory_usage(data_cleaned, deep=True) |
||
121 | _print_cleaning_details("Before data cleaning:\n", data, data_mv_tot, data_mem) |
||
122 | _print_cleaning_details( |
||
123 | "After data cleaning:\n", data_cleaned, data_cl_mv_tot, data_cl_mem |
||
124 | ) |
||
125 | |||
126 | print( |
||
127 | f"Shape of cleaned data: {data_cleaned.shape} - " |
||
128 | f"Remaining NAs: {data_cl_mv_tot}\n" |
||
129 | ) |
||
130 | print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}") |
||
131 | print( |
||
132 | f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n" |
||
133 | ) |
||
134 | print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}") |
||
135 | print( |
||
136 | f" of which {len(single_val_cols)} single valued." |
||
137 | f" Columns: {single_val_cols}" |
||
138 | ) |
||
139 | print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}") |
||
140 | mem_change = data_mem - data_cl_mem |
||
141 | mem_perc = round(100 * mem_change / data_mem, 2) |
||
142 | print(f"Reduced memory by at least: {round(mem_change,3)} MB (-{mem_perc}%)\n") |
||
143 | |||
276 |