| Conditions | 7 |
| Total Lines | 131 |
| Code Lines | 84 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments inside a method's body, this is usually a good sign that the commented part should be extracted into a new method; the comment itself is a useful starting point when coming up with a good name for that new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | ''' |
||
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure is composed of four panels: a bar plot with the share of missing values per column (top left), a
    heatmap marking every missing cell (bottom left), a text summary (top right) and a scatter plot with the
    number of missing values per row (bottom right).

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
        information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib
        documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    The grid specification (as returned by ``fig.add_gridspec``) that lays out the four panels, or None if the
    dataset does not contain any missing values (in which case only a message is printed).
    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # keep only the columns that actually contain missing values, most affected first
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values (computed once and reused for every panel)
    mv_stats = _missing_vals(data)
    mv_cols = mv_stats['mv_cols']  # data.isna().sum(axis=0)
    mv_rows = mv_stats['mv_rows']  # data.isna().sum(axis=1)
    mv_total = mv_stats['mv_total']
    mv_cols_ratio = mv_stats['mv_cols_ratio']  # mv_cols / data.shape[0]
    total_datapoints = data.shape[0] * data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round(mv_cols_ratio * 100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols) - 0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio) * 100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height + 0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric angle; a string such as '90' is not a documented rotation value
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    # thin out the row ticks: keep every fifth tick, rounded to the nearest ten
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for spine in ax2.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color': '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for spine in ax4.spines.values():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim(len(mv_rows), 0)  # limit and invert y-axis so rows line up with the heatmap
    ax4.set_xlim(0, max(mv_rows) + 0.5)
    ax4.grid(linestyle=':', linewidth=1)

    ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
    return grid
||
| 233 | |||
| 351 |