Conditions | 6 |
Total Lines | 105 |
Code Lines | 66 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, particularly when they are combined with a good name. Besides, if your method is small, finding a good name for it is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign that you should extract the commented part into a new method, using the comment as a starting point when coming up with a good name for that new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
1 | ''' |
||
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar plot with the share of missing
    values per column, a heatmap marking every missing cell, a text summary,
    and a scatter plot with the number of missing values per row.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas
    DataFrame is provided, the index/column information is used to label the
    plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information
        can be found in the matplotlib documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid
        matplotlib color argument.

    Returns:
    -------
    ax: matplotlib Axes. Axes object with the heatmap, or None if the dataset
    contains no missing values (in which case only a message is printed).
    '''
    # Local import: pandas is needed only for the coercion below and the
    # module-level imports of this file are not visible from here.
    import pandas as pd

    # Honour the documented contract: accept anything DataFrame-coercible.
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    # Identify missing values (compute the boolean mask only once)
    missing = data.isna()
    mv_cols = missing.sum(axis=0)
    mv_rows = missing.sum(axis=1)
    mv_total = mv_cols.sum()

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    n_rows, n_cols = data.shape
    mv_cols_rel = mv_cols / n_rows
    total_datapoints = n_rows * n_cols

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    _missingval_bars(ax1, mv_cols, mv_cols_rel, cmap, spine_color)
    _missingval_heatmap(ax2, missing, spine_color)
    _missingval_summary(ax3, mv_cols, mv_rows, mv_total, total_datapoints, n_rows, n_cols)
    _missingval_rows(ax4, mv_rows, cmap, spine_color)

    return ax2


def _missingval_show_spines(ax, spine_color):
    '''Make all spines of `ax` visible and draw them in `spine_color`.'''
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)


def _missingval_bars(ax, mv_cols, mv_cols_rel, cmap, spine_color):
    '''Draw the per-column bar plot (share of missing values, annotated with counts).'''
    # Color bars by height. Safe division: the caller only gets here when at
    # least one value is missing, so the maximum is > 0.
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))
    ax.bar(range(len(mv_cols)), np.round((mv_cols_rel)*100, 2), color=colors)
    ax.get_xaxis().set_visible(False)
    ax.set(xlim=(-.5, len(mv_cols)-0.5))
    ax.set_ylim(0, np.max(mv_cols_rel)*100)
    ax.grid(linestyle=':', linewidth=1)
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax.tick_params(axis='y', colors='#111111', length=1)

    # Annotate the absolute number of missing values on top of each bar.
    for rect, label in zip(ax.patches, mv_cols):
        ax.text(.1 + rect.get_x() + rect.get_width() / 2,
                rect.get_height() + 0.5,
                label,
                ha='center',
                va='bottom',
                rotation=90,  # must be numeric: recent matplotlib rejects '90'
                alpha=0.5,
                fontsize='small')

    ax.set_frame_on(True)
    _missingval_show_spines(ax, spine_color)
    ax.spines['top'].set_color(None)


def _missingval_heatmap(ax, missing, spine_color):
    '''Draw the heatmap of the boolean missing-value mask.'''
    sns.heatmap(missing, cbar=False, cmap='binary', ax=ax)
    # Thin out the y ticks: keep every fifth one, rounded to the nearest ten.
    ax.set_yticks(np.round(ax.get_yticks()[0::5], -1))
    ax.set_yticklabels(ax.get_yticks())
    ax.set_xticklabels(
        ax.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax.tick_params(length=1, colors='#111111')
    _missingval_show_spines(ax, spine_color)


def _missingval_summary(ax, mv_cols, mv_rows, mv_total, total_datapoints, n_rows, n_cols):
    '''Write the textual summary statistics into an otherwise empty axes.'''
    font = {'color': '#111111',
            'weight': 'normal',
            'size': 12,
            }
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.set(frame_on=False)

    # (vertical position in axes coordinates, text), listed top to bottom
    summary = (
        (0.9, f"Total: {np.round(total_datapoints/1000,1)}K"),
        (0.7, f"Missing: {np.round(mv_total/1000,1)}K"),
        (0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%"),
        (0.3, f"Max-col: {np.round(mv_cols.max()/n_rows*100)}%"),
        (0.1, f"Max-row: {np.round(mv_rows.max()/n_cols*100)}%"),
    )
    for y_pos, text in summary:
        ax.text(0.1, y_pos, text, transform=ax.transAxes, fontdict=font)


def _missingval_rows(ax, mv_rows, cmap, spine_color):
    '''Draw the per-row scatter plot of missing-value counts.'''
    ax.get_yaxis().set_visible(False)
    for spine in ax.spines.values():
        spine.set_color(spine_color)
    ax.tick_params(axis='x', colors='#111111', length=1)

    ax.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".")
    ax.set_ylim(0, len(mv_rows))
    ax.set_ylim(ax.get_ylim()[::-1])  # invert y-axis so row 0 is at the top, like the heatmap
    ax.grid(linestyle=':', linewidth=1)
||
124 | |||
263 |