GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( a20a47...fb17b7 )
by Andreas
03:33
created

klib.describe.cat_plot()   B

Complexity

Conditions 5

Size

Total Lines 102
Code Lines 52

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 52
nop 6
dl 0
loc 102
rs 8.1042
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
'''
2
Functions for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import scipy
14
import seaborn as sns
15
16
from .clean import drop_missing
17
from .utils import _corr_selector
18
from .utils import _missing_vals
19
from .utils import _validate_input_0_1
20
from .utils import _validate_input_bool
21
22
23
# Functions
24
25
# Categorical Plot
26
def cat_plot(data, figsize=(10, 14), top=3, bottom=3, bar_color_top='#5ab4ac', bar_color_bottom='#d8b365'):
    '''
    Two-dimensional visualization of the most and least frequent values of the non-numeric columns.

    For every non-numeric column a bar chart of the selected "top" and "bottom" values and a short text summary are
    drawn. Below, a heatmap shows for each row whether its value belongs to the "top" values, the "bottom" values or
    neither.

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    figsize: tuple, default (10, 14)
        Use to control the figure size.

    top: int, default 3
        Show the "top" most frequent values in a column.

    bottom: int, default 3
        Show the "bottom" most frequent values in a column.

    bar_color_top: color, default '#5ab4ac'
        Use to control the color of the bars indicating the most common values.

    bar_color_bottom: color, default '#d8b365'
        Use to control the color of the bars indicating the least common values.

    Returns
    -------
    figure
        The matplotlib figure, or None (after printing a note) if the data holds no non-numeric columns.

    '''

    data = pd.DataFrame(data).copy()
    cols = list(data.select_dtypes(exclude=['number']).columns)  # categorical cols

    if len(cols) == 0:
        print('No columns with categorical data were detected.')
        return None

    # Values are compared by their string representation, which is also what labels the bars.
    data = data[cols].astype(object).apply(lambda column: column.map(str))
    n_rows = data.shape[0]

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=len(cols), wspace=0.2)

    # Heatmap codes per column: 2 = "top" value, -2 = "bottom" value, 0 = neither. They are collected separately
    # instead of being written back into `data` through chained indexing, which pandas does not guarantee to work.
    codes = {}

    for count, col in enumerate(cols):

        counts = data[col].value_counts(sort=True)  # computed once, most frequent value first
        n_unique = data[col].nunique(dropna=False)

        if n_unique <= min(2, top+bottom):
            # Too few distinct values to honor top/bottom: split the available values in half instead.
            vals = int(n_unique//2)
            value_counts_top = counts.iloc[0:vals]
            # For vals == 0 the slice [-0:] selects every value, so a single distinct value is shown as "bottom".
            value_counts_bot = counts.iloc[-vals:]
        else:
            value_counts_top = counts.iloc[0:top]
            # [-0:] would select everything, hence the explicit empty selection for bottom == 0.
            value_counts_bot = counts.iloc[0:0] if bottom == 0 else counts.iloc[-bottom:]

        value_counts_idx_top = list(map(str, value_counts_top.index.tolist()))
        value_counts_idx_bot = list(map(str, value_counts_bot.index.tolist()))

        # A value selected as both "top" and "bottom" is coded as "top".
        is_top = data[col].isin(value_counts_idx_top).values
        is_bot = data[col].isin(value_counts_idx_bot).values
        codes[col] = np.where(is_top, 2, np.where(is_bot, -2, 0))

        # Barcharts
        ax_top = fig.add_subplot(gs[:1, count:count+1])
        ax_top.bar(value_counts_idx_top, value_counts_top, color=bar_color_top, width=0.85)
        ax_top.bar(value_counts_idx_bot, value_counts_bot, color=bar_color_bottom, width=0.85)
        ax_top.set(frame_on=False)
        ax_top.tick_params(axis='x', labelrotation=90)

        # Summary stats
        sum_top = sum(value_counts_top)
        sum_bot = sum(value_counts_bot)
        ax_bottom = fig.add_subplot(gs[1:2, count:count+1])
        ax_bottom.get_yaxis().set_visible(False)
        ax_bottom.get_xaxis().set_visible(False)
        ax_bottom.set(frame_on=False)
        # Adjacent string literals keep the source indentation out of the rendered text.
        ax_bottom.text(0, 0,
                       f'Unique values: {n_unique}\n\n'
                       f'Top {top} vals: {sum_top} ({sum_top/n_rows*100:.1f}%)\n'
                       f'Bottom {bottom} vals: {sum_bot} ({sum_bot/n_rows*100:.1f}%)',
                       transform=ax_bottom.transAxes, color='#111111', fontsize=11)

    heatmap_data = pd.DataFrame(codes, index=data.index, columns=cols)

    # Heatmap
    ax_hm = fig.add_subplot(gs[2:, :])
    sns.heatmap(heatmap_data, cmap='BrBG', cbar=False, vmin=-4.25, vmax=4.25, ax=ax_hm)
    ax_hm.set_yticks(np.round(ax_hm.get_yticks()[0::5], -1))
    ax_hm.set_yticklabels(ax_hm.get_yticks())
    ax_hm.set_xticklabels(ax_hm.get_xticklabels(),
                          horizontalalignment='center',
                          fontweight='light',
                          fontsize='medium')
    ax_hm.tick_params(length=1, colors='#111111')

    fig.suptitle('Categorical data plot', x=0.47, y=0.925, fontsize=18, color='#111111')

    return fig
128
129
130
# Correlation Matrix
131
def corr_mat(data, split=None, threshold=0, method='pearson'):
    '''
    Compute the pairwise correlations between the feature-columns and return them as a styled table.

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed. Passed on to _corr_selector together with threshold.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    method: {'pearson', 'spearman', 'kendall'}, default 'pearson'
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more expensive but
                    more robust in smaller datasets than 'spearman'.

    Returns
    -------
    Pandas Styler object
        Correlations are formatted with two decimals, negative values are colored red and missing values are shown
        as '-'. The underlying DataFrame is available through the Styler's "data" attribute.

    '''

    # Validate Inputs
    _validate_input_0_1(threshold, 'threshold')

    def color_negative_red(val):
        # An empty string leaves the cell unstyled; formatting None into the rule would emit invalid CSS.
        return 'color: #FF3344' if val < 0 else ''

    data = pd.DataFrame(data)
    corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    styler = corr.style
    # Newer pandas versions name the element-wise styling method "map"; fall back to "applymap" on older ones.
    style_elementwise = getattr(styler, 'map', None) or styler.applymap

    return style_elementwise(color_negative_red).format("{:.2f}", na_rep='-')
170
171
172
# Correlation matrix / heatmap
173
def corr_plot(data, split=None, threshold=0, target=None, method='pearson', cmap='BrBG', figsize=(12, 10), annot=True,
              dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature
        and the label. A string selects (and removes) the column of that name from data; any other accepted type is
        converted to a pd.Series and correlated with every column of data.

    method: {'pearson', 'spearman', 'kendall'}, default 'pearson'
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more expensive but
                   more robust in smaller datasets than 'spearman'.

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width in between the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.

    '''

    # Validate Inputs
    _validate_input_0_1(threshold, 'threshold')
    _validate_input_bool(annot, 'annot')
    _validate_input_bool(dev, 'dev')

    data = pd.DataFrame(data)

    # Obtain correlations
    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)
            target_label = target
        else:
            target_data = pd.Series(target)
            # Only a Series can carry a name; using the raw list/array as axis name would raise.
            target_label = target_data.name

        # Pass the method on so that the computed coefficients match the method named in the title.
        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr.columns.name = target_label
        corr = _corr_selector(corr, split=split, threshold=threshold)
        corr = corr.sort_values(corr.columns[0], ascending=False)
        vmax = np.round(np.nanmax(corr)-0.05, 2)
        vmin = np.round(np.nanmin(corr)+0.05, 2)
        mask = False
        square = False

    else:
        corr = corr_mat(data, split=split, threshold=threshold, method=method).data

        # Generate mask for the upper triangle (the builtin bool replaces the removed np.bool alias)
        mask = np.triu(np.ones_like(corr, dtype=bool))
        square = True

        # Derive the color limits from the visible (lower triangle) coefficients only
        vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
        vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap; user supplied kwargs take precedence over these defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'ax': ax,
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr,
                center=0,
                square=square,
                fmt='.2f',
                **kwargs
                )

    ax.set_title(f'Feature-correlation ({method})', fontdict={'fontsize': 18})

    # Display settings
    if dev:
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - method: {method} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax
323
324
325
# Distribution plot
326
def dist_plot(data, mean_color='orange', figsize=(14, 2), fill_range=(0.025, 0.975), hist=False, bins=None,
              showall=False, kde_kws=None, rug_kws=None, fill_kws=None, font_kws=None):
    '''
    Two-dimensional visualization of the distribution of numerical features.

    One figure is created per numeric column, showing the density curve, a rug plot, the shaded quantile range,
    markers for mean, median and quartiles, and a short text summary.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    mean_color: color, default 'orange'
        Color of the vertical line indicating the mean of the data.

    figsize: tuple, default (14, 2)
        Use to control the figure size.

    fill_range: tuple, default (0.025, 0.975)
        Use to set the quantiles for shading. Default spans 95% of the data, which is about two std. deviations
        above and below the mean.

    hist: bool, default False
        Set to True to display histogram bars in the plot.

    bins: integer, default None
        Specification of the number of hist bins. Requires hist = True

    showall: bool, default False
        Set to True to remove the output limit of 20 plots.

    kde_kws: dict, default None
        Keyword arguments for kdeplot().

    rug_kws: dict, default None
        Keyword arguments for rugplot().

    fill_kws: dict, default None
        Keyword arguments to control the fill.

    font_kws: dict, default None
        Keyword arguments to control the font.

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object of the last plotted feature for further tweaking, or None if no numeric columns
        were found.

    '''

    # A bare "import scipy" is not guaranteed to make the stats subpackage available, so import it explicitly.
    from scipy import stats

    # Validate Inputs
    _validate_input_bool(hist, 'hist')
    _validate_input_bool(showall, 'showall')

    # Default settings, overridden by user supplied entries (the caller's dictionaries are never modified)
    kde_kws = {'color': 'k', 'alpha': 0.7, 'linewidth': 1, **(kde_kws or {})}
    rug_kws = {'color': 'brown', 'alpha': 0.5, 'linewidth': 2, 'height': 0.04, **(rug_kws or {})}
    fill_kws = {'color': 'brown', 'alpha': 0.1, **(fill_kws or {})}
    font_kws = {'color':  '#111111', 'weight': 'normal', 'size': 11, **(font_kws or {})}

    data = drop_missing(pd.DataFrame(data).copy())  # drop empty columns and rows
    cols = list(data.select_dtypes(include=['number']).columns)  # numeric cols
    data = data[cols]

    if len(cols) == 0:
        print('No columns with numeric data were detected.')
        return None

    # Only warn when features are actually left out, i.e. when there are more than 20 of them.
    if len(cols) > 20 and showall is False:
        print(f'Note: The number of numerical features is very large ({len(cols)}), please consider splitting the '
              'data. Showing plots for the first 20 numerical features. Override this by setting showall=True.')
        cols = cols[:20]

    ax = None
    for col in cols:
        # Drop missing values
        dropped_values = data[col].isna().sum()
        if dropped_values > 0:
            print(f'Dropped {dropped_values} missing values from column {col}.')
        col_data = data[col].dropna(axis=0)

        _, ax = plt.subplots(figsize=figsize)
        ax = sns.distplot(col_data, bins=bins, hist=hist, rug=True, kde_kws=kde_kws,
                          rug_kws=rug_kws, hist_kws={'alpha': 0.5, 'histtype': 'step'}, ax=ax)

        mean = np.mean(col_data)
        median = np.median(col_data)
        quartile_1 = np.quantile(col_data, 0.25)
        quartile_3 = np.quantile(col_data, 0.75)

        # Vertical lines and fill
        line = ax.lines[0]  # assumes the first line on the axes is the density curve
        x = line.get_xydata()[:, 0]
        y = line.get_xydata()[:, 1]
        ax.fill_between(x, y,
                        where=(
                            (x >= np.quantile(col_data, fill_range[0])) &
                            (x <= np.quantile(col_data, fill_range[1]))),
                        label=f'{fill_range[0]*100:.0f}% - {fill_range[1]*100:.0f}%',
                        **fill_kws)

        # Each marker ends where it meets the density curve
        ax.vlines(x=mean, ymin=0, ymax=np.interp(mean, x, y),
                  ls='dotted', color=mean_color, lw=2, label='mean')
        ax.vlines(x=median, ymin=0, ymax=np.interp(median, x, y),
                  ls=':', color='.3', label='median')
        ax.vlines(x=quartile_1, ymin=0, ymax=np.interp(quartile_1, x, y),
                  ls=':', color='.5', label='25%')
        ax.vlines(x=quartile_3, ymin=0, ymax=np.interp(quartile_3, x, y),
                  ls=':', color='.5', label='75%')

        ax.set_ylim(0,)
        # Widen the x-axis relative to its span. Scaling both limits by a factor would move the lower limit in the
        # wrong direction whenever it is positive and cut off part of the data.
        x_low, x_high = ax.get_xlim()
        x_pad = 0.05 * (x_high - x_low)
        ax.set_xlim(x_low - x_pad, x_high + x_pad)

        # Annotations and legend
        ax.text(0.01, 0.85, f'Mean: {np.round(mean,2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.7, f'Std. dev: {np.round(stats.tstd(col_data),2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.55, f'Skew: {np.round(stats.skew(col_data),2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.4, f'Kurtosis: {np.round(stats.kurtosis(col_data),2)}',  # Excess Kurtosis
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.25, f'Count: {len(col_data)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.legend(loc='upper right')

    return ax
461
462
463
# Missing value plot
464
def missingval_plot(data, cmap='PuBuGn', figsize=(12, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure combines a bar chart with the share of missing values per column, a heatmap marking every missing
    entry, a text summary and a scatter plot with the number of missing values per row.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib
        documentation.

    figsize: tuple, default (12, 12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    figure
        The matplotlib figure, or None (after printing a note) if the dataset holds no missing values.

    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        complete_cols = mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()
        final_cols = mv_cols_sorted.drop(complete_cols).keys().tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values (evaluated once and reused below)
    missing = _missing_vals(data)
    mv_cols = missing['mv_cols']  # data.isna().sum(axis=0)
    mv_rows = missing['mv_rows']  # data.isna().sum(axis=1)
    mv_total = missing['mv_total']
    mv_cols_ratio = missing['mv_cols_ratio']  # mv_cols / data.shape[0]
    total_datapoints = data.shape[0]*data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=6, left=0.05, wspace=0.05)
    ax1 = fig.add_subplot(gs[:1, :5])
    ax2 = fig.add_subplot(gs[1:, :5])
    ax3 = fig.add_subplot(gs[:1, 5:])
    ax4 = fig.add_subplot(gs[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # must be a number; the string '90' is rejected by current matplotlib
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows)+0.5)
    ax4.grid(linestyle=':', linewidth=1)

    fig.suptitle('Missing value plot', x=0.45, y=0.94, fontsize=18, color='#111111')

    return fig
597