GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( db9ab6...174a77 )
by Andreas
01:14
created

klib.describe.corr_mat()   B

Complexity

Conditions 6

Size

Total Lines 63
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
eloc 21
nop 6
dl 0
loc 63
rs 8.4426
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
'''
2
Functions for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import scipy
14
import seaborn as sns
15
16
from .clean import drop_missing
17
from .utils import _corr_selector
18
from .utils import _missing_vals
19
from .utils import _validate_input_bool
20
from .utils import _validate_input_int
21
from .utils import _validate_input_range
22
from .utils import _validate_input_smaller
23
24
25
# Functions
26
27
# Categorical Plot
28
def cat_plot(data, figsize=(10, 14), top=3, bottom=3, bar_color_top='#5ab4ac', bar_color_bottom='#d8b365', cmap='BrBG'):
    '''
    Two-dimensional visualization of the number and frequency of categorical features.

    For every non-numeric column a bar chart of the most / least frequent values and a short text summary are
    drawn. Below, a heatmap of the whole dataset encodes each cell as 2 (one of the "top" values), -2 (one of the
    "bottom" values) or 0 (anything else).

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    figsize: tuple, default (10, 14)
        Use to control the figure size.

    top: int, default 3
        Show the "top" most frequent values in a column. 0 disables the group.

    bottom: int, default 3
        Show the "bottom" most frequent values in a column. 0 disables the group.

    bar_color_top: color, default '#5ab4ac'
        Use to control the color of the bars indicating the most common values.

    bar_color_bottom: color, default '#d8b365'
        Use to control the color of the bars indicating the least common values.

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    Returns
    -------
    gs: GridSpec of the created figure (the figure itself is available as gs.figure).
        None is returned if the data contains no categorical columns.

    '''

    # Coerce first: the range validation below reads data.shape, which plain lists do not have
    data = pd.DataFrame(data).copy()

    # Validate Inputs
    _validate_input_int(top, 'top')
    _validate_input_int(bottom, 'bottom')
    _validate_input_range(top, 'top', 0, data.shape[1])
    _validate_input_range(bottom, 'bottom', 0, data.shape[1])

    cols = list(data.select_dtypes(exclude=['number']).columns)  # categorical cols
    data = data[cols].applymap(str)

    if len(cols) == 0:
        # Nothing to draw; a grid with zero columns cannot be created
        print('No columns with categorical data were detected.')
        return None

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=len(cols), wspace=0.2)

    for count, col in enumerate(cols):

        n_unique = data[col].nunique(dropna=False)
        value_counts = data[col].value_counts()
        lim_top, lim_bot = top, bottom

        # Too few distinct values for both groups: split them evenly so the groups never overlap
        if n_unique < top+bottom:
            lim_top = lim_bot = int(n_unique//2)

        # A disabled group stays disabled, independently of the other group
        if top == 0:
            lim_top = 0
        if bottom == 0:
            lim_bot = 0

        value_counts_top = value_counts.iloc[:lim_top]
        # Guard the zero case explicitly: a "-0:" slice would select every value
        value_counts_bot = value_counts.iloc[-lim_bot:] if lim_bot > 0 else value_counts.iloc[:0]
        value_counts_idx_top = list(map(str, value_counts_top.index.tolist()))
        value_counts_idx_bot = list(map(str, value_counts_bot.index.tolist()))

        # Encode the column for the heatmap in one assignment (no chained indexing):
        # 2 = top value, -2 = bottom value, 0 = everything else
        is_top = data[col].isin(value_counts_idx_top)
        is_bot = data[col].isin(value_counts_idx_bot)
        data[col] = np.select([is_top, is_bot], [2, -2], default=0)

        # Barcharts
        ax_top = fig.add_subplot(gs[:1, count:count+1])
        ax_top.bar(value_counts_idx_top, value_counts_top, color=bar_color_top, width=0.85)
        ax_top.bar(value_counts_idx_bot, value_counts_bot, color=bar_color_bottom, width=0.85)
        ax_top.set(frame_on=False)
        ax_top.tick_params(axis='x', labelrotation=90)

        # Summary stats
        ax_bottom = fig.add_subplot(gs[1:2, count:count+1])
        ax_bottom.get_yaxis().set_visible(False)
        ax_bottom.get_xaxis().set_visible(False)
        ax_bottom.set(frame_on=False)
        ax_bottom.text(0, 0, f'Unique values: {n_unique}\n\n'
                       f'Top {top} vals: {sum(value_counts_top)} ({sum(value_counts_top)/data.shape[0]*100:.1f}%)\n'
                       f'Bottom {bottom} vals: {sum(value_counts_bot)} ' +
                       f'({sum(value_counts_bot)/data.shape[0]*100:.1f}%)',
                       transform=ax_bottom.transAxes, color='#111111', fontsize=11)

    # Heatmap
    data = data.astype('int')
    ax_hm = fig.add_subplot(gs[2:, :])
    # vmin / vmax are wider than the encoded range (-2 .. 2), so only the middle part of the colormap is used
    sns.heatmap(data, cmap=cmap, cbar=False, vmin=-4.25, vmax=4.25, ax=ax_hm)
    ax_hm.set_yticks(np.round(ax_hm.get_yticks()[0::5], -1))
    ax_hm.set_yticklabels(ax_hm.get_yticks())
    ax_hm.set_xticklabels(ax_hm.get_xticklabels(),
                          horizontalalignment='center',
                          fontweight='light',
                          fontsize='medium')
    ax_hm.tick_params(length=1, colors='#111111')

    gs.figure.suptitle('Categorical data plot', x=0.47, y=0.925, fontsize=18, color='#111111')

    return gs
135
136
137
# Correlation Matrix
138
def corr_mat(data, split=None, threshold=0, target=None, method='pearson', colored=True):
    '''
    Returns a color-encoded correlation matrix.

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'above', 'below'}, default None
        Type of split to be performed. Passed on to _corr_selector together with threshold.

    threshold: float, default 0
        Value between -1 <= threshold <= 1 (the range accepted by the input validation).

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. If a string is given it is used as the label of a column in data; that
        column is removed from the features. A list or array is aligned with the rows of data by position. If
        provided, only the correlation of each feature with the target is returned, sorted in descending order.

    method: {'pearson', 'spearman', 'kendall'}, default 'pearson'
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more expensive but
                   more robust in smaller datasets than 'spearman'.

    colored: bool, default True
        If True the negative values in the correlation matrix are colored in red.

    Returns
    -------
    Pandas Styler object if colored = True
    Pandas DataFrame if colored = False

    '''

    # Validate Inputs
    _validate_input_range(threshold, 'threshold', -1, 1)

    def color_negative_red(val):
        # An empty string leaves the cell unstyled (instead of emitting an invalid "color: None" rule)
        return 'color: #FF3344' if val < 0 else ''

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)

        elif isinstance(target, pd.Series):
            target_data = pd.Series(target)

        else:
            # Lists / arrays have no index of their own: align them with the rows of data by position,
            # otherwise corrwith would match on index labels
            target_data = pd.Series(target, index=data.index)

        # Lists and arrays have no .name attribute; the Series built from them carries name=None
        target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method)).rename_axis(target, axis=1)
        corr = corr.sort_values(corr.columns[0], ascending=False)

    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        return corr.style.applymap(color_negative_red).format('{:.2f}', na_rep='-')

    return corr
201
202
203
# Correlation matrix / heatmap
204
def corr_plot(data, split=None, threshold=0, target=None, method='pearson', cmap='BrBG', figsize=(12, 10), annot=True,
              dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'above', 'below'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * above: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
        * below: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.

    threshold: float, default 0
        Value between -1 <= threshold <= 1 (the range accepted by the input validation).

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature
        and the label.

    method: {'pearson', 'spearman', 'kendall'}, default 'pearson'
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more expensive but
                   more robust in smaller datasets than 'spearman'.

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.

    '''

    # Validate Inputs
    _validate_input_range(threshold, 'threshold', -1, 1)
    _validate_input_bool(annot, 'annot')
    _validate_input_bool(dev, 'dev')

    data = pd.DataFrame(data)

    corr = corr_mat(data, split=split, threshold=threshold, target=target, method=method, colored=False)

    # The builtin bool is used as dtype; the np.bool alias no longer exists in current NumPy releases
    mask = np.zeros_like(corr, dtype=bool)

    # Same check corr_mat uses to decide whether a target was supplied. Only the square feature-by-feature matrix
    # is symmetric; hiding the "upper triangle" of the single-column target result would drop its first row.
    has_target = isinstance(target, (str, list, pd.Series, np.ndarray))
    if not has_target:
        mask = np.triu(np.ones_like(corr, dtype=bool))

    # Default colorbar limits: slightly inside the range of the visible coefficients
    vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap; user supplied kwargs come last and therefore override the defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              **kwargs}

    # Draw heatmap with mask and default settings
    sns.heatmap(corr, center=0, fmt='.2f', **kwargs)

    ax.set_title(f'Feature-correlation ({method})', fontdict={'fontsize': 18})

    # Settings
    if dev:
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - method: {method} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax
327
328
329
# Distribution plot
330
def dist_plot(data, mean_color='orange', figsize=(14, 2), fill_range=(0.025, 0.975), hist=False, bins=None,
              showall=False, kde_kws=None, rug_kws=None, fill_kws=None, font_kws=None):
    '''
    Two-dimensional visualization of the distribution of numerical features.

    One figure is created per numeric column, showing the density estimate, a rug plot, a shaded quantile range,
    markers for mean, median and mean +/- one standard deviation, and a few summary statistics.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    mean_color: color, default 'orange'
        Color of the vertical line indicating the mean of the data.

    figsize: tuple, default (14, 2)
        Controls the figure size.

    fill_range: tuple, default (0.025, 0.975)
        Set the quantiles for shading. Default spans 95% of the data, which is about two std. deviations
        above and below the mean.

    hist: bool, default False
        Set to True to display histogram bars in the plot.

    bins: integer, default None
        Specification of the number of hist bins. Requires hist = True

    showall: bool, default False
        Set to True to remove the output limit of 20 plots.

    kde_kws: dict, default {'alpha': 0.7, 'linewidth': 1.5}
        Keyword arguments for kdeplot().

    rug_kws: dict, default {'color': 'brown', 'alpha': 0.5, 'linewidth': 2, 'height': 0.04}
        Keyword arguments for rugplot().

    fill_kws: dict, default {'color': 'brown', 'alpha': 0.1}
        Keyword arguments to control the fill.

    font_kws: dict, default {'color':  '#111111', 'weight': 'normal', 'size': 11}
        Keyword arguments to control the font.

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object of the last plot for further tweaking. None is returned if the data contains no
        numeric columns.

    '''

    # `import scipy` alone does not load the stats submodule on SciPy versions without lazy submodule loading
    from scipy import stats

    # Validate Inputs
    _validate_input_bool(hist, 'hist')
    _validate_input_bool(showall, 'showall')
    _validate_input_range(fill_range[0], 'fill_range_lower', 0, 1)
    _validate_input_range(fill_range[1], 'fill_range_upper', 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], 'fill_range')

    # Handle dictionary defaults (copies, so dictionaries passed in by the caller are never modified)
    kde_kws = {'alpha': 0.7, 'linewidth': 1.5} if kde_kws is None else kde_kws.copy()
    rug_kws = {'color': 'brown', 'alpha': 0.5, 'linewidth': 2, 'height': 0.04} if rug_kws is None else rug_kws.copy()
    fill_kws = {'color': 'brown', 'alpha': 0.1} if fill_kws is None else fill_kws.copy()
    font_kws = {'color':  '#111111', 'weight': 'normal', 'size': 11} if font_kws is None else font_kws.copy()

    data = drop_missing(pd.DataFrame(data).copy())  # remove empty columns / rows
    cols = list(data.select_dtypes(include=['number']).columns)
    data = data[cols]

    if len(cols) == 0:
        print('No columns with numeric data were detected.')

    # Only announce the limit when columns are actually left out
    elif len(cols) > 20 and showall is False:
        print(
            f'Note: The number of numerical features is very large ({len(cols)}), please consider splitting the data. '
            'Showing plots for the first 20 numerical features. Override this by setting showall=True.')
        cols = cols[:20]

    ax = None  # stays None if there is nothing to plot

    for col in cols:
        dropped_values = data[col].isna().sum()
        if dropped_values > 0:
            print(f'Dropped {dropped_values} missing values from column {col}.')
            col_data = data[col].dropna(axis=0)
        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        # NOTE(review): distplot is deprecated in recent seaborn releases - confirm the supported seaborn version
        ax = sns.distplot(col_data, bins=bins, hist=hist, rug=True, kde_kws=kde_kws,
                          rug_kws=rug_kws, hist_kws={'alpha': 0.5, 'histtype': 'step'})

        # Vertical lines and fill
        # The first line of the Axes is taken to be the density curve; its points are reused for shading and for
        # the height of the vertical markers
        x, y = ax.lines[0].get_xydata().T
        lower_bound = np.quantile(col_data, fill_range[0])
        upper_bound = np.quantile(col_data, fill_range[1])
        ax.fill_between(x, y,
                        where=((x >= lower_bound) & (x <= upper_bound)),
                        label=f'{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%',
                        **fill_kws)

        mean = np.mean(col_data)
        median = np.median(col_data)
        std = stats.tstd(col_data)
        ax.vlines(x=mean,
                  ymin=0,
                  ymax=np.interp(mean, x, y),
                  ls='dotted', color=mean_color, lw=2, label='mean')
        ax.vlines(x=median,
                  ymin=0,
                  ymax=np.interp(median, x, y),
                  ls=':', color='.3', label='median')
        ax.vlines(x=[mean-std, mean+std],
                  ymin=0,
                  ymax=[np.interp(mean-std, x, y), np.interp(mean+std, x, y)], ls=':', color='.5',
                  label='\u03BC \u00B1 \u03C3')

        ax.set_ylim(0,)
        ax.set_xlim(ax.get_xlim()[0]*1.15, ax.get_xlim()[1]*1.15)

        # Annotations and legend
        ax.text(0.01, 0.85, f'Mean: {np.round(mean,2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.7, f'Std. dev: {np.round(std,2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.55, f'Skew: {np.round(stats.skew(col_data),2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.4, f'Kurtosis: {np.round(stats.kurtosis(col_data),2)}',  # Excess Kurtosis
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.25, f'Count: {np.round(len(col_data))}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.legend(loc='upper right')

    return ax
457
458
459
# Missing value plot
460
def missingval_plot(data, cmap='PuBuGn', figsize=(12, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar chart with the share of missing values per column, a heatmap of the
    missing cells, a text summary and a scatter plot of the number of missing values per row.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib
        documentation.

    figsize: tuple, default (12, 12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    gs: GridSpec of the created figure (the figure itself is available as gs.figure).
        None is returned if the dataset contains no missing values.

    '''

    # Validate Inputs
    _validate_input_bool(sort, 'sort')

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        final_cols = mv_cols_sorted.drop(mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()).keys().tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values
    # NOTE(review): relies on the order of the entries returned by _missing_vals - verify against the helper
    mv_total, mv_rows, mv_cols, _, mv_cols_ratio = _missing_vals(data).values()
    total_datapoints = data.shape[0]*data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=6, left=0.05, wspace=0.05)
    ax1 = fig.add_subplot(gs[:1, :5])
    ax2 = fig.add_subplot(gs[1:, :5])
    ax3 = fig.add_subplot(gs[:1, 5:])
    ax4 = fig.add_subplot(gs[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric angle; a numeric string is not a valid rotation value
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows)+0.5)
    ax4.grid(linestyle=':', linewidth=1)

    gs.figure.suptitle('Missing value plot', x=0.45, y=0.94, fontsize=18, color='#111111')

    return gs
593