GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Branch master (5deb01)
by Andreas
02:32
created

klib.describe   A

Complexity

Total Complexity 32

Size/Duplication

Total Lines 607
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 275
dl 0
loc 607
rs 9.84
c 0
b 0
f 0
wmc 32

5 Functions

Rating   Name   Duplication   Size   Complexity  
C missingval_plot() 0 132 7
A corr_plot() 0 126 3
B corr_mat() 0 68 6
C dist_plot() 0 129 10
B cat_plot() 0 106 6
1
'''
2
Functions for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import scipy
14
import seaborn as sns
15
16
from .utils import (_corr_selector,
17
                    _missing_vals,
18
                    _validate_input_bool,
19
                    _validate_input_int,
20
                    _validate_input_smaller,
21
                    _validate_input_range)
22
23
24
__all__ = ['cat_plot',
25
           'corr_mat',
26
           'corr_plot',
27
           'dist_plot',
28
           'missingval_plot']
29
30
31
# Functions
32
33
# Categorical Plot
34
def cat_plot(data, figsize=(14, 14), top=3, bottom=3, bar_color_top='#5ab4ac', bar_color_bottom='#d8b365', cmap='BrBG'):
    '''
    Two-dimensional visualization of the number and frequency of categorical features.

    For every non-numeric column a bar chart of its most and least frequent values and a short text summary are \
    drawn. Below, a heatmap over all rows marks where these "top" and "bottom" values occur.

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    figsize: tuple, default (14, 14)
        Use to control the figure size.

    top: int, default 3
        Show the "top" most frequent values in a column. Use 0 to show none.

    bottom: int, default 3
        Show the "bottom" most frequent values in a column. Use 0 to show none.

    bar_color_top: color, default '#5ab4ac'
        Use to control the color of the bars indicating the most common values.

    bar_color_bottom: color, default '#d8b365'
        Use to control the color of the bars indicating the least common values.

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    Returns
    -------
    gs: GridSpec of the created figure (the figure itself is available as gs.figure). None is returned if no \
    categorical columns were detected.
    '''

    # Coerce before validating so that inputs without a "shape" attribute (e.g. nested lists) work as well
    data = pd.DataFrame(data).copy()

    # Validate Inputs
    _validate_input_int(top, 'top')
    _validate_input_int(bottom, 'bottom')
    _validate_input_range(top, 'top', 0, data.shape[1])
    _validate_input_range(bottom, 'bottom', 0, data.shape[1])

    cols = data.select_dtypes(exclude=['number']).columns.tolist()
    data = data[cols]

    if len(cols) == 0:
        # A grid with zero columns cannot be created, so there is nothing to draw
        print('No columns with categorical data were detected.')
        return None

    n_rows = data.shape[0]
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=len(cols), wspace=0.2)

    # Heatmap codes per column: 2 = one of the top values, -2 = one of the bottom values, 0 = everything else.
    # They are collected separately instead of overwriting the data so that any column dtype is supported.
    codes = {}

    for count, col in enumerate(cols):

        n_unique = data[col].nunique(dropna=False)
        value_counts = data[col].value_counts()
        lim_top, lim_bot = top, bottom

        # Not enough distinct values for both groups: split the available values evenly
        if n_unique < top+bottom:
            lim_top = lim_bot = int(n_unique//2)

        # An explicit request for "none" always wins over the even split
        if top == 0:
            lim_top = 0
        if bottom == 0:
            lim_bot = 0

        # Positional slices. The start of the bottom slice is computed explicitly because "[-0:]" would select
        # the whole series instead of nothing.
        value_counts_top = value_counts.iloc[:lim_top]
        value_counts_bot = value_counts.iloc[max(len(value_counts)-lim_bot, 0):]
        value_counts_idx_top = value_counts_top.index.tolist()
        value_counts_idx_bot = value_counts_bot.index.tolist()

        # If a value belongs to both groups, it is shown as a top value
        in_top = data[col].isin(value_counts_idx_top)
        in_bot = data[col].isin(value_counts_idx_bot)
        codes[col] = np.where(in_top, 2, np.where(in_bot, -2, 0))

        # Barcharts
        ax_top = fig.add_subplot(gs[:1, count:count+1])
        ax_top.bar(value_counts_idx_top, value_counts_top, color=bar_color_top, width=0.85)
        ax_top.bar(value_counts_idx_bot, value_counts_bot, color=bar_color_bottom, width=0.85)
        ax_top.set(frame_on=False)
        ax_top.tick_params(axis='x', labelrotation=90)

        # Summary stats
        sum_top = value_counts_top.sum()
        sum_bot = value_counts_bot.sum()
        ax_bottom = fig.add_subplot(gs[1:2, count:count+1])
        ax_bottom.get_yaxis().set_visible(False)
        ax_bottom.get_xaxis().set_visible(False)
        ax_bottom.set(frame_on=False)
        ax_bottom.text(0, 0, f'Unique values: {n_unique}\n\n'
                       f'Top {top} vals: {sum_top} ({sum_top/n_rows*100:.1f}%)\n'
                       f'Bot {bottom} vals: {sum_bot} '
                       f'({sum_bot/n_rows*100:.1f}%)',
                       transform=ax_bottom.transAxes, color='#111111', fontsize=11)

    # Heatmap
    heatmap_data = pd.DataFrame(codes, index=data.index)
    ax_hm = fig.add_subplot(gs[2:, :])
    # vmin/vmax are wider than the codes (-2 / 2) so that only the muted middle part of the colormap is used
    sns.heatmap(heatmap_data, cmap=cmap, cbar=False, vmin=-4.25, vmax=4.25, ax=ax_hm)
    # Keep every fifth tick, rounded to the nearest ten, and label the ticks with their positions
    ax_hm.set_yticks(np.round(ax_hm.get_yticks()[0::5], -1))
    ax_hm.set_yticklabels(ax_hm.get_yticks())
    ax_hm.set_xticklabels(ax_hm.get_xticklabels(),
                          horizontalalignment='center',
                          fontweight='light',
                          fontsize='medium')
    ax_hm.tick_params(length=1, colors='#111111')

    gs.figure.suptitle('Categorical data plot', x=0.47, y=0.925, fontsize=18, color='#111111')

    return gs
140
141
142
# Correlation Matrix
143
def corr_mat(data, split=None, threshold=0, target=None, method='pearson', colored=True):
    '''
    Returns a color-encoded correlation matrix.

    Parameters
    ----------

    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'above', 'below'}, default None
        Type of split to be performed.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label. A string is interpreted as the name of a column in data.

    method: {'pearson', 'spearman', 'kendall'}, default 'pearson'
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more expensive but \
                    more robust in smaller datasets than 'spearman'.

        The method is applied both to the full matrix and to the correlations with a target.

    colored: bool, default True
        If True the negative values in the correlation matrix are colored in red.

    Returns
    -------
    If colored = True - corr: Pandas Styler object
    If colored = False - corr: Pandas DataFrame
    '''

    # Validate Inputs
    _validate_input_range(threshold, 'threshold', -1, 1)
    _validate_input_bool(colored, 'colored')

    def color_negative_red(val):
        # Only negative values receive a style; an empty string leaves the cell unstyled
        return 'color: #FF3344' if val < 0 else ''

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        if isinstance(target, str):
            # Target is a column of the data: separate it from the features
            target_data = data[target]
            data = data.drop(target, axis=1)
        else:
            # Target is supplied as separate values; the resulting column is labelled with the Series name
            target_data = pd.Series(target)
            target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]

    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if not colored:
        return corr

    styler = corr.style
    # Newer pandas releases provide Styler.map as the replacement for the deprecated Styler.applymap
    if hasattr(styler, 'map'):
        styled = styler.map(color_negative_red)
    else:
        styled = styler.applymap(color_negative_red)

    return styled.format('{:.2f}', na_rep='-')
211
212
213
# Correlation matrix / heatmap
214
def corr_plot(data, split=None, threshold=0, target=None, method='pearson', cmap='BrBG', figsize=(12, 10), annot=True,
              dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'above', 'below'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * above: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
        * below: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    target: string, list, np.array or pd.Series, default None
        Specify target for correlation. E.g. label column to generate only the correlations between each feature \
        and the label.

    method: {'pearson', 'spearman', 'kendall'}, default 'pearson'
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more expensive but \
                   more robust in smaller datasets than 'spearman'.

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this \
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object with the plot for further tweaking.
    '''

    # Validate Inputs
    _validate_input_range(threshold, 'threshold', -1, 1)
    _validate_input_bool(annot, 'annot')
    _validate_input_bool(dev, 'dev')

    data = pd.DataFrame(data)

    corr = corr_mat(data, split=split, threshold=threshold, target=target, method=method, colored=False)

    # The builtin bool is used because the np.bool alias no longer exists in current NumPy releases
    mask = np.zeros_like(corr, dtype=bool)

    if target is None:
        # Full matrix: hide the diagonal and the redundant upper triangle
        mask = np.triu(np.ones_like(corr, dtype=bool))

    # Default color range derived from the visible coefficients, narrowed by 0.05 on both ends
    vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap; user supplied kwargs take precedence over these defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'ax': ax,
              **kwargs}

    # Draw heatmap with mask and default settings
    sns.heatmap(corr, center=0, fmt='.2f', **kwargs)

    ax.set_title(f'Feature-correlation ({method})', fontdict={'fontsize': 18})

    # Settings
    if dev:
        # vmax / vmin shown here are the computed defaults, not values overridden through kwargs
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - method: {method} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax
340
341
342
# Distribution plot
343
def dist_plot(data, mean_color='orange', figsize=(14, 2), fill_range=(0.025, 0.975), hist=False, bins=10,
              showall=False, kde_kws=None, rug_kws=None, fill_kws=None, font_kws=None):
    '''
    Two-dimensional visualization of the distribution of numerical features.

    One figure is created per numeric column. Missing values are dropped per column before plotting.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    mean_color: color, default 'orange'
        Color of the vertical line indicating the mean of the data.

    figsize: tuple, default (14, 2)
        Controls the figure size.

    fill_range: tuple, default (0.025, 0.975)
        Set the quantiles for shading. Default spans 95% of the data, which is about two std. deviations \
        above and below the mean.

    hist: bool, default False
        Set to True to display histogram bars in the plot.

    bins: integer, default 10
        Specification of the number of hist bins. Requires hist = True

    showall: bool, default False
        Set to True to remove the output limit of 20 plots.

    kde_kws: dict, default {'alpha': 0.7, 'linewidth': 1.5}
        Keyword arguments for kdeplot().

    rug_kws: dict, default {'color': 'brown', 'alpha': 0.5, 'linewidth': 2, 'height': 0.04}
        Keyword arguments for rugplot().

    fill_kws: dict, default {'color': 'brown', 'alpha': 0.1}
        Keyword arguments to control the fill.

    font_kws: dict, default {'color':  '#111111', 'weight': 'normal', 'size': 11}
        Keyword arguments to control the font.

    Returns
    -------
    ax: matplotlib Axes
        Returns the Axes object of the last plot for further tweaking, or None if no numeric columns were detected.
    '''

    # Imported explicitly: "import scipy" alone does not guarantee that the stats submodule is loaded
    from scipy import stats

    # Coerce before validating so that inputs without a "shape" attribute (e.g. nested lists) work as well
    data = pd.DataFrame(data).copy().dropna(axis=1, how='all')

    # Validate Inputs
    _validate_input_range(fill_range[0], 'fill_range_lower', 0, 1)
    _validate_input_range(fill_range[1], 'fill_range_upper', 0, 1)
    _validate_input_smaller(fill_range[0], fill_range[1], 'fill_range')
    _validate_input_bool(hist, 'hist')
    _validate_input_int(bins, 'bins')
    _validate_input_range(bins, 'bins', 0, data.shape[0])
    _validate_input_bool(showall, 'showall')

    # Handle dictionary defaults (copies are taken so that the caller's dictionaries are never modified)
    kde_kws = {'alpha': 0.7, 'linewidth': 1.5} if kde_kws is None else kde_kws.copy()
    rug_kws = {'color': 'brown', 'alpha': 0.5, 'linewidth': 2, 'height': 0.04} if rug_kws is None else rug_kws.copy()
    fill_kws = {'color': 'brown', 'alpha': 0.1} if fill_kws is None else fill_kws.copy()
    font_kws = {'color':  '#111111', 'weight': 'normal', 'size': 11} if font_kws is None else font_kws.copy()

    cols = list(data.select_dtypes(include=['number']).columns)
    data = data[cols]

    if len(cols) == 0:
        print('No columns with numeric data were detected.')

    elif len(cols) > 20 and showall is False:
        print(
            f'Note: The number of numerical features is very large ({len(cols)}), please consider splitting the data. '
            'Showing plots for the first 20 numerical features. Override this by setting showall=True.')
        cols = cols[:20]

    # Stays None if there is nothing to plot, so that the return statement below is always valid
    ax = None

    for col in cols:
        dropped_values = data[col].isna().sum()
        if dropped_values > 0:
            col_data = data[col].dropna(axis=0)
            print(f'Dropped {dropped_values} missing values from column {col}.')

        else:
            col_data = data[col]

        _, ax = plt.subplots(figsize=figsize)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider migrating to
        # kdeplot / histplot / rugplot.
        ax = sns.distplot(col_data, bins=bins, hist=hist, rug=True, kde_kws=kde_kws,
                          rug_kws=rug_kws, hist_kws={'alpha': 0.5, 'histtype': 'step'})

        # Vertical lines and fill
        # The first line on the axes is taken to be the KDE curve
        x, y = ax.lines[0].get_xydata().T
        ax.fill_between(x, y,
                        where=(
                            (x >= np.quantile(col_data, fill_range[0])) &
                            (x <= np.quantile(col_data, fill_range[1]))),
                        label=f'{fill_range[0]*100:.1f}% - {fill_range[1]*100:.1f}%',
                        **fill_kws)

        mean = np.mean(col_data)
        median = np.median(col_data)
        std = stats.tstd(col_data)
        # Each marker line ends where it meets the curve (height interpolated from the curve points)
        ax.vlines(x=mean,
                  ymin=0,
                  ymax=np.interp(mean, x, y),
                  ls='dotted', color=mean_color, lw=2, label='mean')
        ax.vlines(x=median,
                  ymin=0,
                  ymax=np.interp(median, x, y),
                  ls=':', color='.3', label='median')
        ax.vlines(x=[mean-std, mean+std],
                  ymin=0,
                  ymax=[np.interp(mean-std, x, y), np.interp(mean+std, x, y)], ls=':', color='.5',
                  label='\u03BC \u00B1 \u03C3')

        ax.set_ylim(0,)
        ax.set_xlim(ax.get_xlim()[0]*1.15, ax.get_xlim()[1]*1.15)

        # Annotations and legend
        ax.text(0.01, 0.85, f'Mean: {np.round(mean,2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.7, f'Std. dev: {np.round(std,2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.55, f'Skew: {np.round(stats.skew(col_data),2)}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.4, f'Kurtosis: {np.round(stats.kurtosis(col_data),2)}',  # Excess Kurtosis
                fontdict=font_kws, transform=ax.transAxes)
        ax.text(0.01, 0.25, f'Count: {np.round(len(col_data))}',
                fontdict=font_kws, transform=ax.transAxes)
        ax.legend(loc='upper right')

    return ax
472
473
474
# Missing value plot
475
def missingval_plot(data, cmap='PuBuGn', figsize=(12, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four parts: a bar chart with the share of missing values per column, a heatmap showing \
    the position of the missing values, a text summary and a scatter plot with the number of missing values per row.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib \
        documentation.

    figsize: tuple, default (12, 12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values.

    spine_color: color, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    gs: GridSpec of the created figure (the figure itself is available as gs.figure). None is returned if the \
    dataset contains no missing values.
    '''

    # Validate Inputs
    _validate_input_bool(sort, 'sort')

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        final_cols = mv_cols_sorted.drop(mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()).keys().tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values
    # NOTE(review): the unpacking relies on the order of the five entries returned by _missing_vals, which is
    # defined in .utils and not visible here - confirm when changing either side.
    mv_total, mv_rows, mv_cols, _, mv_cols_ratio = _missing_vals(data).values()
    total_datapoints = data.shape[0]*data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(nrows=6, ncols=6, left=0.05, wspace=0.05)
    ax1 = fig.add_subplot(gs[:1, :5])
    ax2 = fig.add_subplot(gs[1:, :5])
    ax3 = fig.add_subplot(gs[:1, 5:])
    ax4 = fig.add_subplot(gs[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        # The rotation is passed as a number: matplotlib only accepts numbers, 'horizontal' or 'vertical'
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    # Keep every fifth tick, rounded to the nearest ten, and label the ticks with their positions
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    # One marker per row; marker size and color both encode the number of missing values in that row
    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows)+0.5)
    ax4.grid(linestyle=':', linewidth=1)

    gs.figure.suptitle('Missing value plot', x=0.45, y=0.94, fontsize=18, color='#111111')

    return gs
607