GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 4c58c3...2f319e )
by Andreas
01:17
created

klib.describe.missingval_plot()   C

Complexity

Conditions 7

Size

Total Lines 114
Code Lines 71

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 71
dl 0
loc 114
rs 6.549
c 0
b 0
f 0
cc 7
nop 5

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
'''
2
Utilities for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import seaborn as sns
14
15
from matplotlib import cm
16
17
18
# Missing value plot
19
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a barplot with the share of missing values per column, a heatmap marking
    every missing cell, a textual summary and a scatter plot with the number of missing values per row.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns:
    -------
    ax: matplotlib Axes. Axes object with the heatmap. If the dataset contains no missing values nothing is drawn and None is returned.
    '''

    # Lists, ndarrays etc. do not offer .isna(); coerce them so the DataFrame API is available below
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values (the boolean mask is computed once and reused for the heatmap)
    mv_mask = data.isna()
    mv_cols = mv_mask.sum(axis=0)
    mv_rows = mv_mask.sum(axis=1)
    mv_total = mv_cols.sum()

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    _mv_plot_column_bars(ax1, mv_cols, data.shape[0], cmap, spine_color)
    _mv_plot_heatmap(ax2, mv_mask, spine_color)
    _mv_plot_summary(ax3, mv_cols, mv_rows, data.shape)
    _mv_plot_row_scatter(ax4, mv_rows, cmap, spine_color)

    return ax2


def _mv_plot_column_bars(ax, mv_cols, n_rows, cmap, spine_color):
    '''Draw the share of missing values per column as bars, annotated with the absolute counts.'''
    mv_cols_rel = mv_cols / n_rows

    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax.bar(range(len(mv_cols)), np.round(mv_cols_rel * 100, 2), color=colors)
    ax.get_xaxis().set_visible(False)
    ax.set(frame_on=False, xlim=(-.5, len(mv_cols) - 0.5))
    ax.set_ylim(0, np.max(mv_cols_rel) * 100)
    ax.grid(linestyle=':', linewidth=1)
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax.patches, mv_cols):
        height = rect.get_height()
        ax.text(.1 + rect.get_x() + rect.get_width() / 2, height + 0.5, label,
                ha='center',
                va='bottom',
                rotation=90,  # numeric angle; the string '90' is not a valid rotation value
                alpha=0.5,
                fontsize='small')

    ax.set_frame_on(True)
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax.spines['top'].set_color(None)


def _mv_plot_heatmap(ax, mv_mask, spine_color):
    '''Draw the boolean missing-value mask as a binary heatmap.'''
    sns.heatmap(mv_mask, cbar=False, cmap='binary', ax=ax)
    # keep every fifth tick, rounded to the nearest ten, and label the ticks with their positions
    ax.set_yticks(np.round(ax.get_yticks()[0::5], -1))
    ax.set_yticklabels(ax.get_yticks())
    ax.set_xticklabels(
        ax.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax.tick_params(length=1, colors='#111111')
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)


def _mv_plot_summary(ax, mv_cols, mv_rows, shape):
    '''Write the summary figures (totals and maximum shares per column/row) as text into the axes.'''
    n_rows, n_cols = shape
    total_datapoints = n_rows * n_cols
    mv_total = mv_cols.sum()

    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.set(frame_on=False)

    ax.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K", transform=ax.transAxes, fontdict=fontax3)
    ax.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K", transform=ax.transAxes, fontdict=fontax3)
    ax.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%", transform=ax.transAxes, fontdict=fontax3)
    ax.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/n_rows*100)}%", transform=ax.transAxes, fontdict=fontax3)
    ax.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/n_cols*100)}%", transform=ax.transAxes, fontdict=fontax3)


def _mv_plot_row_scatter(ax, mv_rows, cmap, spine_color):
    '''Draw the number of missing values per row as a scatter plot with the first row at the top.'''
    ax.get_yaxis().set_visible(False)
    for spine in ax.spines.values():
        spine.set_color(spine_color)
    ax.tick_params(axis='x', colors='#111111', length=1)

    ax.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".")
    ax.set_ylim(0, len(mv_rows))
    ax.set_ylim(ax.get_ylim()[::-1])  # invert y-axis
    ax.grid(linestyle=':', linewidth=1)
133
134
135
# Correlation matrix / heatmap
136
def corr_plot(data, split=None, threshold=0, cmap=sns.color_palette("BrBG", 250), figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information will be used to label the columns and rows.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns with corr >= threshold.
        * neg: visualize all negative correlations between the feature-columns with corr <= threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) >= threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) <= threshold is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    cmap: matplotlib colormap name or object, or list of colors, default sns.color_palette("BrBG", 250)
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * ax: matplotlib Axes, default is the Axes of the figure created by this function.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns:
    -------
    ax: matplotlib Axes. Axes object with the heatmap.
    '''

    # Lists, ndarrays etc. do not offer .corr(); coerce them so the DataFrame API is available below
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    # Compute the correlation matrix once; every split mode only filters this matrix
    corr = data.corr()

    if split == 'pos':
        corr = corr.where((corr >= threshold) & (corr > 0))
        threshold = '-'
    elif split == 'neg':
        corr = corr.where((corr <= threshold) & (corr < 0))
        threshold = '-'
    elif split == 'high':
        corr = corr.where(np.abs(corr) >= threshold)
    elif split == 'low':
        corr = corr.where(np.abs(corr) <= threshold)
    else:
        split = "full"
        threshold = 'None'

    # Generate mask for the upper triangle (builtin bool: the np.bool alias was removed from NumPy)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute correlation range of the visible lower triangle to adjust the colorbar limits
    lower_triangle = corr.where(~mask).values
    vmax = np.round(np.nanmax(lower_triangle) - 0.05, 2)
    vmin = np.round(np.nanmin(lower_triangle) + 0.05, 2)

    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=figsize)

    # kwargs for the heatmap; user-supplied kwargs take precedence over these defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'ax': ax,
              **kwargs}

    # Draw heatmap with mask and some default settings; heatmap returns the Axes it has drawn on
    ax = sns.heatmap(corr,
                     center=0,
                     square=True,
                     fmt='.2f',
                     **kwargs
                     )

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    if dev:  # show settings
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.8,
                     ha='left')

    return ax
262
263
264
# TODO - summary statistics
265
# TODO - visualize distributions
266
    # numerical
267
    # categorical
268
# TODO - export charts and summary statistics?
269
270
# FIXME something
271
# FIX something else
272
273
# BUG none known
274