GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( c0ff44...3c81b0 )
by Andreas
01:16
created

klib.describe.missingval_plot()   C

Complexity

Conditions 7

Size

Total Lines 126
Code Lines 82

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 82
dl 0
loc 126
rs 6.189
c 0
b 0
f 0
cc 7
nop 5

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
'''
2
Utilities for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import seaborn as sns
14
15
16
# Missing value plot
17
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    Four panels are drawn: a bar chart with the share of missing values per column, a heatmap marking \
every missing cell, a text summary and a scatter plot with the number of missing values per row.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, \
the index/column information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the \
matplotlib documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing \
values.

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    ax: matplotlib Axes. Axes object with the heatmap. If the dataset contains no missing values, a \
message is printed, no figure is created and None is returned.
    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # keep only the columns that actually contain missing values, most affected first
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values (the boolean mask is reused for the heatmap below)
    missing = data.isna()
    mv_cols = missing.sum(axis=0)
    mv_rows = missing.sum(axis=1)
    mv_total = mv_cols.sum()

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # mv_total > 0 guarantees at least one row and one column, so the divisions below are safe
    mv_cols_rel = mv_cols / data.shape[0]
    total_datapoints = data.shape[0]*data.shape[1]

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round(mv_cols_rel*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_rel)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric angle; the string '90' is rejected by newer matplotlib
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(missing, cbar=False, cmap='binary', ax=ax2)
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for spine in ax2.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    # (vertical position in axes coordinates, text) for every summary line
    summary = [
        (0.9, f"Total: {np.round(total_datapoints/1000,1)}K"),
        (0.7, f"Missing: {np.round(mv_total/1000,1)}K"),
        (0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%"),
        (0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%"),
        (0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%"),
    ]
    for y_pos, text in summary:
        ax3.text(0.1, y_pos, text,
                 transform=ax3.transAxes,
                 fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for spine in ax4.spines.values():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".")
    ax4.set_ylim(0, len(mv_rows))
    ax4.set_ylim(ax4.get_ylim()[::-1])  # invert y-axis so row 0 is at the top, matching the heatmap
    ax4.grid(linestyle=':', linewidth=1)

    return ax2
143
144
145
# Correlation matrix / heatmap
146
def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, \
the index/column information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed. Any other value is treated like None.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) >= threshold \
is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) <= threshold \
is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not \
displayed. Use for presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set \
dev = False in this case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width in between the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust \
labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: matplotlib Axes. Axes object with the heatmap.
    '''

    data = pd.DataFrame(data)

    # Compute the correlation matrix once; every split below only filters it
    corr = data.corr()

    if split == 'pos':
        corr = corr.where((corr >= threshold) & (corr > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == 'neg':
        corr = corr.where((corr <= threshold) & (corr < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == 'high':
        corr = corr.where(np.abs(corr) >= threshold)
        print('Displaying absolute correlations above a chosen threshold.')
    elif split == 'low':
        corr = corr.where(np.abs(corr) <= threshold)
        print('Displaying absolute correlations below a chosen threshold.')
    else:
        # no filtering; the strings are only used for the dev-mode settings text
        split = 'None'
        threshold = 'None'

    # Generate mask for the upper triangle (builtin bool: the np.bool alias was removed from NumPy)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute correlation range from the visible (lower-triangle) cells to adjust settings
    visible = corr.where(~mask)
    vmax = np.round(np.nanmax(visible)-0.05, 2)
    vmin = np.round(np.nanmin(visible)+0.05, 2)

    # Set up the matplotlib figure and generate colormap
    fig, ax = plt.subplots(figsize=figsize)

    # kwargs for the heatmap; user-supplied kwargs override these defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr,
                center=0,
                square=True,
                fmt='.2f',
                **kwargs
                )

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    if dev:  # show settings
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax
269
270
271
# _functions
272
273
def _memory_usage(data):
    '''
    Gives the total memory usage in kilobytes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    memory_usage: float
        Memory used by the data including its index, in kilobytes (bytes / 1024), rounded to two \
decimals. Object columns are measured with deep=True.
    '''

    data = pd.DataFrame(data)
    total_bytes = data.memory_usage(index=True, deep=True).sum()

    # convert to a plain Python float so callers do not receive a NumPy scalar
    memory_usage = round(float(total_bytes)/1024, 2)

    return memory_usage
291
292
293
def _missing_vals(data):
    '''
    Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    total_mv: int, number of missing values in the entire dataset
    rows_mv: Series, number of missing values in each row
    cols_mv: Series, number of missing values in each column
    rows_mv_ratio: Series, ratio of missing values for each row
    cols_mv_ratio: Series, ratio of missing values for each column
    '''

    data = pd.DataFrame(data)
    missing = data.isna()

    # axis=1 aggregates across the columns (one value per row),
    # axis=0 aggregates across the rows (one value per column)
    rows_mv = missing.sum(axis=1)
    cols_mv = missing.sum(axis=0)
    total_mv = cols_mv.sum()

    # a row holds data.shape[1] cells, a column holds data.shape[0] cells
    rows_mv_ratio = rows_mv/data.shape[1]
    cols_mv_ratio = cols_mv/data.shape[0]

    return total_mv, rows_mv, cols_mv, rows_mv_ratio, cols_mv_ratio
318