GitHub Access Token became invalid

It seems that the GitHub access token used for retrieving details about this repository from GitHub has become invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 92e4a4...505056 )
by Andreas
01:12
created

klib.describe.corr_plot()   A

Complexity

Conditions 2

Size

Total Lines 115
Code Lines 30

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 30
dl 0
loc 115
rs 9.16
c 0
b 0
f 0
cc 2
nop 8

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
'''
2
Utilities for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import seaborn as sns
14
15
16
# _functions
17
18
def _memory_usage(data):
    '''
    Gives the total memory usage in kilobytes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    memory_usage: float
        Memory footprint of the DataFrame (index included) in kilobytes, rounded to two decimals.
    '''

    data = pd.DataFrame(data)
    # index=True and deep=True are passed so that the index and the contents of object columns are counted;
    # the summed byte count is divided by 1024 to obtain kilobytes.
    memory_usage = round(data.memory_usage(index=True, deep=True).sum()/1024, 2)

    return memory_usage
37
38
39
def _missing_vals(data):
    '''
    Gives metrics of missing values in the dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    dict with the following keys:

    mv_total: int, number of missing values in the entire dataset
    mv_rows: pd.Series, number of missing values in each row
    mv_cols: pd.Series, number of missing values in each column
    mv_rows_ratio: pd.Series, ratio of missing values for each row
    mv_cols_ratio: pd.Series, ratio of missing values for each column
    '''

    data = pd.DataFrame(data)

    # Build the boolean NA-mask once and derive every metric from it.
    na_mask = data.isna()
    mv_rows = na_mask.sum(axis=1)
    mv_cols = na_mask.sum(axis=0)
    mv_total = mv_cols.sum()

    # Rows are normalised by the number of columns, columns by the number of rows.
    mv_rows_ratio = mv_rows/data.shape[1]
    mv_cols_ratio = mv_cols/data.shape[0]

    return {'mv_total': mv_total,
            'mv_rows': mv_rows,
            'mv_cols': mv_cols,
            'mv_rows_ratio': mv_rows_ratio,
            'mv_cols_ratio': mv_cols_ratio}
69
70
71
# Functions
72
73
# Correlation Matrix
74
def corr_mat(data, split=None, threshold=0):
75
    def color_negative_red(val):
76
        color = '#FF3344' if val < 0 else None
77
        return 'color: %s' % color
78
79
    data = pd.DataFrame(data)
80
81
    if split == 'pos':
82
        corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
83
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
84
    elif split == 'neg':
85
        corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
86
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
87
    elif split == 'high':
88
        corr = data.corr().where(np.abs(data.corr()) >= threshold)
89
        print('Displaying absolute correlations above a chosen threshold.')
90
    elif split == 'low':
91
        corr = data.corr().where(np.abs(data.corr()) <= threshold)
92
        print('Displaying absolute correlations below a chosen threshold.')
93
    else:
94
        corr = data.corr()
95
        split = 'None'
96
        threshold = 'None'
97
98
    return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep='-')
99
100
101
# Missing value plot
102
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib \
        documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    The matplotlib GridSpec on which the four axes (bar plot, heatmap, summary, scatter plot) are laid out. If the \
    dataset contains no missing values, a message is printed, nothing is drawn and None is returned.
    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # keep only the columns that actually contain missing values, most affected first
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values (computed once, all metrics are read from the same result)
    mv = _missing_vals(data)
    mv_cols = mv['mv_cols']
    mv_rows = mv['mv_rows']
    mv_total = mv['mv_total']
    mv_cols_ratio = mv['mv_cols_ratio']
    total_datapoints = data.shape[0]*data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
    else:
        # Create figure and axes
        fig = plt.figure(figsize=figsize)
        grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
        ax1 = fig.add_subplot(grid[:1, :5])
        ax2 = fig.add_subplot(grid[1:, :5])
        ax3 = fig.add_subplot(grid[:1, 5:])
        ax4 = fig.add_subplot(grid[1:, 5:])

        # ax1 - Barplot
        colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
        ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
        ax1.get_xaxis().set_visible(False)
        ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
        ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
        ax1.grid(linestyle=':', linewidth=1)
        ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
        ax1.tick_params(axis='y', colors='#111111', length=1)

        # annotate values on top of the bars
        for rect, label in zip(ax1.patches, mv_cols):
            height = rect.get_height()
            ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                     ha='center',
                     va='bottom',
                     rotation=90,  # numeric angle; the string '90' is not accepted by newer matplotlib versions
                     alpha=0.5,
                     fontsize='small')

        ax1.set_frame_on(True)
        for _, spine in ax1.spines.items():
            spine.set_visible(True)
            spine.set_color(spine_color)
        ax1.spines['top'].set_color(None)

        # ax2 - Heatmap
        sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
        # keep every fifth tick and round it to the nearest ten
        ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
        ax2.set_yticklabels(ax2.get_yticks())
        ax2.set_xticklabels(
            ax2.get_xticklabels(),
            horizontalalignment='center',
            fontweight='light',
            fontsize='medium')
        ax2.tick_params(length=1, colors='#111111')
        for _, spine in ax2.spines.items():
            spine.set_visible(True)
            spine.set_color(spine_color)

        # ax3 - Summary
        fontax3 = {'color':  '#111111',
                   'weight': 'normal',
                   'size': 12,
                   }
        ax3.get_xaxis().set_visible(False)
        ax3.get_yaxis().set_visible(False)
        ax3.set(frame_on=False)

        ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
                 transform=ax3.transAxes,
                 fontdict=fontax3)
        ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
                 transform=ax3.transAxes,
                 fontdict=fontax3)
        ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
                 transform=ax3.transAxes,
                 fontdict=fontax3)
        ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
                 transform=ax3.transAxes,
                 fontdict=fontax3)
        ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
                 transform=ax3.transAxes,
                 fontdict=fontax3)

        # ax4 - Scatter plot
        ax4.get_yaxis().set_visible(False)
        for _, spine in ax4.spines.items():
            spine.set_color(spine_color)
        ax4.tick_params(axis='x', colors='#111111', length=1)

        ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
        ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
        ax4.set_xlim(0, max(mv_rows)+0.5)
        ax4.grid(linestyle=':', linewidth=1)

        ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
        return grid
233
234
235
# Correlation matrix / heatmap
236
def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for \
        presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this \
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * center, square, fmt: default 0, True and '.2f'; may be overridden like any other heatmap argument.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: the matplotlib Axes created for the plot (the title is set on it).
    '''

    data = pd.DataFrame(data)

    # Obtain correlation matrix (corr_mat returns a Styler, .data is the underlying DataFrame)
    corr = corr_mat(data, split=split, threshold=threshold).data

    # Generate mask for the upper triangle
    # (the builtin bool is used because the np.bool alias was removed in NumPy 1.24)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute dimensions and correlation range to adjust settings
    vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    # Set up the matplotlib figure and generate colormap
    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap; user-supplied kwargs come last and therefore override every default
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'center': 0,
              'square': True,
              'fmt': '.2f',
              'ax': ax,
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr, **kwargs)

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    # Display settings
    if dev:
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax
351