GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 732d38...4c58c3 )
by Andreas
01:10
created

klib.describe.corr_plot()   B

Complexity

Conditions 7

Size

Total Lines 124
Code Lines 43

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 43
dl 0
loc 124
rs 7.448
c 0
b 0
f 0
cc 7
nop 7

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
'''
2
Utilities for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import seaborn as sns
14
15
from matplotlib import cm
16
17
18
# Missing value plot
19
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar chart with the share of missing
    values per column (top left), a heatmap marking every missing cell
    (bottom left), a text summary (top right) and a scatter plot with the
    number of missing values per row (bottom right).

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns:
    -------
    ax: matplotlib Axes. Axes object with the heatmap. If the dataset contains
        no missing values, a message is printed and None is returned.
    '''

    # Coerce array-likes so that the DataFrame API used below is available
    data = pd.DataFrame(data)

    # Identify missing values (the boolean mask is computed only once)
    missing = data.isna()
    mv_cols = missing.sum(axis=0)
    mv_rows = missing.sum(axis=1)
    mv_total = mv_cols.sum()
    mv_cols_rel = mv_cols / data.shape[0]
    total_datapoints = data.shape[0] * data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round(mv_cols_rel * 100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols) - 0.5))
    ax1.set_ylim(0, np.max(mv_cols_rel) * 100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate the absolute number of missing values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height + 0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric: the string '90' is rejected by current matplotlib
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(missing, cbar=False, cmap='binary', ax=ax2)
    # keep every fifth tick, rounded to the nearest ten, and label it with its position
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for spine in ax2.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%", transform=ax3.transAxes, fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for spine in ax4.spines.values():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    # marker size and color both encode the number of missing values per row
    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".")
    ax4.set_ylim(0, len(mv_rows))
    ax4.set_ylim(ax4.get_ylim()[::-1])  # invert y-axis so that rows line up with the heatmap
    ax4.grid(linestyle=':', linewidth=1)

    return ax2
124
125
126
# Correlation matrix / heatmap
127
def corr_plot(data, split=None, threshold=0, cmap=sns.color_palette("BrBG", 250), figsize=(12, 10), dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information will be used to label the columns and rows.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) >= threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) <= threshold is True.

        Any other value is treated like None.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    cmap: matplotlib colormap name or object, or list of colors, default sns.color_palette("BrBG", 250)
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot. They are passed on to seaborn.heatmap and override the defaults listed here, e.g.:

        * mask: default is a mask hiding the upper triangle (including the diagonal).
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this case to avoid overlap.
        * annot: bool, default True for 20 or less columns, False for more than 20 feature-columns.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * center: float, default 0
        * square: bool, default True
        * fmt: str, default '.2f'
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns:
    -------
    ax: matplotlib Axes. Axes object with the heatmap.
    '''

    # Coerce array-likes so that .corr() is available, and compute the
    # correlation matrix only once instead of once per comparison
    data = pd.DataFrame(data)
    corr_full = data.corr()

    # Labels shown in dev-mode; kept separate from the actual parameters
    split_label = split
    threshold_label = threshold

    if split == 'pos':
        corr = corr_full.where((corr_full >= threshold) & (corr_full > 0))
        threshold_label = '-'
    elif split == 'neg':
        corr = corr_full.where((corr_full <= threshold) & (corr_full < 0))
        threshold_label = '-'
    elif split == 'high':
        corr = corr_full.where(np.abs(corr_full) >= threshold)
    elif split == 'low':
        corr = corr_full.where(np.abs(corr_full) <= threshold)
    else:
        corr = corr_full
        split_label = "full"
        threshold_label = 'None'

    # Generate mask for the upper triangle
    # (builtin bool: the np.bool alias was removed from NumPy)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute dimensions and correlation range to adjust settings
    annot = np.max(corr.shape) < 21
    visible = corr.where(~mask).values  # strictly-lower triangle only
    if np.isnan(visible).all():
        # nothing left to derive limits from; let seaborn choose them
        vmax = vmin = None
    else:
        vmax = np.round(np.nanmax(visible) - 0.05, 2)
        vmin = np.round(np.nanmin(visible) + 0.05, 2)

    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=figsize)

    # kwargs for the heatmap: defaults first, user-supplied values override
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'center': 0,
              'square': True,
              'fmt': '.2f',
              'ax': ax,
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr, **kwargs)

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    if dev:  # show settings
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split_label} \n\
            - threshold: {threshold_label} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.8,
                     ha='left')

    return ax
251
252
253
# TODO - summary statistics
254
# TODO - visualize distributions
255
    # numerical
256
    # categorical
257
# TODO - export charts and summary statistics?
258
259
# FIXME something
260
# FIX something else
261
262
# BUG none known
263