GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 0be18a...fe1083 )
by Andreas
01:16
created

klib.describe   A

Complexity

Total Complexity 15

Size/Duplication

Total Lines 298
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 143
dl 0
loc 298
rs 10
c 0
b 0
f 0
wmc 15

3 Functions

Rating   Name   Duplication   Size   Complexity  
A corr_plot() 0 115 2
C missingval_plot() 0 131 7
B corr_mat() 0 25 6
1
'''
2
Functions for descriptive analytics.
3
4
:author: Andreas Kanz
5
6
'''
7
8
# Imports
9
import matplotlib.pyplot as plt
10
import matplotlib.ticker as ticker
11
import numpy as np
12
import pandas as pd
13
import seaborn as sns
14
15
from .utils import _missing_vals
16
17
18
# Functions
19
20
# Correlation Matrix
21
def corr_mat(data, split=None, threshold=0):
22
    def color_negative_red(val):
23
        color = '#FF3344' if val < 0 else None
24
        return 'color: %s' % color
25
26
    data = pd.DataFrame(data)
27
28
    if split == 'pos':
29
        corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
30
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
31
    elif split == 'neg':
32
        corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
33
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
34
    elif split == 'high':
35
        corr = data.corr().where(np.abs(data.corr()) >= threshold)
36
        print('Displaying absolute correlations above a chosen threshold.')
37
    elif split == 'low':
38
        corr = data.corr().where(np.abs(data.corr()) <= threshold)
39
        print('Displaying absolute correlations below a chosen threshold.')
40
    else:
41
        corr = data.corr()
42
        split = 'None'
43
        threshold = 'None'
44
45
    return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep='-')
46
47
48
# Missing value plot
49
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar plot with the share of missing values per column (top left), a
    heatmap marking every missing cell (bottom left), a text summary (top right) and a scatter plot of the
    number of missing values per row (bottom right).

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib
        documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    matplotlib GridSpec
        The grid holding the four axes. If the dataset contains no missing values, only a message is printed and
        None is returned.
    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # Keep only the columns that actually contain missing values, most affected first.
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values; call the helper once and reuse its result.
    mv_summary = _missing_vals(data)
    mv_cols = mv_summary['mv_cols']  # presumably data.isna().sum(axis=0) - confirm in utils
    mv_rows = mv_summary['mv_rows']  # presumably data.isna().sum(axis=1) - confirm in utils
    mv_total = mv_summary['mv_total']
    mv_cols_ratio = mv_summary['mv_cols_ratio']  # presumably mv_cols / data.shape[0] - confirm in utils

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    total_datapoints = data.shape[0]*data.shape[1]

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric angle; the string '90' is not a valid rotation value
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    # Thin out the row ticks (every 5th one, rounded to tens) and label them with their position.
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    # Marker size and color both encode the number of missing values in the row.
    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows)+0.5)
    ax4.grid(linestyle=':', linewidth=1)

    ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
    return grid
180
181
182
# Correlation matrix / heatmap
183
def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for
        presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    matplotlib Axes
        The axes created for the heatmap.
    '''

    data = pd.DataFrame(data)

    # Obtain correlation matrix
    corr = corr_mat(data, split=split, threshold=threshold).data

    # Generate mask for the upper triangle (builtin bool: the np.bool alias no longer exists in current NumPy)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute the correlation range of the lower triangle to adjust the colorbar limits
    vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    # Set up the matplotlib figure and generate colormap
    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap; user-supplied kwargs override these defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'ax': ax,  # draw on the axes created above instead of relying on the implicit current axes
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr,
                center=0,
                square=True,
                fmt='.2f',
                **kwargs
                )

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    # Display settings
    if dev:
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax
298