klib.describe - Code Metrics - Inspection of "Move auxiliary functions to utils.py" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 0be18a...fe1083 )

by Andreas

created 2020-04-11 09:56 UTC

klib.describe A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	298
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	143
dl	0
loc	298
rs	10
c	0
b	0
f	0
wmc	15

3 Functions

Rating	Name	Size	Complexity
A	corr_plot()	115	2
C	missingval_plot()	131	7
B	corr_mat()	25	6

'''
Functions for descriptive analytics.

:author: Andreas Kanz

'''

# Imports
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns

from .utils import _missing_vals


# Functions

# Correlation Matrix
def corr_mat(data, split=None, threshold=0):
    def color_negative_red(val):
        color = '#FF3344' if val < 0 else None
        return 'color: %s' % color

    data = pd.DataFrame(data)

    if split == 'pos':
        corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == 'neg':
        corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == 'high':
        corr = data.corr().where(np.abs(data.corr()) >= threshold)
        print('Displaying absolute correlations above a chosen threshold.')
    elif split == 'low':
        corr = data.corr().where(np.abs(data.corr()) <= threshold)
        print('Displaying absolute correlations below a chosen threshold.')
    else:
        corr = data.corr()
        split = 'None'
        threshold = 'None'

    return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep='-')


# Missing value plot
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar plot with the share of missing values per column, a heatmap marking \
    every missing cell, a text summary and a scatter plot with the number of missing values per row.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib \
        documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    The matplotlib GridSpec that lays out the four panels, or None (after printing a notice) if the dataset does not \
    contain any missing values.
    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # Keep only the columns that actually contain missing values, most affected first.
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values. The helper is called once and its result reused for every key.
    mv = _missing_vals(data)
    mv_cols = mv['mv_cols']  # data.isna().sum(axis=0)
    mv_rows = mv['mv_rows']  # data.isna().sum(axis=1)
    mv_total = mv['mv_total']
    mv_cols_ratio = mv['mv_cols_ratio']  # mv_cols / data.shape[0]
    total_datapoints = data.shape[0]*data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # must be numeric; the string '90' is rejected by current matplotlib
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    # Thin out the y-ticks (every 5th tick, rounded to the nearest ten) and label them with their positions.
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    # Marker size and color both encode the number of missing values in the row.
    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows)+0.5)
    ax4.grid(linestyle=':', linewidth=1)

    ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
    return grid


# Correlation matrix / heatmap
def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.

    threshold: float, default 0
        Value between 0 <= threshold <= 1

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for \
        presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this \
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    The matplotlib Axes created for the heatmap.
    '''

    data = pd.DataFrame(data)

    # Obtain correlation matrix (corr_mat returns a Styler; '.data' is the underlying DataFrame)
    corr = corr_mat(data, split=split, threshold=threshold).data

    # Generate mask for the upper triangle (including the diagonal).
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute the correlation range of the visible (lower-triangle) cells to adjust the colorbar limits
    vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    # Set up the matplotlib figure; the heatmap below is drawn on the current axes, i.e. the one created here
    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap; user supplied kwargs take precedence over the defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr,
                center=0,
                square=True,
                fmt='.2f',
                **kwargs
                )

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    # Display settings
    if dev:
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax


1			'''
2			Functions for descriptive analytics.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import matplotlib.pyplot as plt
10			import matplotlib.ticker as ticker
11			import numpy as np
12			import pandas as pd
13			import seaborn as sns
14
15			from .utils import _missing_vals
16
17
18			# Functions
19
20			# Correlation Matrix
21			def corr_mat(data, split=None, threshold=0):
22			def color_negative_red(val):
23			color = '#FF3344' if val < 0 else None
24			return 'color: %s' % color
25
26			data = pd.DataFrame(data)
27
28			if split == 'pos':
29			corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
30			print('Displaying positive correlations. Use "threshold" to further limit the results.')
31			elif split == 'neg':
32			corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
33			print('Displaying negative correlations. Use "threshold" to further limit the results.')
34			elif split == 'high':
35			corr = data.corr().where(np.abs(data.corr()) >= threshold)
36			print('Displaying absolute correlations above a chosen threshold.')
37			elif split == 'low':
38			corr = data.corr().where(np.abs(data.corr()) <= threshold)
39			print('Displaying absolute correlations below a chosen threshold.')
40			else:
41			corr = data.corr()
42			split = 'None'
43			threshold = 'None'
44
45			return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep='-')
46
47
48			# Missing value plot
49			def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
50			'''
51			Two-dimensional visualization of the missing values in a dataset.
52
53			Parameters
54			----------
55			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
56			information is used to label the plots.
57
58			cmap: colormap, default 'PuBuGn'
59			Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib \
60			documentation.
61
62			figsize: tuple, default (20,12)
63			Use to control the figure size.
64
65			sort: bool, default False
66			Sort columns based on missing values in descending order and drop columns without any missing values
67
68			spine_color: color-code, default '#EEEEEE'
69			Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.
70
71			Returns
72			-------
73			figure
74			'''
75
76			data = pd.DataFrame(data)
77
78			if sort:
79			mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
80			final_cols = mv_cols_sorted.drop(mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()).keys().tolist()
81			data = data[final_cols]
82			print('Displaying only columns with missing values.')
83
84			# Identify missing values
85			mv_cols = _missing_vals(data)['mv_cols'] # data.isna().sum(axis=0)
86			mv_rows = _missing_vals(data)['mv_rows'] # data.isna().sum(axis=1)
87			mv_total = _missing_vals(data)['mv_total']
88			mv_cols_ratio = _missing_vals(data)['mv_cols_ratio'] # mv_cols / data.shape[0]
89			total_datapoints = data.shape[0]*data.shape[1]
90
91			if mv_total == 0:
92			print('No missing values found in the dataset.')
93			else:
94			# Create figure and axes
95			fig = plt.figure(figsize=figsize)
96			grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
97			ax1 = fig.add_subplot(grid[:1, :5])
98			ax2 = fig.add_subplot(grid[1:, :5])
99			ax3 = fig.add_subplot(grid[:1, 5:])
100			ax4 = fig.add_subplot(grid[1:, 5:])
101
102			# ax1 - Barplot
103			colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols)) # color bars by height
104			ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
105			ax1.get_xaxis().set_visible(False)
106			ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
107			ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
108			ax1.grid(linestyle=':', linewidth=1)
109			ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
110			ax1.tick_params(axis='y', colors='#111111', length=1)
111
112			# annotate values on top of the bars
113			for rect, label in zip(ax1.patches, mv_cols):
114			height = rect.get_height()
115			ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
116			ha='center',
117			va='bottom',
118			rotation='90',
119			alpha=0.5,
120			fontsize='small')
121
122			ax1.set_frame_on(True)
123			for _, spine in ax1.spines.items():
124			spine.set_visible(True)
125			spine.set_color(spine_color)
126			ax1.spines['top'].set_color(None)
127
128			# ax2 - Heatmap
129			sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
130			ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
131			ax2.set_yticklabels(ax2.get_yticks())
132			ax2.set_xticklabels(
133			ax2.get_xticklabels(),
134			horizontalalignment='center',
135			fontweight='light',
136			fontsize='medium')
137			ax2.tick_params(length=1, colors='#111111')
138			for _, spine in ax2.spines.items():
139			spine.set_visible(True)
140			spine.set_color(spine_color)
141
142			# ax3 - Summary
143			fontax3 = {'color': '#111111',
144			'weight': 'normal',
145			'size': 12,
146			}
147			ax3.get_xaxis().set_visible(False)
148			ax3.get_yaxis().set_visible(False)
149			ax3.set(frame_on=False)
150
151			ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
152			transform=ax3.transAxes,
153			fontdict=fontax3)
154			ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
155			transform=ax3.transAxes,
156			fontdict=fontax3)
157			ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
158			transform=ax3.transAxes,
159			fontdict=fontax3)
160			ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
161			transform=ax3.transAxes,
162			fontdict=fontax3)
163			ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
164			transform=ax3.transAxes,
165			fontdict=fontax3)
166
167			# ax4 - Scatter plot
168			ax4.get_yaxis().set_visible(False)
169			for _, spine in ax4.spines.items():
170			spine.set_color(spine_color)
171			ax4.tick_params(axis='x', colors='#111111', length=1)
172
173			ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
174			ax4.set_ylim((0, len(mv_rows))[::-1]) # limit and invert y-axis
175			ax4.set_xlim(0, max(mv_rows)+0.5)
176			ax4.grid(linestyle=':', linewidth=1)
177
178			ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
179			return grid
180
181
182			# Correlation matrix / heatmap
183			def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
184			'''
185			Two-dimensional visualization of the correlation between feature-columns, excluding NA values.
186
187			Parameters
188			----------
189			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
190			information is used to label the plots.
191
192			split: {None, 'pos', 'neg', 'high', 'low'}, default None
193			Type of split to be performed.
194
195			* None: visualize all correlations between the feature-columns.
196			* pos: visualize all positive correlations between the feature-columns above the threshold.
197			* neg: visualize all negative correlations between the feature-columns below the threshold.
198			* high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
199			* low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.
200
201			threshold: float, default 0
202			Value between 0 <= threshold <= 1
203
204			cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
205			The mapping from data values to color space.
206
207			figsize: tuple, default (12, 10)
208			Use to control the figure size.
209
210			annot: bool, default True
211			Use to show or hide annotations.
212
213			dev: bool, default False
214			Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for \
215			presentations.
216
217			**kwargs: optional
218			Additional elements to control the visualization of the plot, e.g.:
219
220			* mask: bool, default True
221			If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this \
222			case to avoid overlap.
223			* vmax: float, default is calculated from the given correlation coefficients.
224			Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
225			* vmin: float, default is calculated from the given correlation coefficients.
226			Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
227			* linewidths: float, default 0.5
228			Controls the line-width inbetween the squares.
229			* annot_kws: dict, default {'size' : 10}
230			Controls the font size of the annotations. Only available when annot = True.
231			* cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
232			Controls the size of the colorbar.
233			* Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...
234
235			Kwargs can be supplied through a dictionary of key-value pairs (see above).
236
237			Returns
238			-------
239			figure
240			'''
241
242			data = pd.DataFrame(data)
243
244			# Obtain correlation matrix
245			corr = corr_mat(data, split=split, threshold=threshold).data
246
247			# Generate mask for the upper triangle
248			mask = np.triu(np.ones_like(corr, dtype=np.bool))
249
250			# Compute dimensions and correlation range to adjust settings
251			vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
252			vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)
253
254			# Set up the matplotlib figure and generate colormap
255			fig, ax = plt.subplots(figsize=figsize)
256
257			# Specify kwargs for the heatmap
258			kwargs = {'mask': mask,
259			'cmap': cmap,
260			'annot': annot,
261			'vmax': vmax,
262			'vmin': vmin,
263			'linewidths': .5,
264			'annot_kws': {'size': 10},
265			'cbar_kws': {'shrink': .95, 'aspect': 30},
266			**kwargs}
267
268			# Draw heatmap with mask and some default settings
269			sns.heatmap(corr,
270			center=0,
271			square=True,
272			fmt='.2f',
273			**kwargs
274			)
275
276			ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})
277
278			# Display settings
279			if dev:
280			fig.suptitle(f"\
281			Settings (dev-mode): \n\
282			- split-mode: {split} \n\
283			- threshold: {threshold} \n\
284			- annotations: {annot} \n\
285			- cbar: \n\
286			- vmax: {vmax} \n\
287			- vmin: {vmin} \n\
288			- linewidths: {kwargs['linewidths']} \n\
289			- annot_kws: {kwargs['annot_kws']} \n\
290			- cbar_kws: {kwargs['cbar_kws']}",
291			fontsize=12,
292			color='gray',
293			x=0.35,
294			y=0.85,
295			ha='left')
296
297			return ax
298

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 0be18a...fe1083 )

klib.describe A

Complexity

Size/Duplication

Importance

3 Functions

Duplication Side-by-Side

Filter issues like