klib.describe.missingval_plot() - Code Metrics - Inspection of "add sort to missingval_plot and annotation option..." - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 4c58c3...2f319e )

by Andreas

created 2020-03-30 12:37 UTC

klib.describe.missingval_plot() C

↳ Parent: klib.describe

Complexity

Conditions

Size

Total Lines	114
Code Lines	71

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	71
dl	0
loc	114
rs	6.549
c	0
b	0
f	0
cc	7
nop	5

How to fix Long Method

'''
Utilities for descriptive analytics.

:author: Andreas Kanz

'''

# Imports
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import cm


# Missing value plot
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar plot with the share of missing values per column,
    a heatmap marking every missing cell, a short text summary and a scatter plot with the number
    of missing values per row.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values.

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns:
    -------
    ax: matplotlib Axes or None
        Axes object with the heatmap. If the dataset contains no missing values, a message is printed,
        no figure is created and None is returned.
    '''

    # Coerce lists / ndarrays so that the DataFrame API used below is always available
    data = pd.DataFrame(data)

    # Boolean mask of missing cells; computed once and reused by every panel
    na_mask = data.isna()

    if sort:
        mv_cols_sorted = na_mask.sum(axis=0).sort_values(ascending=False)
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        na_mask = na_mask[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values
    n_rows, n_cols = na_mask.shape
    mv_cols = na_mask.sum(axis=0)
    mv_rows = na_mask.sum(axis=1)
    mv_total = mv_cols.sum()

    # Guard clause: nothing to plot
    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    mv_cols_rel = mv_cols / n_rows
    total_datapoints = n_rows * n_cols

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round(mv_cols_rel * 100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols) - 0.5))
    ax1.set_ylim(0, np.max(mv_cols_rel) * 100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate absolute counts on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height + 0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric angle; a string such as '90' is not a documented value
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for spine in ax1.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)
    # NOTE(review): passing None looks intended to blank the top spine - confirm
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(na_mask, cbar=False, cmap='binary', ax=ax2)
    # keep every fifth tick, rounded to the nearest ten
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for spine in ax2.spines.values():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color': '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/n_rows*100)}%", transform=ax3.transAxes, fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/n_cols*100)}%", transform=ax3.transAxes, fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for spine in ax4.spines.values():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".")
    ax4.set_ylim(0, len(mv_rows))
    ax4.set_ylim(ax4.get_ylim()[::-1])  # invert y-axis so row 0 is at the top, like the heatmap
    ax4.grid(linestyle=':', linewidth=1)

    return ax2


# Correlation matrix / heatmap
def corr_plot(data, split=None, threshold=0, cmap=sns.color_palette("BrBG", 250), figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters:
    ----------
    data: 2D dataset that can be coerced into a Pandas DataFrame. If a Pandas DataFrame is provided, the index/column information will be used to label the columns and rows.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed. Any other value is treated like None.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations for which corr >= threshold is True.
        * neg: visualize all negative correlations for which corr <= threshold is True.
        * high: visualize all correlations for which abs(corr) >= threshold is True.
        * low: visualize all correlations for which abs(corr) <= threshold is True.

    threshold: float, default 0
        Cut-off that is compared against the correlation coefficients as described under 'split'.
        Ignored if split is None.

    cmap: matplotlib colormap name or object, or list of colors, default sns.color_palette("BrBG", 250)
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for presentations.

    **kwargs: optional
        Additional keyword arguments that are passed on to seaborn.heatmap. They take precedence over
        the defaults set by this function and over the 'cmap' and 'annot' parameters, e.g.:

        * mask: default is a boolean array hiding the upper triangle (including the diagonal).
        * vmax: float, default is the largest displayed coefficient of the lower triangle minus 0.05.
        * vmin: float, default is the smallest displayed coefficient of the lower triangle plus 0.05.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.

    Returns:
    -------
    ax: matplotlib Axes. Axes object created for the heatmap.
    '''

    # Coerce lists / ndarrays so that .corr() is available
    data = pd.DataFrame(data)

    # Compute the correlation matrix once; every split mode only filters it
    full_corr = data.corr()

    if split == 'pos':
        corr = full_corr.where((full_corr >= threshold) & (full_corr > 0))
        threshold = '-'  # shown as '-' in the dev-mode settings text
    elif split == 'neg':
        corr = full_corr.where((full_corr <= threshold) & (full_corr < 0))
        threshold = '-'
    elif split == 'high':
        corr = full_corr.where(np.abs(full_corr) >= threshold)
    elif split == 'low':
        corr = full_corr.where(np.abs(full_corr) <= threshold)
    else:
        corr = full_corr
        split = "full"
        threshold = 'None'

    # Generate mask for the upper triangle (builtin bool: the np.bool alias no longer exists in NumPy)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Color range derived from the visible (lower-triangle) coefficients
    visible = corr.where(~mask)
    vmax = np.round(np.nanmax(visible) - 0.05, 2)
    vmin = np.round(np.nanmin(visible) + 0.05, 2)

    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=figsize)

    # Default settings for the heatmap; user-supplied kwargs override them
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'ax': ax,  # draw on the axes created above rather than the implicit current axes
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr,
                center=0,
                square=True,
                fmt='.2f',
                **kwargs
                )

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    if dev:  # show settings
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - cbar: \n\
                - vmax: {vmax} \n\
                - vmin: {vmin} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.8,
                     ha='left')

    return ax


# TODO - summary statistics
# TODO - visualize distributions
    # numerical
    # categorical
# TODO - export charts and summary statistics?

# FIXME something
# FIX something else

# BUG none known


1			'''
2			Utilities for descriptive analytics.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import matplotlib.pyplot as plt
10			import matplotlib.ticker as ticker
11			import numpy as np
12			import pandas as pd
13			import seaborn as sns
14
15			from matplotlib import cm
16
17
18			# Missing value plot
19			def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
20			'''
21			Two-dimensional visualization of the missing values in a dataset.
22
23			Parameters:
24			----------
25			data: 2D dataset that can be coerced into an ndarray. If a Pandas DataFrame is provided, the index/column information is used to label the plots.
26
27			cmap: colormap, default 'PuBuGn'
28			Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib documentation.
29
30			figsize: tuple, default (20,12)
31			Use to control the figure size.
32
33			sort: bool, default False
34			Sort columns based on missing values in descending order and drop columns without any missing values
35
36			spine_color: color-code, default '#EEEEEE'
37			Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.
38
39			Returns:
40			-------
41			ax: matplotlib Axes. Axes object with the heatmap.
42			'''
43
44			if sort:
45			mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
46			final_cols = mv_cols_sorted.drop(mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()).keys().tolist()
47			data = data[final_cols]
48			print('Displaying only columns with missing values.')
49
50			# Identify missing values
51			mv_cols = data.isna().sum(axis=0)
52			mv_rows = data.isna().sum(axis=1)
53			mv_total = mv_cols.sum()
54			mv_cols_rel = mv_cols / data.shape[0]
55			total_datapoints = data.shape[0]*data.shape[1]
56
57			if mv_total == 0:
58			print('No missing values found in the dataset.')
59			else:
60			# Create figure and axes
61			fig = plt.figure(figsize=figsize)
62			grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
63			ax1 = fig.add_subplot(grid[:1, :5])
64			ax2 = fig.add_subplot(grid[1:, :5])
65			ax3 = fig.add_subplot(grid[:1, 5:])
66			ax4 = fig.add_subplot(grid[1:, 5:])
67
68			# ax1 - Barplot
69			colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols)) # color bars by height
70			ax1.bar(range(len(mv_cols)), np.round((mv_cols_rel)*100, 2), color=colors)
71			ax1.get_xaxis().set_visible(False)
72			ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
73			ax1.set_ylim(0, np.max(mv_cols_rel)*100)
74			ax1.grid(linestyle=':', linewidth=1)
75			ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
76			ax1.tick_params(axis='y', colors='#111111', length=1)
77
78			# annotate values on top of the bars
79			for rect, label in zip(ax1.patches, mv_cols):
80			height = rect.get_height()
81			ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
82			ha='center',
83			va='bottom',
84			rotation='90',
85			alpha=0.5,
86			fontsize='small')
87
88			ax1.set_frame_on(True)
89			for _, spine in ax1.spines.items():
90			spine.set_visible(True)
91			spine.set_color(spine_color)
92			ax1.spines['top'].set_color(None)
93
94			# ax2 - Heatmap
95			sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
96			ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
97			ax2.set_yticklabels(ax2.get_yticks())
98			ax2.set_xticklabels(
99			ax2.get_xticklabels(),
100			horizontalalignment='center',
101			fontweight='light',
102			fontsize='medium')
103			ax2.tick_params(length=1, colors='#111111')
104			for _, spine in ax2.spines.items():
105			spine.set_visible(True)
106			spine.set_color(spine_color)
107
108			# ax3 - Summary
109			fontax3 = {'color': '#111111',
110			'weight': 'normal',
111			'size': 12,
112			}
113			ax3.get_xaxis().set_visible(False)
114			ax3.get_yaxis().set_visible(False)
115			ax3.set(frame_on=False)
116
117			ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K", transform=ax3.transAxes, fontdict=fontax3)
118			ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K", transform=ax3.transAxes, fontdict=fontax3)
119			ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%", transform=ax3.transAxes, fontdict=fontax3)
120			ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%", transform=ax3.transAxes, fontdict=fontax3)
121			ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%", transform=ax3.transAxes, fontdict=fontax3)
122
123			# ax4 - Scatter plot
124			ax4.get_yaxis().set_visible(False)
125			for _, spine in ax4.spines.items():
126			spine.set_color(spine_color)
127			ax4.tick_params(axis='x', colors='#111111', length=1)
128
129			ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".")
130			ax4.set_ylim(0, len(mv_rows))
131			ax4.set_ylim(ax4.get_ylim()[::-1]) # invert y-axis
132			ax4.grid(linestyle=':', linewidth=1)
133
134
135			# Correlation matrix / heatmap
136			def corr_plot(data, split=None, threshold=0, cmap=sns.color_palette("BrBG", 250), figsize=(12, 10), annot=True, dev=False, **kwargs):
137			'''
138			Two-dimensional visualization of the correlation between feature-columns, excluding NA values.
139
140			Parameters:
141			----------
142			data: 2D dataset that can be coerced into an ndarray. If a Pandas DataFrame is provided, the index/column information will be used to label the columns and rows.
143
144			split: {'None', 'pos', 'neg', 'high', 'low'}, default 'None'
145			Type of split to be performed.
146
147			* None: visualize all correlations between the feature-columns.
148			* pos: visualize all positive correlations between the feature-columns above the threshold.
149			* neg: visualize all negative correlations between the feature-columns below the threshold.
150			* high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
151			* low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.
152
153			threshold: float, default 0
154			Value between 0 <= threshold <= 1
155
156			cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
157			The mapping from data values to color space.
158
159			figsize: tuple, default (12, 10)
160			Use to control the figure size.
161
162			annot: bool, default True
163			Use to show or hide annotations.
164
165			dev: bool, default False
166			Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for presentations.
167
168			**kwargs: optional
169			Additional elements to control the visualization of the plot, e.g.:
170
171			* mask: bool, default True
172			If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this case to avoid overlap.
173			* cmap: matplotlib colormap name or object, or list of colors, optional
174			The mapping from data values to color space. If not provided, the
175			default is sns.color_palette("BrBG", 150).
176			* annot:bool, default True for 20 or less columns, False for more than 20 feature-columns.
177			* vmax: float, default is calculated from the given correlation coefficients.
178			Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
179			* vmin: float, default is calculated from the given correlation coefficients.
180			Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
181			* linewidths: float, default 0.5
182			Controls the line-width inbetween the squares.
183			* annot_kws: dict, default {'size' : 10}
184			Controls the font size of the annotations. Only available when annot = True.
185			* cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
186			Controls the size of the colorbar.
187			* Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...
188
189			Kwargs can be supplied through a dictionary of key-value pairs (see above).
190
191			Returns:
192			-------
193			ax: matplotlib Axes. Axes object with the heatmap.
194			'''
195
196			if split == 'pos':
197			corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
198			threshold = '-'
199			elif split == 'neg':
200			corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
201			threshold = '-'
202			elif split == 'high':
203			corr = data.corr().where(np.abs(data.corr()) >= threshold)
204			elif split == 'low':
205			corr = data.corr().where(np.abs(data.corr()) <= threshold)
206			else:
207			corr = data.corr()
208			split = "full"
209			threshold = 'None'
210
211			# Generate mask for the upper triangle
212			mask = np.triu(np.ones_like(corr, dtype=np.bool))
213
214			# Compute dimensions and correlation range to adjust settings
215			vmax = np.round(np.nanmax(corr.where(mask == False))-0.05, 2)
216			vmin = np.round(np.nanmin(corr.where(mask == False))+0.05, 2)
217
218			# Set up the matplotlib figure and generate colormap
219			fig, ax = plt.subplots(figsize=figsize)
220
221			# kwargs for the heatmap
222			kwargs = {'mask': mask,
223			'cmap': cmap,
224			'annot': annot,
225			'vmax': vmax,
226			'vmin': vmin,
227			'linewidths': .5,
228			'annot_kws': {'size': 10},
229			'cbar_kws': {'shrink': .95, 'aspect': 30},
230			**kwargs}
231
232			# Draw heatmap with mask and some default settings
233			sns.heatmap(corr,
234			center=0,
235			square=True,
236			fmt='.2f',
237			**kwargs
238			)
239
240			ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})
241
242			if dev == False:
243			pass
244			else: # show settings
245			fig.suptitle(f"\
246			Settings (dev-mode): \n\
247			- split-mode: {split} \n\
248			- threshold: {threshold} \n\
249			- cbar: \n\
250			- vmax: {vmax} \n\
251			- vmin: {vmin} \n\
252			- linewidths: {kwargs['linewidths']} \n\
253			- annot_kws: {kwargs['annot_kws']} \n\
254			- cbar_kws: {kwargs['cbar_kws']}",
255			fontsize=12,
256			color='gray',
257			x=0.35,
258			y=0.8,
259			ha='left')
260
261			return ax
262
263
264			# TODO - summary statistics
265			# TODO - visualize distributions
266			# numerical
267			# categorical
268			# todo export charts and summary statistics?
269
270			# FIXME something
271			# FIX something else
272
273			# BUG none known
274

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 4c58c3...2f319e )

klib.describe.missingval_plot() C

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like