klib.describe.corr_mat() - Code Metrics - Inspection of "add tests, refactor corrplot" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 92e4a4...505056 )

by Andreas

created 2020-04-07 15:33 UTC

klib.describe.corr_mat() B

↳ Parent: klib.describe

Complexity

Conditions

Size

Total Lines	25
Code Lines	21

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	21
dl	0
loc	25
rs	8.4426
c	0
b	0
f	0
cc	6
nop	3

'''
Utilities for descriptive analytics.

:author: Andreas Kanz

'''

# Imports
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns


# _functions

def _memory_usage(data):
    '''
    Compute the total memory footprint of a dataset in kilobytes.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    memory_usage: float
        Deep memory usage of all columns plus the index, summed, divided by 1024 and rounded to two decimals.

    '''

    frame = pd.DataFrame(data)
    # deep=True also inspects the contents of object columns, index=True includes the index itself
    total_bytes = frame.memory_usage(index=True, deep=True).sum()

    return round(total_bytes/1024, 2)


def _missing_vals(data):
    '''
    Collect missing-value statistics for a dataset.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame.

    Returns
    -------
    dict with the following entries:
        mv_total: number of missing values in the entire dataset
        mv_rows: Series holding the number of missing values in each row
        mv_cols: Series holding the number of missing values in each column
        mv_rows_ratio: Series holding mv_rows divided by the number of columns
        mv_cols_ratio: Series holding mv_cols divided by the number of rows
    '''

    # Build the boolean NA mask once; every metric below is derived from it
    na_mask = pd.DataFrame(data).isna()
    n_rows, n_cols = na_mask.shape

    per_col = na_mask.sum(axis=0)
    per_row = na_mask.sum(axis=1)

    return {'mv_total': per_col.sum(),
            'mv_rows': per_row,
            'mv_cols': per_col,
            'mv_rows_ratio': per_row/n_cols,
            'mv_cols_ratio': per_col/n_rows}


# Functions

# Correlation Matrix
def corr_mat(data, split=None, threshold=0):
    def color_negative_red(val):
        color = '#FF3344' if val < 0 else None
        return 'color: %s' % color

    data = pd.DataFrame(data)

    if split == 'pos':
        corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
        print('Displaying positive correlations. Use "threshold" to further limit the results.')
    elif split == 'neg':
        corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
        print('Displaying negative correlations. Use "threshold" to further limit the results.')
    elif split == 'high':
        corr = data.corr().where(np.abs(data.corr()) >= threshold)
        print('Displaying absolute correlations above a chosen threshold.')
    elif split == 'low':
        corr = data.corr().where(np.abs(data.corr()) <= threshold)
        print('Displaying absolute correlations below a chosen threshold.')
    else:
        corr = data.corr()
        split = 'None'
        threshold = 'None'

    return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep='-')


# Missing value plot
def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
    '''
    Two-dimensional visualization of the missing values in a dataset.

    The figure consists of four panels: a bar chart with the share of missing values per column, a heatmap \
    marking every missing cell, a text summary and a scatter plot with the number of missing values per row.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    cmap: colormap, default 'PuBuGn'
        Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib \
        documentation.

    figsize: tuple, default (20,12)
        Use to control the figure size.

    sort: bool, default False
        Sort columns based on missing values in descending order and drop columns without any missing values

    spine_color: color-code, default '#EEEEEE'
        Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.

    Returns
    -------
    The gridspec holding the four panels of the created figure. If the dataset contains no missing values, a \
    message is printed, no figure is created and None is returned.
    '''

    data = pd.DataFrame(data)

    if sort:
        mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
        # Counts cannot be negative, so "> 0" keeps exactly the columns with at least one missing value
        final_cols = mv_cols_sorted[mv_cols_sorted > 0].index.tolist()
        data = data[final_cols]
        print('Displaying only columns with missing values.')

    # Identify missing values (computed once and reused for all panels)
    mv = _missing_vals(data)
    mv_cols = mv['mv_cols']
    mv_rows = mv['mv_rows']
    mv_total = mv['mv_total']
    mv_cols_ratio = mv['mv_cols_ratio']
    total_datapoints = data.shape[0]*data.shape[1]

    if mv_total == 0:
        print('No missing values found in the dataset.')
        return None

    # Create figure and axes
    fig = plt.figure(figsize=figsize)
    grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
    ax1 = fig.add_subplot(grid[:1, :5])
    ax2 = fig.add_subplot(grid[1:, :5])
    ax3 = fig.add_subplot(grid[:1, 5:])
    ax4 = fig.add_subplot(grid[1:, 5:])

    # ax1 - Barplot
    colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols))  # color bars by height
    ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
    ax1.get_xaxis().set_visible(False)
    ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
    ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
    ax1.grid(linestyle=':', linewidth=1)
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
    ax1.tick_params(axis='y', colors='#111111', length=1)

    # annotate values on top of the bars
    for rect, label in zip(ax1.patches, mv_cols):
        height = rect.get_height()
        ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
                 ha='center',
                 va='bottom',
                 rotation=90,  # numeric angle; a string such as '90' is rejected by newer matplotlib versions
                 alpha=0.5,
                 fontsize='small')

    ax1.set_frame_on(True)
    for _, spine in ax1.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)
    # NOTE(review): presumably intended to hide the top spine - confirm that set_color(None) has this effect
    ax1.spines['top'].set_color(None)

    # ax2 - Heatmap
    sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
    # keep every fifth tick and round its position to the nearest ten
    ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
    ax2.set_yticklabels(ax2.get_yticks())
    ax2.set_xticklabels(
        ax2.get_xticklabels(),
        horizontalalignment='center',
        fontweight='light',
        fontsize='medium')
    ax2.tick_params(length=1, colors='#111111')
    for _, spine in ax2.spines.items():
        spine.set_visible(True)
        spine.set_color(spine_color)

    # ax3 - Summary
    fontax3 = {'color':  '#111111',
               'weight': 'normal',
               'size': 12,
               }
    ax3.get_xaxis().set_visible(False)
    ax3.get_yaxis().set_visible(False)
    ax3.set(frame_on=False)

    ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)
    ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
             transform=ax3.transAxes,
             fontdict=fontax3)

    # ax4 - Scatter plot
    ax4.get_yaxis().set_visible(False)
    for _, spine in ax4.spines.items():
        spine.set_color(spine_color)
    ax4.tick_params(axis='x', colors='#111111', length=1)

    # marker size and colour both follow the number of missing values per row; the colour scale starts at 1
    ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
    ax4.set_ylim((0, len(mv_rows))[::-1])  # limit and invert y-axis
    ax4.set_xlim(0, max(mv_rows)+0.5)
    ax4.grid(linestyle=':', linewidth=1)

    ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
    return grid


# Correlation matrix / heatmap
def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
    '''
    Two-dimensional visualization of the correlation between feature-columns, excluding NA values.

    Parameters
    ----------
    data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
    information is used to label the plots.

    split: {None, 'pos', 'neg', 'high', 'low'}, default None
        Type of split to be performed.

        * None: visualize all correlations between the feature-columns.
        * pos: visualize all positive correlations between the feature-columns above the threshold.
        * neg: visualize all negative correlations between the feature-columns below the threshold.
        * high: visualize all correlations between the feature-columns for which abs(corr) >= threshold is True.
        * low: visualize all correlations between the feature-columns for which abs(corr) <= threshold is True.

    threshold: float, default 0
        Limit passed on to corr_mat() together with the chosen split.

    cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
        The mapping from data values to color space.

    figsize: tuple, default (12, 10)
        Use to control the figure size.

    annot: bool, default True
        Use to show or hide annotations.

    dev: bool, default False
        Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for \
        presentations.

    **kwargs: optional
        Additional elements to control the visualization of the plot. They are forwarded to seaborn.heatmap() and \
        take precedence over the defaults listed here, e.g.:

        * mask: bool, default True
        If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this \
        case to avoid overlap.
        * vmax: float, default is calculated from the given correlation coefficients.
        Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
        * vmin: float, default is calculated from the given correlation coefficients.
        Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
        * linewidths: float, default 0.5
        Controls the line-width inbetween the squares.
        * annot_kws: dict, default {'size' : 10}
        Controls the font size of the annotations. Only available when annot = True.
        * cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
        Controls the size of the colorbar.
        * center: float, default 0; square: bool, default True; fmt: str, default '.2f'
        * Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...

        Kwargs can be supplied through a dictionary of key-value pairs (see above).

    Returns
    -------
    ax: the matplotlib Axes created for the heatmap
    '''

    data = pd.DataFrame(data)

    # Obtain correlation matrix (corr_mat returns a Styler, ".data" is the underlying DataFrame)
    corr = corr_mat(data, split=split, threshold=threshold).data

    # Generate mask for the upper triangle (builtin bool, the "np.bool" alias no longer exists in recent NumPy)
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Compute correlation range from the visible lower triangle to adjust the colorbar limits
    vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
    vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)

    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=figsize)

    # Specify kwargs for the heatmap, user supplied kwargs override these defaults
    kwargs = {'mask': mask,
              'cmap': cmap,
              'annot': annot,
              'vmax': vmax,
              'vmin': vmin,
              'linewidths': .5,
              'annot_kws': {'size': 10},
              'cbar_kws': {'shrink': .95, 'aspect': 30},
              'center': 0,
              'square': True,
              'fmt': '.2f',
              'ax': ax,
              **kwargs}

    # Draw heatmap with mask and some default settings
    sns.heatmap(corr, **kwargs)

    ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})

    # Display settings (the values shown are the ones actually passed to the heatmap)
    if dev:
        fig.suptitle(f"\
            Settings (dev-mode): \n\
            - split-mode: {split} \n\
            - threshold: {threshold} \n\
            - annotations: {annot} \n\
            - cbar: \n\
                - vmax: {kwargs['vmax']} \n\
                - vmin: {kwargs['vmin']} \n\
            - linewidths: {kwargs['linewidths']} \n\
            - annot_kws: {kwargs['annot_kws']} \n\
            - cbar_kws: {kwargs['cbar_kws']}",
                     fontsize=12,
                     color='gray',
                     x=0.35,
                     y=0.85,
                     ha='left')

    return ax


1			'''
2			Utilities for descriptive analytics.
3
4			:author: Andreas Kanz
5
6			'''
7
8			# Imports
9			import matplotlib.pyplot as plt
10			import matplotlib.ticker as ticker
11			import numpy as np
12			import pandas as pd
13			import seaborn as sns
14
15
16			# _functions
17
18			def _memory_usage(data):
19			'''
20			Gives the total memory usage in kilobytes.
21
22			Parameters
23			----------
24			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
25			information is used to label the plots.
26
27			Returns
28			-------
29			memory_usage: float
30
31			'''
32
33			data = pd.DataFrame(data)
34			memory_usage = round(data.memory_usage(index=True, deep=True).sum()/1024, 2)
35
36			return memory_usage
37
38
39			def _missing_vals(data):
40			'''
41			Gives metrics of missing values in the dataset.
42
43			Parameters
44			----------
45			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
46			information is used to label the plots.
47
48			Returns
49			-------
50			mv_total: float, number of missing values in the entire dataset
51			mv_rows: float, number of missing values in each row
52			mv_cols: float, number of missing values in each column
53			mv_rows_ratio: float, ratio of missing values for each row
54			mv_cols_ratio: float, ratio of missing values for each column
55			'''
56
57			data = pd.DataFrame(data)
58			mv_rows = data.isna().sum(axis=1)
59			mv_cols = data.isna().sum(axis=0)
60			mv_total = data.isna().sum().sum()
61			mv_rows_ratio = mv_rows/data.shape[1]
62			mv_cols_ratio = mv_cols/data.shape[0]
63
64			return {'mv_total': mv_total,
65			'mv_rows': mv_rows,
66			'mv_cols': mv_cols,
67			'mv_rows_ratio': mv_rows_ratio,
68			'mv_cols_ratio': mv_cols_ratio}
69
70
71			# Functions
72
73			# Correlation Matrix
74			def corr_mat(data, split=None, threshold=0):
75			def color_negative_red(val):
76			color = '#FF3344' if val < 0 else None
77			return 'color: %s' % color
78
79			data = pd.DataFrame(data)
80
81			if split == 'pos':
82			corr = data.corr().where((data.corr() >= threshold) & (data.corr() > 0))
83			print('Displaying positive correlations. Use "threshold" to further limit the results.')
84			elif split == 'neg':
85			corr = data.corr().where((data.corr() <= threshold) & (data.corr() < 0))
86			print('Displaying negative correlations. Use "threshold" to further limit the results.')
87			elif split == 'high':
88			corr = data.corr().where(np.abs(data.corr()) >= threshold)
89			print('Displaying absolute correlations above a chosen threshold.')
90			elif split == 'low':
91			corr = data.corr().where(np.abs(data.corr()) <= threshold)
92			print('Displaying absolute correlations below a chosen threshold.')
93			else:
94			corr = data.corr()
95			split = 'None'
96			threshold = 'None'
97
98			return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep='-')
99
100
101			# Missing value plot
102			def missingval_plot(data, cmap='PuBuGn', figsize=(20, 12), sort=False, spine_color='#EEEEEE'):
103			'''
104			Two-dimensional visualization of the missing values in a dataset.
105
106			Parameters
107			----------
108			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
109			information is used to label the plots.
110
111			cmap: colormap, default 'PuBuGn'
112			Any valid colormap can be used. E.g. 'Greys', 'RdPu'. More information can be found in the matplotlib \
113			documentation.
114
115			figsize: tuple, default (20,12)
116			Use to control the figure size.
117
118			sort: bool, default False
119			Sort columns based on missing values in descending order and drop columns without any missing values
120
121			spine_color: color-code, default '#EEEEEE'
122			Set to 'None' to hide the spines on all plots or use any valid matplotlib color argument.
123
124			Returns
125			-------
126			figure
127			'''
128
129			data = pd.DataFrame(data)
130
131			if sort:
132			mv_cols_sorted = data.isna().sum(axis=0).sort_values(ascending=False)
133			final_cols = mv_cols_sorted.drop(mv_cols_sorted[mv_cols_sorted.values == 0].keys().tolist()).keys().tolist()
134			data = data[final_cols]
135			print('Displaying only columns with missing values.')
136
137			# Identify missing values
138			mv_cols = _missing_vals(data)['mv_cols'] # data.isna().sum(axis=0)
139			mv_rows = _missing_vals(data)['mv_rows'] # data.isna().sum(axis=1)
140			mv_total = _missing_vals(data)['mv_total']
141			mv_cols_ratio = _missing_vals(data)['mv_cols_ratio'] # mv_cols / data.shape[0]
142			total_datapoints = data.shape[0]*data.shape[1]
143
144			if mv_total == 0:
145			print('No missing values found in the dataset.')
146			else:
147			# Create figure and axes
148			fig = plt.figure(figsize=figsize)
149			grid = fig.add_gridspec(nrows=6, ncols=6, left=0.05, right=0.48, wspace=0.05)
150			ax1 = fig.add_subplot(grid[:1, :5])
151			ax2 = fig.add_subplot(grid[1:, :5])
152			ax3 = fig.add_subplot(grid[:1, 5:])
153			ax4 = fig.add_subplot(grid[1:, 5:])
154
155			# ax1 - Barplot
156			colors = plt.get_cmap(cmap)(mv_cols / np.max(mv_cols)) # color bars by height
157			ax1.bar(range(len(mv_cols)), np.round((mv_cols_ratio)*100, 2), color=colors)
158			ax1.get_xaxis().set_visible(False)
159			ax1.set(frame_on=False, xlim=(-.5, len(mv_cols)-0.5))
160			ax1.set_ylim(0, np.max(mv_cols_ratio)*100)
161			ax1.grid(linestyle=':', linewidth=1)
162			ax1.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0))
163			ax1.tick_params(axis='y', colors='#111111', length=1)
164
165			# annotate values on top of the bars
166			for rect, label in zip(ax1.patches, mv_cols):
167			height = rect.get_height()
168			ax1.text(.1 + rect.get_x() + rect.get_width() / 2, height+0.5, label,
169			ha='center',
170			va='bottom',
171			rotation='90',
172			alpha=0.5,
173			fontsize='small')
174
175			ax1.set_frame_on(True)
176			for _, spine in ax1.spines.items():
177			spine.set_visible(True)
178			spine.set_color(spine_color)
179			ax1.spines['top'].set_color(None)
180
181			# ax2 - Heatmap
182			sns.heatmap(data.isna(), cbar=False, cmap='binary', ax=ax2)
183			ax2.set_yticks(np.round(ax2.get_yticks()[0::5], -1))
184			ax2.set_yticklabels(ax2.get_yticks())
185			ax2.set_xticklabels(
186			ax2.get_xticklabels(),
187			horizontalalignment='center',
188			fontweight='light',
189			fontsize='medium')
190			ax2.tick_params(length=1, colors='#111111')
191			for _, spine in ax2.spines.items():
192			spine.set_visible(True)
193			spine.set_color(spine_color)
194
195			# ax3 - Summary
196			fontax3 = {'color': '#111111',
197			'weight': 'normal',
198			'size': 12,
199			}
200			ax3.get_xaxis().set_visible(False)
201			ax3.get_yaxis().set_visible(False)
202			ax3.set(frame_on=False)
203
204			ax3.text(0.1, 0.9, f"Total: {np.round(total_datapoints/1000,1)}K",
205			transform=ax3.transAxes,
206			fontdict=fontax3)
207			ax3.text(0.1, 0.7, f"Missing: {np.round(mv_total/1000,1)}K",
208			transform=ax3.transAxes,
209			fontdict=fontax3)
210			ax3.text(0.1, 0.5, f"Relative: {np.round(mv_total/total_datapoints*100,1)}%",
211			transform=ax3.transAxes,
212			fontdict=fontax3)
213			ax3.text(0.1, 0.3, f"Max-col: {np.round(mv_cols.max()/data.shape[0]*100)}%",
214			transform=ax3.transAxes,
215			fontdict=fontax3)
216			ax3.text(0.1, 0.1, f"Max-row: {np.round(mv_rows.max()/data.shape[1]*100)}%",
217			transform=ax3.transAxes,
218			fontdict=fontax3)
219
220			# ax4 - Scatter plot
221			ax4.get_yaxis().set_visible(False)
222			for _, spine in ax4.spines.items():
223			spine.set_color(spine_color)
224			ax4.tick_params(axis='x', colors='#111111', length=1)
225
226			ax4.scatter(mv_rows, range(len(mv_rows)), s=mv_rows, c=mv_rows, cmap=cmap, marker=".", vmin=1)
227			ax4.set_ylim((0, len(mv_rows))[::-1]) # limit and invert y-axis
228			ax4.set_xlim(0, max(mv_rows)+0.5)
229			ax4.grid(linestyle=':', linewidth=1)
230
231			ax1.set_title('Missing value plot', pad=40, fontdict={'fontsize': 18})
232			return grid
233
234
235			# Correlation matrix / heatmap
236			def corr_plot(data, split=None, threshold=0, cmap='BrBG', figsize=(12, 10), annot=True, dev=False, **kwargs):
237			'''
238			Two-dimensional visualization of the correlation between feature-columns, excluding NA values.
239
240			Parameters
241			----------
242			data: 2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the index/column \
243			information is used to label the plots.
244
245			split: {None, 'pos', 'neg', 'high', 'low'}, default None
246			Type of split to be performed.
247
248			* None: visualize all correlations between the feature-columns.
249			* pos: visualize all positive correlations between the feature-columns above the threshold.
250			* neg: visualize all negative correlations between the feature-columns below the threshold.
251			* high: visualize all correlations between the feature-columns for which abs(corr) > threshold is True.
252			* low: visualize all correlations between the feature-columns for which abs(corr) < threshold is True.
253
254			threshold: float, default 0
255			Value between 0 <= threshold <= 1
256
257			cmap: matplotlib colormap name or object, or list of colors, default 'BrBG'
258			The mapping from data values to color space.
259
260			figsize: tuple, default (12, 10)
261			Use to control the figure size.
262
263			annot: bool, default True
264			Use to show or hide annotations.
265
266			dev: bool, default False
267			Display figure settings in the plot by setting dev = True. If False, the settings are not displayed. Use for \
268			presentations.
269
270			**kwargs: optional
271			Additional elements to control the visualization of the plot, e.g.:
272
273			* mask: bool, default True
274			If set to False the entire correlation matrix, including the upper triangle is shown. Set dev = False in this \
275			case to avoid overlap.
276			* vmax: float, default is calculated from the given correlation coefficients.
277			Value between -1 or vmin <= vmax <= 1, limits the range of the colorbar.
278			* vmin: float, default is calculated from the given correlation coefficients.
279			Value between -1 <= vmin <= 1 or vmax, limits the range of the colorbar.
280			* linewidths: float, default 0.5
281			Controls the line-width inbetween the squares.
282			* annot_kws: dict, default {'size' : 10}
283			Controls the font size of the annotations. Only available when annot = True.
284			* cbar_kws: dict, default {'shrink': .95, 'aspect': 30}
285			Controls the size of the colorbar.
286			* Many more kwargs are available, i.e. 'alpha' to control blending, or options to adjust labels, ticks ...
287
288			Kwargs can be supplied through a dictionary of key-value pairs (see above).
289
290			Returns
291			-------
292			figure
293			'''
294
295			data = pd.DataFrame(data)
296
297			# Obtain correlation matrix
298			corr = corr_mat(data, split=split, threshold=threshold).data
299
300			# Generate mask for the upper triangle
301			mask = np.triu(np.ones_like(corr, dtype=np.bool))
302
303			# Compute dimensions and correlation range to adjust settings
304			vmax = np.round(np.nanmax(corr.where(~mask))-0.05, 2)
305			vmin = np.round(np.nanmin(corr.where(~mask))+0.05, 2)
306
307			# Set up the matplotlib figure and generate colormap
308			fig, ax = plt.subplots(figsize=figsize)
309
310			# Specify kwargs for the heatmap
311			kwargs = {'mask': mask,
312			'cmap': cmap,
313			'annot': annot,
314			'vmax': vmax,
315			'vmin': vmin,
316			'linewidths': .5,
317			'annot_kws': {'size': 10},
318			'cbar_kws': {'shrink': .95, 'aspect': 30},
319			**kwargs}
320
321			# Draw heatmap with mask and some default settings
322			sns.heatmap(corr,
323			center=0,
324			square=True,
325			fmt='.2f',
326			**kwargs
327			)
328
329			ax.set_title('Feature-correlation Matrix', fontdict={'fontsize': 18})
330
331			# Display settings
332			if dev:
333			fig.suptitle(f"\
334			Settings (dev-mode): \n\
335			- split-mode: {split} \n\
336			- threshold: {threshold} \n\
337			- annotations: {annot} \n\
338			- cbar: \n\
339			- vmax: {vmax} \n\
340			- vmin: {vmin} \n\
341			- linewidths: {kwargs['linewidths']} \n\
342			- annot_kws: {kwargs['annot_kws']} \n\
343			- cbar_kws: {kwargs['cbar_kws']}",
344			fontsize=12,
345			color='gray',
346			x=0.35,
347			y=0.85,
348			ha='left')
349
350			return ax
351

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 92e4a4...505056 )

klib.describe.corr_mat() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like