responsibly.dataset.fico.build_FICO_dataset() - Code Metrics - Inspection of "Useful threshold api" - ResponsiblyAI/responsibly - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#31)

by Shlomi

created 2019-08-04 02:52 UTC

responsibly.dataset.fico.build_FICO_dataset() A

↳ Parent: responsibly.dataset.fico

Complexity

Conditions

Size

Total Lines	81
Code Lines	24

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	24
nop	0
dl	0
loc	81
rs	9.304
c	0
b	0
f	0

How to fix Long Method

# Public API of this module: only the dataset builder is exported.
__all__ = ['build_FICO_dataset']


import numpy as np
import pandas as pd
from pkg_resources import resource_filename
from sklearn.metrics import auc


# Location of the packaged CSV that `_parse_data` loads as the
# per-group score CDF table (looked up relative to this module).
CDF_BY_RACE_PATH = resource_filename(__name__,
                                     'transrisk_cdf_by_race_ssa.csv')


# Location of the packaged CSV that `_parse_data` loads as the
# per-group performance table.
PERFORMANCE_BY_RACE_PATH = resource_filename(__name__,
                                             'transrisk_performance_by_race_ssa.csv')  # pylint: disable=line-too-long

# Location of the packaged CSV that `_read_totals` loads to obtain
# the number of individuals per group.
TOTAL_BY_RACE_PATH = resource_filename(__name__,
                                       'totals.csv')


def _cleanup_frame(frame):
    """Normalize the group column names and fix the column order.

    The ``'Non- Hispanic white'`` column is renamed to ``'White'``;
    the result is then reindexed to the columns Asian, Black, Hispanic
    and White, in that order. Any other column is dropped, and a listed
    column that is absent from the input comes back filled with NaN.

    :param frame: Table with one column per group.
    :type frame: pandas.DataFrame
    :return: New frame restricted to the four group columns.
    :rtype: pandas.DataFrame
    """
    group_order = ['Asian', 'Black', 'Hispanic', 'White']
    renamed = frame.rename(columns={'Non- Hispanic white': 'White'})
    return renamed.reindex(columns=group_order)


def _read_totals():
    """Load the per-group totals from the packaged totals CSV.

    The CSV at ``TOTAL_BY_RACE_PATH`` is read with its first column as
    the index, passed through ``_cleanup_frame``, and the value in the
    row labelled ``'SSA'`` is taken for every remaining column.

    :return: Mapping from group name to the value in the ``'SSA'`` row.
    :rtype: dict
    """
    table = _cleanup_frame(pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0))

    totals_by_group = {}
    for race in table.columns:
        totals_by_group[race] = table[race]['SSA']
    return totals_by_group


def _parse_data():
    """Load the score CDF and performance tables as fractions.

    Both CSV files are read with their first column as the index and
    passed through ``_cleanup_frame``. The performance table is
    replaced by its complement (``100 - value``) before both tables
    are divided by 100, i.e. the files are treated as percentages.

    :return: Pair ``(cdfs, performance)`` of data frames
             scaled from percent to fractions.
    :rtype: tuple
    """
    cdf_percent = _cleanup_frame(pd.read_csv(CDF_BY_RACE_PATH,
                                             index_col=0))

    raw_performance = _cleanup_frame(pd.read_csv(PERFORMANCE_BY_RACE_PATH,
                                                 index_col=0))
    performance_percent = 100 - raw_performance

    return (cdf_percent / 100, performance_percent / 100)


def _load_data():
    """Load everything the dataset builder needs from the CSV files.

    :return: Triple ``(totals, cdfs_df, performance_df)``: the result
             of ``_read_totals`` followed by the two frames
             returned by ``_parse_data``.
    :rtype: tuple
    """
    group_totals = _read_totals()
    frames = _parse_data()
    return group_totals, frames[0], frames[1]


def _get_pdfs(cdfs_df):
    """Turn cumulative values into per-row increments.

    A row of zeros is placed in front of the cumulative values and
    consecutive rows are subtracted, so the first output row equals the
    first input row and every later row is the difference from the
    row before it.

    :param cdfs_df: Cumulative values, one column per group.
    :type cdfs_df: pandas.DataFrame
    :return: Frame of increments with the same index and columns
             as ``cdfs_df``.
    :rtype: pandas.DataFrame
    """
    cumulative = cdfs_df.values
    zero_row = np.zeros_like(cumulative[0])

    # Prepending zeros makes the first difference equal the first row.
    padded = np.vstack([zero_row, cumulative])
    increments = np.diff(padded, axis=0)

    return pd.DataFrame(increments,
                        index=cdfs_df.index,
                        columns=cdfs_df.columns)


def _calc_tpr_fpr(pdfs_df, performance_df):
    """Compute TPR and FPR per group with each score as the threshold.

    For the true positive rate the score mass is weighted by
    ``performance_df``; for the false positive rate it is weighted by
    ``1 - performance_df``. In both cases the rate at a score is the
    weighted mass at that score and every later row, divided by the
    total weighted mass of the group.

    :param pdfs_df: Score mass per score (rows) and group (columns).
    :type pdfs_df: pandas.DataFrame
    :param performance_df: Positive fraction per score and group.
    :type performance_df: pandas.DataFrame
    :return: Pair ``(tpr_df, fpr_df)``; each frame has one extra
             all-zero row labelled ``max(index) + 1``.
    :rtype: tuple
    """

    def _rate_by_threshold(weights):
        # Mass of the weighted outcome at every score, per group.
        mass = weights * pdfs_df
        group_mass = mass.sum(axis=0)

        # Reverse, accumulate, reverse back: mass from this row onward.
        tail_mass = mass[::-1].cumsum(axis=0)[::-1]
        rate = tail_mass / group_mass

        # Following the sklearn convention, thresholds[0] stands for
        # "no instance predicted positive" and is arbitrarily set to
        # max(y_score) + 1, hence the extra all-zero row.
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
        rate.loc[max(rate.index) + 1] = [0] * len(rate.columns)
        return rate

    tpr_df = _rate_by_threshold(performance_df)
    fpr_df = _rate_by_threshold(1 - performance_df)
    return tpr_df, fpr_df


def _build_rocs(fpr_df, tpr_df):
    """Assemble one ROC triple per group.

    Every array is reversed relative to the row order of the input
    frames; the thresholds are the reversed index of ``fpr_df``.

    :param fpr_df: False positive rates, one column per group.
    :type fpr_df: pandas.DataFrame
    :param tpr_df: True positive rates, one column per group.
    :type tpr_df: pandas.DataFrame
    :return: Mapping from group name to ``(fprs, tprs, thresholds)``.
    :rtype: dict
    """
    return {group: (fpr_df[group].values[::-1],
                    tpr_df[group].values[::-1],
                    fpr_df.index[::-1])
            for group in fpr_df.columns}


def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on
    a proprietary model created by FICO,
    hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome
    and prediction information per individual,
    but summarized statistics for each FICO score
    and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `total`       | Total number of individuals                          |
    +---------------+------------------------------------------------------+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt
          from https://github.com/fairmlbook/fairmlbook.github.io
        - https://fairmlbook.org/demographic.html#case-study-credit-scoring

    """

    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    total = sum(totals.values())

    # Share of every group in the whole population. The overall total
    # is computed once above and the loop variable has its own name,
    # so it no longer shadows `total`.
    proportions = {group: group_total / total
                   for group, group_total in totals.items()}

    # Per-group base rate: performance weighted by the score mass,
    # summed over all scores.
    base_rates = (pdfs_df * performance_df).sum()
    # Overall base rate: per-group base rates weighted by group share.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    # `_calc_tpr_fpr` returns (tpr, fpr) while `_build_rocs` expects
    # (fpr, tpr); the argument order below is intentional.
    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)

    aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _)
            in rocs.items()}

    return {'total': total,
            'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}


1			__all__ = ['build_FICO_dataset']
2
3
4			import numpy as np
5			import pandas as pd
6			from pkg_resources import resource_filename
7			from sklearn.metrics import auc
8
9
10			CDF_BY_RACE_PATH = resource_filename(__name__,
11			'transrisk_cdf_by_race_ssa.csv')
12
13
14			PERFORMANCE_BY_RACE_PATH = resource_filename(__name__,
15			'transrisk_performance_by_race_ssa.csv') # pylint: disable=line-too-long
16
17			TOTAL_BY_RACE_PATH = resource_filename(__name__,
18			'totals.csv')
19
20
21			def _cleanup_frame(frame):
22			"""Rename and re-order columns."""
23			frame = frame.rename(columns={'Non- Hispanic white': 'White'})
24			frame = frame.reindex(['Asian', 'Black', 'Hispanic', 'White'],
25			axis=1)
26			return frame
27
28
29			def _read_totals():
30			"""Read the total number of people of each race."""
31			frame = _cleanup_frame(pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0))
32			return {r: frame[r]['SSA'] for r in frame.columns}
33
34
35			def _parse_data():
36			"""Parse sqf data set."""
37			cdfs = _cleanup_frame(pd.read_csv(CDF_BY_RACE_PATH, index_col=0))
38			performance = (100
39			- _cleanup_frame(pd.read_csv(PERFORMANCE_BY_RACE_PATH,
40			index_col=0)))
41			return (cdfs / 100, performance / 100)
42
43
44			def _load_data():
45			totals = _read_totals()
46			cdfs_df, performance_df = _parse_data()
47			return totals, cdfs_df, performance_df
48
49
50			def _get_pdfs(cdfs_df):
51			cdf_vs = np.concatenate([[np.zeros_like(cdfs_df.values[0])],
52			cdfs_df.values])
53			pdf_vs = (cdf_vs[1:] - cdf_vs[:-1])
54			pdfs_df = pd.DataFrame(pdf_vs,
55			columns=cdfs_df.columns, index=cdfs_df.index)
56			return pdfs_df
57
58
59			def _calc_tpr_fpr(pdfs_df, performance_df):
60			dfs = []
61			for value in [performance_df, 1 - performance_df]:
62			proportion_per_score = value * pdfs_df
63
64			proportion_over_all_scores = proportion_per_score.sum(axis=0)
65
66			cum_prop_per_score = proportion_per_score[::-1].cumsum(axis=0)[::-1]
67
68			rate = cum_prop_per_score / proportion_over_all_scores
69
70			# by sklean convention, thresholds[0]
71			# represents no instances being predicted positive
72			# and is arbitrarily set to max(y_score) + 1
73			# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
74			rate.loc[max(rate.index) + 1] = [0] * len(rate.columns)
			0 ignored issues – show Comprehensibility Best Practice introduced 2019-08-04 02:50 UTC by Report Bug Copy Issue Report The variable `max` does not seem to be defined. Loading history...
75
76			dfs.append(rate)
77
78			tpr_df, fpr_df = dfs # pylint: disable=unbalanced-tuple-unpacking
79			return tpr_df, fpr_df
80
81
82			def _build_rocs(fpr_df, tpr_df):
83			rocs = {}
84			for group in fpr_df.columns:
85			fprs = fpr_df[group].values[::-1]
86			tprs = tpr_df[group].values[::-1]
87			thresholds = fpr_df.index[::-1]
88
89			rocs[group] = (fprs,
90			tprs,
91			thresholds)
92
93			return rocs
94
95
96			def build_FICO_dataset():
97			"""Build the FICO dataset.
98
99			Dataset of the credit score of TransUnion (called TransRisk).
100			The TransRisk score is in turn based on
101			a proprietary model created by FICO,
102			hence often referred to as FICO scores.
103
104			The data is aggregated, i.e., there is no outcome
105			and prediction information per individual,
106			but summarized statistics for each FICO score
107			and race/race/ethnicity group.
108
109			+---------------+------------------------------------------------------+
110			\| FICO key \| Meaning \|
111			+===============+======================================================+
112			\| `total` \| Total number of individuals \|
113			+---------------+------------------------------------------------------+
114			\| `totals` \| Number of individuals per group \|
115			+---------------+------------------------------------------------------+
116			\| `cdf` \| Cumulative distribution function of score per group \|
117			+---------------+------------------------------------------------------+
118			\| `pdf` \| Probability distribution function of score per group \|
119			+---------------+------------------------------------------------------+
120			\| `performance` \| Fraction of non-defaulters per score and group \|
121			+---------------+------------------------------------------------------+
122			\| `base_rates` \| Base rate of non-defaulters per group \|
123			+---------------+------------------------------------------------------+
124			\| `base_rate` \| The overall base rate non-defaulters \|
125			+---------------+------------------------------------------------------+
126			\| `proportions` \| Fraction of individuals per group \|
127			+---------------+------------------------------------------------------+
128			\| `fpr` \| True Positive Rate by score as threshold per group \|
129			+---------------+------------------------------------------------------+
130			\| `tpr` \| False Positive Rate by score as threshold per group \|
131			+---------------+------------------------------------------------------+
132			\| `rocs` \| ROC per group \|
133			+---------------+------------------------------------------------------+
134			\| `aucs` \| ROC AUC per group \|
135			+---------------+------------------------------------------------------+
136
137			:return: Dictionary of various aggregated statics
138			of the FICO credit score.
139			:rtype: dict
140
141			References:
142			- Based on code (MIT License) by Moritz Hardt
143			from https://github.com/fairmlbook/fairmlbook.github.io
144			- https://fairmlbook.org/demographic.html#case-study-credit-scoring
145
146			"""
147
148			totals, cdfs_df, performance_df = _load_data()
149			pdfs_df = _get_pdfs(cdfs_df)
150
151			total = sum(totals.values())
152
153			proportions = {group: total / sum(totals.values())
154			for group, total in totals.items()}
155
156			base_rates = (pdfs_df * performance_df).sum()
157			base_rate = (base_rates * pd.Series(proportions)).sum()
158
159			tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
160			rocs = _build_rocs(fpr_df, tpr_df)
161
162			aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _)
163			in rocs.items()}
164
165			return {'total': total,
166			'totals': totals,
167			'cdf': cdfs_df,
168			'pdf': pdfs_df,
169			'performance': performance_df,
170			'base_rates': base_rates,
171			'base_rate': base_rate,
172			'proportions': proportions,
173			'fpr': fpr_df,
174			'tpr': tpr_df,
175			'rocs': rocs,
176			'aucs': aucs}
177

ResponsiblyAI / responsibly

Pull Request — master (#31)

responsibly.dataset.fico.build_FICO_dataset() A

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like