ethically.dataset.fico.build_FICO_dataset() - Code Metrics - Inspection of "Merge pull request #18 from EthicallyAI/dev" - ResponsiblyAI/responsibly - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 170db5...8af2aa )

by Shlomi

created 2019-04-10 20:28 UTC

ethically.dataset.fico.build_FICO_dataset() A

↳ Parent: ethically.dataset.fico

Complexity

Conditions

Size

Total Lines	76
Code Lines	22

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	22
nop	0
dl	0
loc	76
rs	9.352
c	0
b	0
f	0

How to fix Long Method

__all__ = ['build_FICO_dataset']


import numpy as np
import pandas as pd
from pkg_resources import resource_filename
from sklearn.metrics import auc


# Packaged CSV read by ``_parse_data`` as the per-group score CDF table.
CDF_BY_RACE_PATH = resource_filename(__name__,
                                     'transrisk_cdf_by_race_ssa.csv')


# Packaged CSV read by ``_parse_data`` as the per-group performance table.
PERFORMANCE_BY_RACE_PATH = resource_filename(__name__,
                                             'transrisk_performance_by_race_ssa.csv')  # pylint: disable=line-too-long

# Packaged CSV read by ``_read_totals`` for the per-group totals.
TOTAL_BY_RACE_PATH = resource_filename(__name__,
                                       'totals.csv')


def _cleanup_frame(frame):
    """Normalise the group columns of a raw table.

    The ``'Non- Hispanic white'`` column is relabelled ``'White'`` and the
    result keeps only the Asian, Black, Hispanic and White columns,
    in that order. The input frame is not modified.

    :param frame: Table with one column per group.
    :return: A new frame with the four group columns.
    """
    group_order = ['Asian', 'Black', 'Hispanic', 'White']
    relabelled = frame.rename(columns={'Non- Hispanic white': 'White'})
    return relabelled.reindex(columns=group_order)


def _read_totals():
    """Load the ``'SSA'`` row of the totals table.

    :return: Mapping from group name to its ``'SSA'`` total.
    :rtype: dict
    """
    raw = pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0)
    table = _cleanup_frame(raw)

    totals = {}
    for race in table.columns:
        totals[race] = table[race]['SSA']
    return totals


def _parse_data():
    """Read the TransRisk CDF and performance tables as fractions.

    Values in both CSV files are treated as percentages: the performance
    table is first flipped to ``100 - value``, then both tables are
    divided by 100.

    :return: ``(cdfs, performance)`` data frames with one column per group.
    :rtype: tuple
    """
    raw_cdfs = pd.read_csv(CDF_BY_RACE_PATH, index_col=0)
    raw_performance = pd.read_csv(PERFORMANCE_BY_RACE_PATH, index_col=0)

    cdfs = _cleanup_frame(raw_cdfs)
    flipped_performance = 100 - _cleanup_frame(raw_performance)
    return cdfs / 100, flipped_performance / 100


def _load_data():
    """Collect the group totals plus the CDF and performance frames.

    :return: ``(totals, cdfs_df, performance_df)``.
    :rtype: tuple
    """
    # Totals are read first, then the two parsed frames are appended.
    return (_read_totals(),) + _parse_data()


def _get_pdfs(cdfs_df):
    """Turn per-group CDFs into per-score probability masses.

    Each row becomes the difference to the row before it; the first row
    is differenced against zero, so it is returned unchanged.

    :param cdfs_df: Cumulative distribution per group, one row per score.
    :return: Frame with the same index and columns as ``cdfs_df``.
    """
    cdf_values = cdfs_df.values
    zero_row = np.zeros_like(cdf_values[0])
    padded = np.vstack([zero_row, cdf_values])

    current_rows = padded[1:]
    previous_rows = padded[:-1]
    return pd.DataFrame(current_rows - previous_rows,
                        columns=cdfs_df.columns,
                        index=cdfs_df.index)


def _calc_tpr_fpr(pdfs_df, performance_df):
    """Compute per-group rates for every score used as a threshold.

    For an outcome weighting ``w`` (per score and group), the rate at a
    score is one minus the cumulative share of ``w * pdf`` up to and
    including that score. The first returned frame uses
    ``performance_df`` as the weighting, the second ``1 - performance_df``.

    :param pdfs_df: Probability mass per score and group.
    :param performance_df: Outcome fraction per score and group.
    :return: ``(tpr_df, fpr_df)`` indexed and labelled like ``pdfs_df``.
    :rtype: tuple
    """
    def _share_above(outcome_fraction):
        mass_per_score = outcome_fraction * pdfs_df
        cumulative_share = (mass_per_score.cumsum(axis=0)
                            / mass_per_score.sum(axis=0))
        return pd.DataFrame(1 - cumulative_share,
                            index=pdfs_df.index,
                            columns=pdfs_df.columns)

    tpr_df = _share_above(performance_df)
    fpr_df = _share_above(1 - performance_df)
    return tpr_df, fpr_df


def _build_rocs(fpr_df, tpr_df):
    """Bundle the ROC ingredients of every group.

    :param fpr_df: False positive rates, one column per group.
    :param tpr_df: True positive rates, one column per group.
    :return: Mapping from group to ``(fpr values, tpr values, index)``,
             where the index is taken from ``fpr_df``.
    :rtype: dict
    """
    thresholds = fpr_df.index
    return {group: (fpr_df[group].values,
                    tpr_df[group].values,
                    thresholds)
            for group in fpr_df.columns}


def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on
    a proprietary model created by FICO,
    hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome
    and prediction information per individual,
    but summarized statistics for each FICO score
    and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt
          from https://github.com/fairmlbook/fairmlbook.github.io
        - https://fairmlbook.org/demographic.html#case-study-credit-scoring

    """

    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    # The grand total is the same for every group, so compute it once
    # instead of re-summing inside the comprehension.
    grand_total = sum(totals.values())
    proportions = {group: total / grand_total
                   for group, total in totals.items()}

    # Per-group base rate: performance weighted by the score distribution.
    base_rates = (pdfs_df * performance_df).sum()
    # Overall base rate: per-group base rates weighted by group share.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)

    # Each ROC entry is (fpr values, tpr values, score index);
    # the index is not needed for the area.
    aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _)
            in rocs.items()}

    return {'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}


1			__all__ = ['build_FICO_dataset']
2
3
4			import numpy as np
5			import pandas as pd
6			from pkg_resources import resource_filename
7			from sklearn.metrics import auc
8
9
10			CDF_BY_RACE_PATH = resource_filename(__name__,
11			'transrisk_cdf_by_race_ssa.csv')
12
13
14			PERFORMANCE_BY_RACE_PATH = resource_filename(__name__,
15			'transrisk_performance_by_race_ssa.csv') # pylint: disable=line-too-long
16
17			TOTAL_BY_RACE_PATH = resource_filename(__name__,
18			'totals.csv')
19
20
21			def _cleanup_frame(frame):
22			"""Rename and re-order columns."""
23			frame = frame.rename(columns={'Non- Hispanic white': 'White'})
24			frame = frame.reindex(['Asian', 'Black', 'Hispanic', 'White'],
25			axis=1)
26			return frame
27
28
29			def _read_totals():
30			"""Read the total number of people of each race."""
31			frame = _cleanup_frame(pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0))
32			return {r: frame[r]['SSA'] for r in frame.columns}
33
34
35			def _parse_data():
36			"""Parse sqf data set."""
37			cdfs = _cleanup_frame(pd.read_csv(CDF_BY_RACE_PATH, index_col=0))
38			performance = (100
39			- _cleanup_frame(pd.read_csv(PERFORMANCE_BY_RACE_PATH,
40			index_col=0)))
41			return (cdfs / 100, performance / 100)
42
43
44			def _load_data():
45			totals = _read_totals()
46			cdfs_df, performance_df = _parse_data()
47			return totals, cdfs_df, performance_df
48
49
50			def _get_pdfs(cdfs_df):
51			cdf_vs = np.concatenate([[np.zeros_like(cdfs_df.values[0])],
52			cdfs_df.values])
53			pdf_vs = (cdf_vs[1:] - cdf_vs[:-1])
54			pdfs_df = pd.DataFrame(pdf_vs,
55			columns=cdfs_df.columns, index=cdfs_df.index)
56			return pdfs_df
57
58
59			def _calc_tpr_fpr(pdfs_df, performance_df):
60			dfs = []
61			for value in [performance_df, 1 - performance_df]:
62			proportion_per_score = value * pdfs_df
63
64			proportion_over_all_scores = proportion_per_score.sum(axis=0)
65
66			cum_prop_per_score = proportion_per_score.cumsum(axis=0)
67
68			rate = cum_prop_per_score / proportion_over_all_scores
69
70			dfs.append(pd.DataFrame(1 - rate,
71			index=pdfs_df.index,
72			columns=pdfs_df.columns))
73
74			tpr_df, fpr_df = dfs # pylint: disable=unbalanced-tuple-unpacking
75			return tpr_df, fpr_df
76
77
78			def _build_rocs(fpr_df, tpr_df):
79			rocs = {}
80			for group in fpr_df.columns:
81			rocs[group] = (fpr_df[group].values,
82			tpr_df[group].values,
83			fpr_df.index)
84			return rocs
85
86
87			def build_FICO_dataset():
88			"""Build the FICO dataset.
89
90			Dataset of the credit score of TransUnion (called TransRisk).
91			The TransRisk score is in turn based on
92			a proprietary model created by FICO,
93			hence often referred to as FICO scores.
94
95			The data is aggregated, i.e., there is no outcome
96			and prediction information per individual,
97			but summarized statistics for each FICO score
98			and race/race/ethnicity group.
99
100			+---------------+------------------------------------------------------+
101			\| FICO key \| Meaning \|
102			+===============+======================================================+
103			\| `totals` \| Number of individuals per group \|
104			+---------------+------------------------------------------------------+
105			\| `cdf` \| Cumulative distribution function of score per group \|
106			+---------------+------------------------------------------------------+
107			\| `pdf` \| Probability distribution function of score per group \|
108			+---------------+------------------------------------------------------+
109			\| `performance` \| Fraction of non-defaulters per score and group \|
110			+---------------+------------------------------------------------------+
111			\| `base_rates` \| Base rate of non-defaulters per group \|
112			+---------------+------------------------------------------------------+
113			\| `base_rate` \| The overall base rate non-defaulters \|
114			+---------------+------------------------------------------------------+
115			\| `proportions` \| Fraction of individuals per group \|
116			+---------------+------------------------------------------------------+
117			\| `fpr` \| True Positive Rate by score as threshold per group \|
118			+---------------+------------------------------------------------------+
119			\| `tpr` \| False Positive Rate by score as threshold per group \|
120			+---------------+------------------------------------------------------+
121			\| `rocs` \| ROC per group \|
122			+---------------+------------------------------------------------------+
123			\| `aucs` \| ROC AUC per group \|
124			+---------------+------------------------------------------------------+
125
126			:return: Dictionary of various aggregated statics
127			of the FICO credit score.
128			:rtype: dict
129
130			References:
131			- Based on code (MIT License) by Moritz Hardt
132			from https://github.com/fairmlbook/fairmlbook.github.io
133			- https://fairmlbook.org/demographic.html#case-study-credit-scoring
134
135			"""
136
137			totals, cdfs_df, performance_df = _load_data()
138			pdfs_df = _get_pdfs(cdfs_df)
139
140			proportions = {group: total / sum(totals.values())
141			for group, total in totals.items()}
142
143			base_rates = (pdfs_df * performance_df).sum()
144			base_rate = (base_rates * pd.Series(proportions)).sum()
145
146			tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
147			rocs = _build_rocs(fpr_df, tpr_df)
148
149			aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _)
150			in rocs.items()}
151
152			return {'totals': totals,
153			'cdf': cdfs_df,
154			'pdf': pdfs_df,
155			'performance': performance_df,
156			'base_rates': base_rates,
157			'base_rate': base_rate,
158			'proportions': proportions,
159			'fpr': fpr_df,
160			'tpr': tpr_df,
161			'rocs': rocs,
162			'aucs': aucs}
163

ResponsiblyAI / responsibly

Push — master ( 170db5...8af2aa )

ethically.dataset.fico.build_FICO_dataset() A

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like