Passed
Push — master ( 170db5...8af2aa )
by Shlomi
02:43 queued 58s
created

ethically.dataset.fico._load_data()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 4
nop 0
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
__all__ = ['build_FICO_dataset']
2
3
4
import numpy as np
5
import pandas as pd
6
from pkg_resources import resource_filename
7
from sklearn.metrics import auc
8
9
10
# Locations of the CSV files bundled with this package.

# Read by ``_parse_data`` as the per-group score CDF table.
CDF_BY_RACE_PATH = resource_filename(
    __name__, 'transrisk_cdf_by_race_ssa.csv')

# Read by ``_parse_data`` as the per-group performance table.
PERFORMANCE_BY_RACE_PATH = resource_filename(
    __name__, 'transrisk_performance_by_race_ssa.csv')

# Read by ``_read_totals`` for the number of individuals per group.
TOTAL_BY_RACE_PATH = resource_filename(
    __name__, 'totals.csv')
19
20
21
def _cleanup_frame(frame):
22
    """Rename and re-order columns."""
23
    frame = frame.rename(columns={'Non- Hispanic white': 'White'})
24
    frame = frame.reindex(['Asian', 'Black', 'Hispanic', 'White'],
25
                          axis=1)
26
    return frame
27
28
29
def _read_totals():
    """Load the number of individuals in each group.

    Reads the CSV at ``TOTAL_BY_RACE_PATH`` (first column as index),
    normalises its columns with ``_cleanup_frame`` and picks, for every
    group column, the entry on the ``'SSA'`` row.

    :return: Mapping from group name to its total.
    :rtype: dict
    """
    raw_totals = pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0)
    frame = _cleanup_frame(raw_totals)

    totals = {}
    for race, column in frame.items():
        totals[race] = column['SSA']
    return totals
33
34
35
def _parse_data():
    """Read the per-group CDF and performance tables of the FICO data.

    Both CSV files are read with their first column as index and passed
    through ``_cleanup_frame``.  The performance table is complemented
    (``100 - value``) before both tables are divided by 100.

    NOTE(review): the division by 100 suggests the files store
    percentages, and the complement suggests the raw performance file
    holds the default rate - confirm against the data files.

    :return: ``(cdfs, performance)``, both scaled by 1/100.
    :rtype: tuple
    """
    raw_cdfs = pd.read_csv(CDF_BY_RACE_PATH, index_col=0)
    cdfs = _cleanup_frame(raw_cdfs)

    raw_performance = pd.read_csv(PERFORMANCE_BY_RACE_PATH, index_col=0)
    performance = 100 - _cleanup_frame(raw_performance)

    return (cdfs / 100, performance / 100)
42
43
44
def _load_data():
    """Load everything ``build_FICO_dataset`` needs from disk.

    :return: ``(totals, cdfs_df, performance_df)`` - the result of
             ``_read_totals`` followed by the two frames returned by
             ``_parse_data``.
    :rtype: tuple
    """
    group_totals = _read_totals()
    cdfs, performance = _parse_data()
    return group_totals, cdfs, performance
48
49
50
def _get_pdfs(cdfs_df):
51
    cdf_vs = np.concatenate([[np.zeros_like(cdfs_df.values[0])],
52
                             cdfs_df.values])
53
    pdf_vs = (cdf_vs[1:] - cdf_vs[:-1])
54
    pdfs_df = pd.DataFrame(pdf_vs,
55
                           columns=cdfs_df.columns, index=cdfs_df.index)
56
    return pdfs_df
57
58
59
def _calc_tpr_fpr(pdfs_df, performance_df):
60
    dfs = []
61
    for value in [performance_df, 1 - performance_df]:
62
        proportion_per_score = value * pdfs_df
63
64
        proportion_over_all_scores = proportion_per_score.sum(axis=0)
65
66
        cum_prop_per_score = proportion_per_score.cumsum(axis=0)
67
68
        rate = cum_prop_per_score / proportion_over_all_scores
69
70
        dfs.append(pd.DataFrame(1 - rate,
71
                                index=pdfs_df.index,
72
                                columns=pdfs_df.columns))
73
74
    tpr_df, fpr_df = dfs  # pylint: disable=unbalanced-tuple-unpacking
75
    return tpr_df, fpr_df
76
77
78
def _build_rocs(fpr_df, tpr_df):
79
    rocs = {}
80
    for group in fpr_df.columns:
81
        rocs[group] = (fpr_df[group].values,
82
                       tpr_df[group].values,
83
                       fpr_df.index)
84
    return rocs
85
86
87
def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on
    a proprietary model created by FICO,
    hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome
    and prediction information per individual,
    but summarized statistics for each FICO score
    and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt
          from https://github.com/fairmlbook/fairmlbook.github.io
        - https://fairmlbook.org/demographic.html#case-study-credit-scoring

    """

    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    # The population size is the same for every group, so compute it once
    # rather than once per group inside the comprehension.
    n_individuals = sum(totals.values())
    proportions = {group: total / n_individuals
                   for group, total in totals.items()}

    # Per group: sum over scores of P(score) * fraction of non-defaulters.
    base_rates = (pdfs_df * performance_df).sum()
    # Overall: per-group base rates weighted by each group's share.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)

    # Each ROC entry is (fpr, tpr, thresholds); the area needs only the
    # first two.
    aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _)
            in rocs.items()}

    return {'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}
163