1
|
|
|
__all__ = ['build_FICO_dataset'] |
2
|
|
|
|
3
|
|
|
|
4
|
|
|
import numpy as np |
5
|
|
|
import pandas as pd |
6
|
|
|
from pkg_resources import resource_filename |
7
|
|
|
from sklearn.metrics import auc |
8
|
|
|
|
9
|
|
|
|
10
|
|
|
# Locations of the CSV resources, resolved relative to this module's package.
CDF_BY_RACE_PATH = resource_filename(
    __name__, 'transrisk_cdf_by_race_ssa.csv')

PERFORMANCE_BY_RACE_PATH = resource_filename(
    __name__, 'transrisk_performance_by_race_ssa.csv')

TOTAL_BY_RACE_PATH = resource_filename(
    __name__, 'totals.csv')
19
|
|
|
|
20
|
|
|
|
21
|
|
|
def _cleanup_frame(frame):
    """Return a copy of ``frame`` with normalized group columns.

    The ``'Non- Hispanic white'`` column is renamed to ``'White'`` and the
    columns are then restricted to (and ordered as)
    ``Asian, Black, Hispanic, White``. Any other column is dropped; a
    missing one shows up as an all-NaN column.
    """
    group_order = ['Asian', 'Black', 'Hispanic', 'White']
    renamed = frame.rename(columns={'Non- Hispanic white': 'White'})
    return renamed.reindex(columns=group_order)
27
|
|
|
|
28
|
|
|
|
29
|
|
|
def _read_totals():
    """Load the per-group totals from ``TOTAL_BY_RACE_PATH``.

    :return: Mapping from group name to the value found in the ``'SSA'``
             row of that group's column.
    :rtype: dict
    """
    raw_totals = pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0)
    frame = _cleanup_frame(raw_totals)
    return {race: column['SSA'] for race, column in frame.items()}
33
|
|
|
|
34
|
|
|
|
35
|
|
|
def _parse_data():
    """Read the per-group CDF and performance tables from their CSV files.

    Both tables are divided by 100 (the raw values are presumably
    percentages). The performance table is additionally flipped to
    ``100 - value`` before scaling.

    :return: Tuple ``(cdfs, performance)`` of data frames with the
             normalized group columns.
    :rtype: tuple
    """
    raw_cdfs = pd.read_csv(CDF_BY_RACE_PATH, index_col=0)
    cdfs = _cleanup_frame(raw_cdfs)

    raw_performance = pd.read_csv(PERFORMANCE_BY_RACE_PATH, index_col=0)
    # Complement of the stored percentage.
    performance = 100 - _cleanup_frame(raw_performance)

    return (cdfs / 100, performance / 100)
42
|
|
|
|
43
|
|
|
|
44
|
|
|
def _load_data():
    """Load all raw inputs of the dataset.

    :return: Tuple ``(totals, cdfs_df, performance_df)``: the result of
             ``_read_totals()`` followed by the two frames returned by
             ``_parse_data()``.
    :rtype: tuple
    """
    # Totals are read first, then the two score tables.
    return (_read_totals(), *_parse_data())
48
|
|
|
|
49
|
|
|
|
50
|
|
|
def _get_pdfs(cdfs_df):
    """Turn cumulative distributions into per-row probability masses.

    Each output row is the difference between the corresponding CDF row and
    the row before it; the first row is differenced against zero.

    :param cdfs_df: Cumulative values, one column per group.
    :return: Data frame with the same index and columns as ``cdfs_df``.
    """
    cumulative = cdfs_df.values
    # A leading all-zero row makes the first difference equal the first CDF row.
    zero_row = np.zeros_like(cumulative[0])
    padded = np.vstack([zero_row, cumulative])
    increments = np.diff(padded, axis=0)
    return pd.DataFrame(increments,
                        index=cdfs_df.index,
                        columns=cdfs_df.columns)
57
|
|
|
|
58
|
|
|
|
59
|
|
|
def _calc_tpr_fpr(pdfs_df, performance_df):
    """Compute per-group TPR and FPR for every row used as a threshold.

    ``performance_df`` weights the positive class and
    ``1 - performance_df`` the negative class. For each class, the rate at a
    row is one minus the cumulative share of that class up to and including
    the row.

    :param pdfs_df: Probability mass per row and group.
    :param performance_df: Positive-class fraction per row and group.
    :return: Tuple ``(tpr_df, fpr_df)`` indexed and labelled like
             ``pdfs_df``.
    """
    def _rate_above(class_fraction):
        # Mass of the class at each row, per group.
        mass = class_fraction * pdfs_df
        class_total = mass.sum(axis=0)
        cumulative_share = mass.cumsum(axis=0) / class_total
        return pd.DataFrame(1 - cumulative_share,
                            index=pdfs_df.index,
                            columns=pdfs_df.columns)

    tpr_df = _rate_above(performance_df)
    fpr_df = _rate_above(1 - performance_df)
    return tpr_df, fpr_df
76
|
|
|
|
77
|
|
|
|
78
|
|
|
def _build_rocs(fpr_df, tpr_df):
    """Assemble the ROC curve data of every group.

    :param fpr_df: False positive rates, one column per group.
    :param tpr_df: True positive rates, one column per group.
    :return: Mapping from group name to a tuple
             ``(fpr values, tpr values, thresholds)``, where the thresholds
             are the index of ``fpr_df``.
    :rtype: dict
    """
    thresholds = fpr_df.index
    return {group: (fpr_df[group].values, tpr_df[group].values, thresholds)
            for group in fpr_df.columns}
85
|
|
|
|
86
|
|
|
|
87
|
|
|
def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on
    a proprietary model created by FICO,
    hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome
    and prediction information per individual,
    but summarized statistics for each FICO score
    and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt
          from https://github.com/fairmlbook/fairmlbook.github.io
        - https://fairmlbook.org/demographic.html#case-study-credit-scoring

    """

    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    # The overall population size is the same for every group, so compute
    # it once instead of once per group.
    population = sum(totals.values())
    proportions = {group: total / population
                   for group, total in totals.items()}

    # Per-group base rate: performance averaged over the score distribution.
    base_rates = (pdfs_df * performance_df).sum()
    # Overall base rate: per-group base rates weighted by group proportion.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)

    # Each ROC entry is (fpr, tpr, thresholds); thresholds are not needed
    # for the area under the curve.
    aucs = {group: auc(fpr, tpr)
            for group, (fpr, tpr, _) in rocs.items()}

    return {'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}
163
|
|
|
|