Passed
Push — master ( 400a5c...f23aca )
by Shlomi
01:56
created

responsibly.dataset.fico   A

Complexity

Total Complexity 10

Size/Duplication

Total Lines 177
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 83
dl 0
loc 177
rs 10
c 0
b 0
f 0
wmc 10

8 Functions

Rating   Name   Duplication   Size   Complexity  
A _parse_data() 0 7 1
A _cleanup_frame() 0 6 1
A _read_totals() 0 4 1
A _load_data() 0 4 1
A _get_pdfs() 0 7 1
A build_FICO_dataset() 0 81 1
A _build_rocs() 0 12 2
A _calc_tpr_fpr() 0 21 2
1
__all__ = ['build_FICO_dataset']
2
3
4
import numpy as np
5
import pandas as pd
6
from pkg_resources import resource_filename
7
from sklearn.metrics import auc
8
9
10
# Locations of the packaged CSV files, resolved relative to this module.

# Read by ``_parse_data`` as the per-group CDF table.
CDF_BY_RACE_PATH = resource_filename(
    __name__, 'transrisk_cdf_by_race_ssa.csv')

# Read by ``_parse_data`` as the per-group performance table.
PERFORMANCE_BY_RACE_PATH = resource_filename(
    __name__, 'transrisk_performance_by_race_ssa.csv')

# Read by ``_read_totals`` for the per-group head counts.
TOTAL_BY_RACE_PATH = resource_filename(
    __name__, 'totals.csv')
19
20
21
def _cleanup_frame(frame):
22
    """Rename and re-order columns."""
23
    frame = frame.rename(columns={'Non- Hispanic white': 'White'})
24
    frame = frame.reindex(['Asian', 'Black', 'Hispanic', 'White'],
25
                          axis=1)
26
    return frame
27
28
29
def _read_totals():
    """Read the total number of people of each group.

    The totals CSV is loaded with its first column as index, passed
    through ``_cleanup_frame``, and the ``SSA`` row is extracted.

    :return: Mapping from group name to the value in the ``SSA`` row.
    :rtype: dict
    """
    totals_frame = pd.read_csv(TOTAL_BY_RACE_PATH, index_col=0)
    totals_frame = _cleanup_frame(totals_frame)
    return {group: totals_frame.loc['SSA', group]
            for group in totals_frame.columns}
33
34
35
def _parse_data():
    """Parse the TransRisk CDF and performance tables.

    Both CSV files are read with their first column as index and
    passed through ``_cleanup_frame``. The values are treated as
    percentages: the performance table is complemented
    (``100 - value``) and both tables are scaled by ``1 / 100``.

    :return: Tuple ``(cdfs, performance)`` of fraction-valued frames.
    :rtype: tuple
    """
    cdf_percent = _cleanup_frame(
        pd.read_csv(CDF_BY_RACE_PATH, index_col=0))
    raw_performance_percent = _cleanup_frame(
        pd.read_csv(PERFORMANCE_BY_RACE_PATH, index_col=0))

    # The stored figure is the complement of the one exposed downstream.
    performance_percent = 100 - raw_performance_percent

    return (cdf_percent / 100, performance_percent / 100)
42
43
44
def _load_data():
    """Load all the raw inputs of the dataset.

    :return: Tuple ``(totals, cdfs_df, performance_df)``: the result
             of ``_read_totals`` followed by the two frames
             returned by ``_parse_data``.
    :rtype: tuple
    """
    group_totals = _read_totals()
    return (group_totals,) + _parse_data()
48
49
50
def _get_pdfs(cdfs_df):
51
    cdf_vs = np.concatenate([[np.zeros_like(cdfs_df.values[0])],
52
                             cdfs_df.values])
53
    pdf_vs = (cdf_vs[1:] - cdf_vs[:-1])
54
    pdfs_df = pd.DataFrame(pdf_vs,
55
                           columns=cdfs_df.columns, index=cdfs_df.index)
56
    return pdfs_df
57
58
59
def _calc_tpr_fpr(pdfs_df, performance_df):
60
    dfs = []
61
    for value in [performance_df, 1 - performance_df]:
62
        proportion_per_score = value * pdfs_df
63
64
        proportion_over_all_scores = proportion_per_score.sum(axis=0)
65
66
        cum_prop_per_score = proportion_per_score[::-1].cumsum(axis=0)[::-1]
67
68
        rate = cum_prop_per_score / proportion_over_all_scores
69
70
        # by sklean convention, thresholds[0]
71
        # represents no instances being predicted positive
72
        # and is arbitrarily set to max(y_score) + 1
73
        # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
74
        rate.loc[max(rate.index) + 1] = [0] * len(rate.columns)
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable max does not seem to be defined.
Loading history...
75
76
        dfs.append(rate)
77
78
    tpr_df, fpr_df = dfs  # pylint: disable=unbalanced-tuple-unpacking
79
    return tpr_df, fpr_df
80
81
82
def _build_rocs(fpr_df, tpr_df):
83
    rocs = {}
84
    for group in fpr_df.columns:
85
        fprs = fpr_df[group].values[::-1]
86
        tprs = tpr_df[group].values[::-1]
87
        thresholds = fpr_df.index[::-1]
88
89
        rocs[group] = (fprs,
90
                       tprs,
91
                       thresholds)
92
93
    return rocs
94
95
96
def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on
    a proprietary model created by FICO,
    hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome
    and prediction information per individual,
    but summarized statistics for each FICO score
    and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `total`       | Total number of individuals                          |
    +---------------+------------------------------------------------------+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt
          from https://github.com/fairmlbook/fairmlbook.github.io
        - https://fairmlbook.org/demographic.html#case-study-credit-scoring

    """

    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    total = sum(totals.values())

    # Reuse the grand total instead of re-summing it for every group,
    # and keep the loop variable distinct from ``total``.
    proportions = {group: group_total / total
                   for group, group_total in totals.items()}

    # Per-group base rate: performance weighted by the score distribution.
    base_rates = (pdfs_df * performance_df).sum()
    # Overall base rate: per-group base rates weighted by group size.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)

    aucs = {group: auc(fpr, tpr)
            for group, (fpr, tpr, _) in rocs.items()}

    return {'total': total,
            'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}
177