| Conditions | 1 |
| Total Lines | 81 |
| Code Lines | 24 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | __all__ = ['build_FICO_dataset'] |
||
def build_FICO_dataset():
    """Build the FICO dataset.

    Dataset of the credit score of TransUnion (called TransRisk).
    The TransRisk score is in turn based on
    a proprietary model created by FICO,
    hence often referred to as FICO scores.

    The data is *aggregated*, i.e., there is no outcome
    and prediction information per individual,
    but summarized statistics for each FICO score
    and race/ethnicity group.

    +---------------+------------------------------------------------------+
    | FICO key      | Meaning                                              |
    +===============+======================================================+
    | `total`       | Total number of individuals                          |
    +---------------+------------------------------------------------------+
    | `totals`      | Number of individuals per group                      |
    +---------------+------------------------------------------------------+
    | `cdf`         | Cumulative distribution function of score per group  |
    +---------------+------------------------------------------------------+
    | `pdf`         | Probability distribution function of score per group |
    +---------------+------------------------------------------------------+
    | `performance` | Fraction of non-defaulters per score and group       |
    +---------------+------------------------------------------------------+
    | `base_rates`  | Base rate of non-defaulters per group                |
    +---------------+------------------------------------------------------+
    | `base_rate`   | The overall base rate of non-defaulters              |
    +---------------+------------------------------------------------------+
    | `proportions` | Fraction of individuals per group                    |
    +---------------+------------------------------------------------------+
    | `fpr`         | False Positive Rate by score as threshold per group  |
    +---------------+------------------------------------------------------+
    | `tpr`         | True Positive Rate by score as threshold per group   |
    +---------------+------------------------------------------------------+
    | `rocs`        | ROC per group                                        |
    +---------------+------------------------------------------------------+
    | `aucs`        | ROC AUC per group                                    |
    +---------------+------------------------------------------------------+

    :return: Dictionary of various aggregated statistics
             of the FICO credit score.
    :rtype: dict

    References:
        - Based on code (MIT License) by Moritz Hardt
          from https://github.com/fairmlbook/fairmlbook.github.io
        - https://fairmlbook.org/demographic.html#case-study-credit-scoring

    """

    # NOTE(review): `totals` is used as a mapping of group -> count below;
    # the two frames are presumably indexed by score with one column per
    # group -- confirm against `_load_data`.
    totals, cdfs_df, performance_df = _load_data()
    pdfs_df = _get_pdfs(cdfs_df)

    total = sum(totals.values())

    # Reuse the grand total instead of re-summing for every group, and keep
    # the loop variable distinct from `total` so it cannot be confused with it.
    proportions = {group: group_total / total
                   for group, group_total in totals.items()}

    # Per-group base rate: the performance weighted by the score
    # distribution, summed over the frame's index.
    base_rates = (pdfs_df * performance_df).sum()
    # Overall base rate: the per-group base rates weighted by group share.
    base_rate = (base_rates * pd.Series(proportions)).sum()

    tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df)
    rocs = _build_rocs(fpr_df, tpr_df)

    # Each ROC entry is unpacked as a 3-tuple; only the first two items
    # (fpr, tpr) are needed for the area under the curve.
    aucs = {group: auc(fpr, tpr)
            for group, (fpr, tpr, _) in rocs.items()}

    return {'total': total,
            'totals': totals,
            'cdf': cdfs_df,
            'pdf': pdfs_df,
            'performance': performance_df,
            'base_rates': base_rates,
            'base_rate': base_rate,
            'proportions': proportions,
            'fpr': fpr_df,
            'tpr': tpr_df,
            'rocs': rocs,
            'aucs': aucs}
||
| 177 |