| Conditions | 1 |
| Total Lines | 76 |
| Code Lines | 22 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
| 1 | __all__ = ['build_FICO_dataset'] |
||
| 87 | def build_FICO_dataset(): |
||
| 88 | """Build the FICO dataset. |
||
| 89 | |||
| 90 | Dataset of the credit score of TransUnion (called TransRisk). |
||
| 91 | The TransRisk score is in turn based on |
||
| 92 | a proprietary model created by FICO, |
||
| 93 | hence often referred to as FICO scores. |
||
| 94 | |||
| 95 | The data is *aggregated*, i.e., there is no outcome |
||
| 96 | and prediction information per individual, |
||
| 97 | but summarized statistics for each FICO score |
||
| 98 | and race/ethnicity group. |
||
| 99 | |||
| 100 | +---------------+------------------------------------------------------+ |
||
| 101 | | FICO key | Meaning | |
||
| 102 | +===============+======================================================+ |
||
| 103 | | `totals` | Number of individuals per group | |
||
| 104 | +---------------+------------------------------------------------------+ |
||
| 105 | | `cdf` | Cumulative distribution function of score per group | |
||
| 106 | +---------------+------------------------------------------------------+ |
||
| 107 | | `pdf` | Probability distribution function of score per group | |
||
| 108 | +---------------+------------------------------------------------------+ |
||
| 109 | | `performance` | Fraction of non-defaulters per score and group | |
||
| 110 | +---------------+------------------------------------------------------+ |
||
| 111 | | `base_rates` | Base rate of non-defaulters per group | |
||
| 112 | +---------------+------------------------------------------------------+ |
||
| 113 | | `base_rate` | The overall base rate of non-defaulters | |
||
| 114 | +---------------+------------------------------------------------------+ |
||
| 115 | | `proportions` | Fraction of individuals per group | |
||
| 116 | +---------------+------------------------------------------------------+ |
||
| 117 | | `fpr` | False Positive Rate by score as threshold per group | |
||
| 118 | +---------------+------------------------------------------------------+ |
||
| 119 | | `tpr` | True Positive Rate by score as threshold per group | |
||
| 120 | +---------------+------------------------------------------------------+ |
||
| 121 | | `rocs` | ROC per group | |
||
| 122 | +---------------+------------------------------------------------------+ |
||
| 123 | | `aucs` | ROC AUC per group | |
||
| 124 | +---------------+------------------------------------------------------+ |
||
| 125 | |||
| 126 | :return: Dictionary of various aggregated statistics |
||
| 127 | of the FICO credit score. |
||
| 128 | :rtype: dict |
||
| 129 | |||
| 130 | References: |
||
| 131 | - Based on code (MIT License) by Moritz Hardt |
||
| 132 | from https://github.com/fairmlbook/fairmlbook.github.io |
||
| 133 | - https://fairmlbook.org/demographic.html#case-study-credit-scoring |
||
| 134 | |||
| 135 | """ |
||
| 136 | |||
| 137 | totals, cdfs_df, performance_df = _load_data() |
||
| 138 | pdfs_df = _get_pdfs(cdfs_df) |
||
| 139 | |||
| 140 | proportions = {group: total / sum(totals.values()) |
||
| 141 | for group, total in totals.items()} |
||
| 142 | |||
| 143 | base_rates = (pdfs_df * performance_df).sum() |
||
| 144 | base_rate = (base_rates * pd.Series(proportions)).sum() |
||
| 145 | |||
| 146 | tpr_df, fpr_df = _calc_tpr_fpr(pdfs_df, performance_df) |
||
| 147 | rocs = _build_rocs(fpr_df, tpr_df) |
||
| 148 | |||
| 149 | aucs = {group: auc(fpr, tpr) for group, (fpr, tpr, _) |
||
| 150 | in rocs.items()} |
||
| 151 | |||
| 152 | return {'totals': totals, |
||
| 153 | 'cdf': cdfs_df, |
||
| 154 | 'pdf': pdfs_df, |
||
| 155 | 'performance': performance_df, |
||
| 156 | 'base_rates': base_rates, |
||
| 157 | 'base_rate': base_rate, |
||
| 158 | 'proportions': proportions, |
||
| 159 | 'fpr': fpr_df, |
||
| 160 | 'tpr': tpr_df, |
||
| 161 | 'rocs': rocs, |
||
| 162 | 'aucs': aucs} |
||
| 163 |