1
|
|
|
import json |
2
|
|
|
|
3
|
|
|
import numpy as np |
4
|
|
|
import pandas as pd |
5
|
|
|
from pkg_resources import resource_filename, resource_stream |
6
|
|
|
|
7
|
|
|
from responsibly.dataset.core import Dataset |
8
|
|
|
|
9
|
|
|
|
10
|
|
|
__all__ = ['GermanDataset']

# Path to the bundled raw UCI data file.
GERMAN_PATH = resource_filename(__name__, 'german.data')

# Per-column mapping from the dataset's coded values (e.g. 'A11')
# to human-readable labels, shipped as package data.
VALUES_MAPS = json.load(resource_stream(__name__, 'values_maps.json'))
20
|
|
|
# Column names for the raw space-separated data file, in file order;
# the last column, 'credit', is the classification target.
COLUMN_NAMES = [
    'status', 'duration', 'credit_history', 'purpose', 'credit_amount',
    'savings', 'present_employment', 'installment_rate', 'status_sex',
    'other_debtors', 'present_residence_since', 'property', 'age',
    'installment_plans', 'housing', 'number_of_existing_credits', 'job',
    'number_of_people_liable_for', 'telephone', 'foreign_worker', 'credit',
]
30
|
|
|
class GermanDataset(Dataset):
    """German Credit Dataset.

    See :class:`~responsibly.dataset.Dataset` for a description of
    the arguments and attributes.

    References:
        - https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
        - Kamiran, F., & Calders, T. (2009, February).
          Classifying without discriminating.
          In 2009 2nd International Conference on Computer, Control
          and Communication (pp. 1-6). IEEE.
          http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.182.6067&rep=rep1&type=pdf

    Extra
        This dataset requires use of a cost matrix (see below)

        ::

              1   2
            ----
        1 | 0   1
          |----
        2 | 5   0

        (1 = Good, 2 = Bad)

        The rows represent the actual classification
        and the columns the predicted classification.
        It is worse to class a customer as good when they are bad (5),
        than it is to class a customer as bad when they are good (1).

    """

    def __init__(self):
        # Target is the binary credit rating; the sensitive attribute is
        # the age group created in :meth:`_preprocess`.
        super().__init__(target='credit',
                         sensitive_attributes=['age_factor'])
        # Misclassification costs from the UCI documentation:
        # rows = actual class, columns = predicted class (1 = Good, 2 = Bad).
        self.cost_matrix = [[0, 1], [5, 0]]

    def _load_data(self):
        """Read the raw space-separated UCI data file into a DataFrame."""
        return pd.read_csv(GERMAN_PATH, sep=' ', names=COLUMN_NAMES,
                           header=None, index_col=False)

    def _preprocess(self):
        """Perform the same preprocessing as the dataset doc file."""
        # The target is categorical (1 = good, 2 = bad), not numeric.
        self.df['credit'] = self.df['credit'].astype(str)

        # Translate the coded values (e.g. 'A11') into readable labels.
        for col, translation in VALUES_MAPS.items():
            self.df[col] = self.df[col].map(translation)

        new_column_names = COLUMN_NAMES[:]

        # Split the combined 'status_sex' column into two separate columns.
        # ``expand=True`` replaces the old ``.str`` tuple-unpacking idiom
        # (``a, b = series.str.split(...).str``), which was removed in
        # pandas 1.0; the result is identical on versions where the old
        # idiom still worked.
        # NOTE(review): this assignment overwrites the original checking
        # account 'status' column with the personal status extracted here,
        # so the final frame carries a duplicated 'status' label (see the
        # reindexing below) — preserved as-is for backward compatibility.
        split = self.df['status_sex'].str.split(' : ', expand=True)
        self.df['status'] = split[0]
        self.df['sex'] = split[1]
        self.df = self.df.drop('status_sex', axis=1)

        status_sex_index = new_column_names.index('status_sex')
        new_column_names[status_sex_index:status_sex_index + 1] = \
            ['status', 'sex']

        # Bin age into two groups, [19, 25) and [25, 76), following
        # Kamiran & Calders (2009); ``right=False`` makes the bins
        # left-closed.
        self.df['age_factor'] = pd.cut(self.df['age'],
                                       [19, 25, 76],
                                       right=False)
        age_factor_index = new_column_names.index('age') + 1
        new_column_names.insert(age_factor_index, 'age_factor')

        # Reorder the columns. 'status' appears twice in the name list, so
        # the (overwritten) status column is selected twice, yielding the
        # 23 columns asserted in :meth:`_validate`.
        self.df = self.df[new_column_names]

    def _validate(self):
        # pylint: disable=line-too-long
        super()._validate()

        # Sanity checks against the published dataset: 1000 rows, and 23
        # columns after preprocessing (including the duplicated 'status').
        assert len(self.df) == 1000, 'the number of rows should be 1000,'\
                                     ' but it is {}.'.format(len(self.df))
        assert len(self.df.columns) == 23, 'the number of columns should be 23,'\
                                           ' but it is {}.'.format(len(self.df.columns))
        assert not self.df.isnull().any().any(), 'there are null values.'
        assert self.df['age_factor'].nunique() == 2,\
            'age_factor should have only 2 unique values,'\
            ' but it is{}'.format(self.df['age_factor'].nunique())