Passed
Push — master ( 170db5...8af2aa )
by Shlomi
02:43 queued 58s
created

ethically.dataset.german.GermanDataset.__init__()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 4
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
import json

import numpy as np
import pandas as pd
from pkg_resources import resource_filename, resource_stream

from ethically.dataset.core import Dataset

__all__ = ['GermanDataset']

# Filesystem path of the raw German Credit data file bundled with the package.
GERMAN_PATH = resource_filename(__name__, 'german.data')

# Mapping from the dataset's coded values (e.g. 'A11') to readable labels,
# loaded from the JSON file shipped alongside the data.
VALUES_MAPS = json.load(resource_stream(__name__, 'values_maps.json'))

# Column names for the raw data file, in file order
# (the file itself has no header row).
COLUMN_NAMES = ['status', 'duration', 'credit_history', 'purpose',
                'credit_amount', 'savings', 'present_employment',
                'installment_rate', 'status_sex', 'other_debtors',
                'present_residence_since', 'property', 'age',
                'installment_plans', 'housing',
                'number_of_existing_credits', 'job',
                'number_of_people_liable_for', 'telephone',
                'foreign_worker', 'credit']
28
29
30
class GermanDataset(Dataset):
    """German Credit Dataset.

    See :class:`~ethically.dataset.Dataset` for a description of
    the arguments and attributes.

    References:
        - https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
        - Kamiran, F., & Calders, T. (2009, February).
          Classifying without discriminating.
          In 2009 2nd International Conference on Computer, Control
          and Communication (pp. 1-6). IEEE.
          http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.182.6067&rep=rep1&type=pdf

    Extra
        This dataset requires use of a cost matrix (see below)

        ::

               1 2
               ----
            1 | 0 1
              |----
            2 | 5 0

        (1 = Good, 2 = Bad)

        The rows represent the actual classification
        and the columns the predicted classification.
        It is worse to class a customer as good when they are bad (5),
        than it is to class a customer as bad when they are good (1).

    """

    def __init__(self):
        super().__init__(target='credit',
                         sensitive_attributes=['age_factor'])
        # Misclassification costs from the dataset documentation:
        # rows = actual class, columns = predicted class.
        self.cost_matrix = [[0, 1], [5, 0]]

    def _load_data(self):
        """Read the raw space-separated data file into a DataFrame."""
        return pd.read_csv(GERMAN_PATH, sep=' ', names=COLUMN_NAMES,
                           header=None, index_col=False)

    def _preprocess(self):
        """Perform the same preprocessing as the dataset doc file."""
        # Treat the target as a categorical label rather than a number.
        self.df['credit'] = self.df['credit'].astype(str)

        # Translate coded values (e.g. 'A11') to readable labels.
        for col, translation in VALUES_MAPS.items():
            self.df[col] = self.df[col].map(translation)

        new_column_names = COLUMN_NAMES[:]

        # Split the combined 'status_sex' column into two separate columns.
        # Use expand=True rather than unpacking the `.str` accessor of the
        # split result: that idiom was deprecated in pandas 0.25 and removed
        # in later versions, so the old form raises on modern pandas.
        status_sex = self.df['status_sex'].str.split(' : ', expand=True)
        self.df['status'] = status_sex[0]
        self.df['sex'] = status_sex[1]
        self.df = self.df.drop('status_sex', axis=1)

        # Preserve column order: replace 'status_sex' in place
        # with 'status' followed by 'sex'.
        status_sex_index = new_column_names.index('status_sex')
        new_column_names[status_sex_index:status_sex_index + 1] = \
            ['status', 'sex']

        # Bin age into two half-open groups, [19, 25) and [25, 76),
        # and insert the new column right after 'age'.
        self.df['age_factor'] = pd.cut(self.df['age'],
                                       [19, 25, 76],
                                       right=False)
        age_factor_index = new_column_names.index('age') + 1
        new_column_names.insert(age_factor_index, 'age_factor')

        self.df = self.df[new_column_names]

    def _validate(self):
        """Check the preprocessed DataFrame matches the published dataset.

        Raises AssertionError if the row/column counts, null values, or
        the number of age bins do not match the expected dataset shape.
        """
        # pylint: disable=line-too-long
        super()._validate()

        assert len(self.df) == 1000, 'the number of rows should be 1000,'\
                                     ' but it is {}.'.format(len(self.df))
        assert len(self.df.columns) == 23, 'the number of columns should be 23,'\
                                           ' but it is {}.'.format(len(self.df.columns))
        assert not self.df.isnull().any().any(), 'there are null values.'
        # Fixed message formatting (missing space and period) for
        # consistency with the assertions above.
        assert self.df['age_factor'].nunique() == 2,\
            'age_factor should have only 2 unique values,'\
            ' but it is {}.'.format(self.df['age_factor'].nunique())
113