1
|
|
|
import json |
2
|
|
|
|
3
|
|
|
import numpy as np |
4
|
|
|
import pandas as pd |
5
|
|
|
from pkg_resources import resource_filename, resource_stream |
6
|
|
|
|
7
|
|
|
from responsibly.dataset.core import Dataset |
8
|
|
|
|
9
|
|
|
|
10
|
|
|
__all__ = ['GermanDataset']

# Path to the bundled raw UCI data file.
GERMAN_PATH = resource_filename(__name__, 'german.data')

# Per-column mapping from the dataset's coded values (e.g. 'A11')
# to human-readable labels, shipped as package data.
VALUES_MAPS = json.load(resource_stream(__name__, 'values_maps.json'))
20
|
|
|
# Column names for the raw space-separated data file, in file order;
# the last column, 'credit', is the classification target.
COLUMN_NAMES = [
    'status', 'duration', 'credit_history', 'purpose', 'credit_amount',
    'savings', 'present_employment', 'installment_rate', 'status_sex',
    'other_debtors', 'present_residence_since', 'property', 'age',
    'installment_plans', 'housing', 'number_of_existing_credits', 'job',
    'number_of_people_liable_for', 'telephone', 'foreign_worker', 'credit',
]
30
|
|
|
class GermanDataset(Dataset):
    """German Credit Dataset.

    See :class:`~responsibly.dataset.Dataset` for a description of
    the arguments and attributes.

    References:
        - https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
        - Kamiran, F., & Calders, T. (2009, February).
          Classifying without discriminating.
          In 2009 2nd International Conference on Computer, Control
          and Communication (pp. 1-6). IEEE.
          http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.182.6067&rep=rep1&type=pdf

    Extra
        This dataset requires use of a cost matrix (see below)

        ::

              1   2
            ----
        1 | 0   1
          |----
        2 | 5   0

        (1 = Good, 2 = Bad)

        The rows represent the actual classification
        and the columns the predicted classification.
        It is worse to class a customer as good when they are bad (5),
        than it is to class a customer as bad when they are good (1).

    """

    def __init__(self):
        # Target is the binary credit rating; the sensitive attribute is
        # the age group created in :meth:`_preprocess`.
        super().__init__(target='credit',
                         sensitive_attributes=['age_factor'])
        # Misclassification costs from the UCI documentation:
        # rows = actual class, columns = predicted class (1 = Good, 2 = Bad).
        self.cost_matrix = [[0, 1], [5, 0]]

    def _load_data(self):
        """Read the raw space-separated UCI data file into a DataFrame."""
        return pd.read_csv(GERMAN_PATH, sep=' ', names=COLUMN_NAMES,
                           header=None, index_col=False)

    def _preprocess(self):
        """Perform the same preprocessing as the dataset doc file."""
        # The target is categorical (1 = good, 2 = bad), not numeric.
        self.df['credit'] = self.df['credit'].astype(str)

        # Translate the coded values (e.g. 'A11') into readable labels.
        for col, translation in VALUES_MAPS.items():
            self.df[col] = self.df[col].map(translation)

        new_column_names = COLUMN_NAMES[:]

        # Split the combined 'status_sex' column into two separate columns.
        # ``expand=True`` replaces the old ``.str`` tuple-unpacking idiom
        # (``a, b = series.str.split(...).str``), which was removed in
        # pandas 1.0; the result is identical on versions where the old
        # idiom still worked.
        # NOTE(review): this assignment overwrites the original checking
        # account 'status' column with the personal status extracted here,
        # so the final frame carries a duplicated 'status' label (see the
        # reindexing below) — preserved as-is for backward compatibility.
        split = self.df['status_sex'].str.split(' : ', expand=True)
        self.df['status'] = split[0]
        self.df['sex'] = split[1]
        self.df = self.df.drop('status_sex', axis=1)

        status_sex_index = new_column_names.index('status_sex')
        new_column_names[status_sex_index:status_sex_index + 1] = \
            ['status', 'sex']

        # Bin age into two groups, [19, 25) and [25, 76), following
        # Kamiran & Calders (2009); ``right=False`` makes the bins
        # left-closed.
        self.df['age_factor'] = pd.cut(self.df['age'],
                                       [19, 25, 76],
                                       right=False)
        age_factor_index = new_column_names.index('age') + 1
        new_column_names.insert(age_factor_index, 'age_factor')

        # Reorder the columns. 'status' appears twice in the name list, so
        # the (overwritten) status column is selected twice, yielding the
        # 23 columns asserted in :meth:`_validate`.
        self.df = self.df[new_column_names]

    def _validate(self):
        # pylint: disable=line-too-long
        super()._validate()

        # Sanity checks against the published dataset: 1000 rows, and 23
        # columns after preprocessing (including the duplicated 'status').
        assert len(self.df) == 1000, 'the number of rows should be 1000,'\
                                     ' but it is {}.'.format(len(self.df))
        assert len(self.df.columns) == 23, 'the number of columns should be 23,'\
                                           ' but it is {}.'.format(len(self.df.columns))
        assert not self.df.isnull().any().any(), 'there are null values.'
        assert self.df['age_factor'].nunique() == 2,\
            'age_factor should have only 2 unique values,'\
            ' but it is{}'.format(self.df['age_factor'].nunique())