responsibly.dataset.compas - Code Metrics - Inspection of "Useful threshold api" - ResponsiblyAI/responsibly - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#31)

by Shlomi

created 2019-08-04 02:48 UTC

responsibly.dataset.compas A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	64
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	36
dl	0
loc	64
rs	10
c	0
b	0
f	0
wmc	4

4 Methods

Rating	Name	Size	Complexity
A	COMPASDataset.__init__()	5	1
A	COMPASDataset._load_data()	2	1
A	COMPASDataset._preprocess()	21	1
A	COMPASDataset._validate()	8	1

# Public API of this module: ``from <module> import *`` exports only
# the dataset class.
__all__ = ['COMPASDataset']

import numpy as np
import pandas as pd
from pkg_resources import resource_filename

from responsibly.dataset.core import Dataset


# Filesystem path of the raw ProPublica CSV file, resolved by
# ``pkg_resources`` relative to this module's package.
COMPAS_PATH = resource_filename(__name__, 'compas-scores-two-years.csv')


class COMPASDataset(Dataset):
    """ProPublica Recidivism/COMPAS Dataset.

    The raw records are read from the CSV file at ``COMPAS_PATH`` and
    filtered in :meth:`_preprocess`, which also derives the
    ``length_of_stay``, ``score_factor`` and ``y_pred`` columns.

    The dataset is registered with ``is_recid`` as the target,
    ``race`` and ``sex`` as the sensitive attributes, and ``y_pred``,
    ``score_factor`` and ``score_text`` as the prediction columns.

    See :class:`~responsibly.dataset.Dataset` for a description of
    the arguments and attributes.

    References:
        https://github.com/propublica/compas-analysis

    """

    def __init__(self):
        """Initialize the dataset with its fixed column roles."""
        super().__init__(target='is_recid',
                         sensitive_attributes=['race', 'sex'],
                         prediction=['y_pred', 'score_factor',
                                     'score_text'])

    def _load_data(self):
        """Read the raw CSV file at ``COMPAS_PATH`` into a data frame."""
        return pd.read_csv(COMPAS_PATH)

    def _preprocess(self):
        """Perform the same preprocessing as the original analysis.

        Rows are kept only when all of the following hold:

        - ``days_b_screening_arrest`` is between -30 and 30 (inclusive),
        - ``is_recid`` is not ``-1``,
        - ``c_charge_degree`` is not ``'O'``,
        - ``score_text`` is not ``'N/A'``.

        Three columns are then derived:

        - ``length_of_stay``: ``c_jail_out - c_jail_in`` after both
          columns are converted to datetimes,
        - ``score_factor``: ``'LowScore'`` when ``score_text`` is
          ``'Low'``, otherwise ``'HighScore'``,
        - ``y_pred``: boolean, ``True`` where ``score_factor`` is
          ``'HighScore'``.

        https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
        """

        keep = ((self.df['days_b_screening_arrest'] <= 30)
                & (self.df['days_b_screening_arrest'] >= -30)
                & (self.df['is_recid'] != -1)
                & (self.df['c_charge_degree'] != 'O')
                & (self.df['score_text'] != 'N/A'))

        # Take an explicit copy of the filtered rows: assigning new
        # columns to a boolean-mask selection is chained assignment and
        # makes pandas emit ``SettingWithCopyWarning``.
        self.df = self.df[keep].copy()

        self.df['c_jail_out'] = pd.to_datetime(self.df['c_jail_out'])
        self.df['c_jail_in'] = pd.to_datetime(self.df['c_jail_in'])
        self.df['length_of_stay'] = (self.df['c_jail_out']
                                     - self.df['c_jail_in'])

        # Every score text other than 'Low' is treated as a high score.
        self.df['score_factor'] = np.where(self.df['score_text']
                                           != 'Low',
                                           'HighScore', 'LowScore')
        self.df['y_pred'] = (self.df['score_factor'] == 'HighScore')

    def _validate(self):
        """Run the parent validation, then check the expected shape.

        Raises:
            AssertionError: if the preprocessed data frame does not
                have exactly 6172 rows and 56 columns.
        """
        super()._validate()

        n_rows = len(self.df)
        n_columns = len(self.df.columns)

        assert n_rows == 6172, ('the number of rows should be 6172,'
                                ' but it is {}.'.format(n_rows))
        assert n_columns == 56, ('the number of columns should be 56,'
                                 ' but it is {}.'.format(n_columns))


1			__all__ = ['COMPASDataset']
2
3			import numpy as np
4			import pandas as pd
5			from pkg_resources import resource_filename
6
7			from responsibly.dataset.core import Dataset
8
9
10			COMPAS_PATH = resource_filename(__name__,
11			'compas-scores-two-years.csv')
12
13
14			class COMPASDataset(Dataset):
15			"""ProPublica Recidivism/COMPAS Dataset.
16
17			See :class:`~responsibly.dataset.Dataset` for a description of
18			the arguments and attributes.
19
20			References:
21			https://github.com/propublica/compas-analysis
22
23			"""
24
25			def __init__(self):
26			super().__init__(target='is_recid',
27			sensitive_attributes=['race', 'sex'],
28			prediction=['y_pred', 'score_factor',
29			'score_text'])
30
31			def _load_data(self):
32			return pd.read_csv(COMPAS_PATH)
33
34			def _preprocess(self):
35			"""Perform the same preprocessing as the original analysis.
36
37			https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb
38			"""
39
40			self.df = self.df[(self.df['days_b_screening_arrest'] <= 30)
41			& (self.df['days_b_screening_arrest'] >= -30)
42			& (self.df['is_recid'] != -1)
43			& (self.df['c_charge_degree'] != 'O')
44			& (self.df['score_text'] != 'N/A')]
45
46			self.df['c_jail_out'] = pd.to_datetime(self.df['c_jail_out'])
47			self.df['c_jail_in'] = pd.to_datetime(self.df['c_jail_in'])
48			self.df['length_of_stay'] = (self.df['c_jail_out']
49			- self.df['c_jail_in'])
50
51			self.df['score_factor'] = np.where(self.df['score_text']
52			!= 'Low',
53			'HighScore', 'LowScore')
54			self.df['y_pred'] = (self.df['score_factor'] == 'HighScore')
55
56			def _validate(self):
57			# pylint: disable=line-too-long
58			super()._validate()
59
60			assert len(self.df) == 6172, 'the number of rows should be 6172,'\
61			' but it is {}.'.format(len(self.df))
62			assert len(self.df.columns) == 56, 'the number of columns should be 56,'\
63			' but it is {}.'.format(len(self.df.columns))
64

ResponsiblyAI / responsibly

Pull Request — master (#31)

responsibly.dataset.compas A

Complexity

Size/Duplication

Importance

4 Methods

Duplication Side-by-Side

Filter issues like