klib.tests.test_util.Test__drop_duplicates.test__drop_dupl() - Code Metrics - Inspection of "cat_plot refinements and test updates" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 5a4fe9...c92c0e )

by Andreas

created 2020-08-03 08:12 UTC

Test__drop_duplicates.test__drop_dupl() A

↳ Parent: klib.tests.test_util

Complexity

Conditions

Size

Total Lines	7
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	4
nop	1
dl	0
loc	7
rs	10
c	0
b	0
f	0

import numpy as np
import pandas as pd
import unittest
from ..utils import (
    _corr_selector,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_int,
    _validate_input_range,
    _validate_input_smaller,
    _validate_input_sum_smaller,
    _validate_input_sum_larger,
)


class Test__corr_selector(unittest.TestCase):
    """Exercise ``_corr_selector`` on a correlation matrix and on a correlation vector."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x6 numeric frame and a 6-element target series shared by all tests."""
        rows = [
            [1, 7, 2, 2, 4, 7],
            [3, 8, 3, 3, 7, 1],
            [5, 7, 9, 5, 1, 4],
            [1, 7, 8, 6, 1, 8],
            [1, 7, 5, 6, 2, 6],
            [2, 7, 3, 3, 5, 3],
        ]
        cls.df_data_corr = pd.DataFrame(rows)
        cls.target = pd.Series([1, 2, 4, 7, 4, 2])

    def test__corr_selector_matrix(self):
        """Check the shape and the NaN count of the selected correlation matrix."""
        unfiltered = _corr_selector(self.df_data_corr.corr())
        self.assertEqual(unfiltered.shape, (6, 6))

        # (keyword arguments, expected number of NaN cells in the result)
        cases = (
            ({"split": "pos"}, 18),
            ({"split": "pos", "threshold": 0.5}, 26),
            ({"split": "neg", "threshold": -0.75}, 32),
            ({"split": "high", "threshold": 0.15}, 4),
            ({"split": "low", "threshold": 0.85}, 6),
        )
        for options, nan_cells in cases:
            selected = _corr_selector(self.df_data_corr.corr(), **options)
            self.assertEqual(selected.isna().sum().sum(), nan_cells)

    def test__corr_selector_label(self):
        """Check the shape and the NaN count of the selected correlations with the target."""
        unfiltered = _corr_selector(self.df_data_corr.corrwith(self.target))
        self.assertEqual(unfiltered.shape, (6,))

        # (keyword arguments, expected number of NaN entries in the result)
        cases = (
            ({"split": "pos"}, 3),
            ({"split": "pos", "threshold": 0.8}, 4),
            ({"split": "neg", "threshold": -0.7}, 5),
            ({"split": "high", "threshold": 0.2}, 1),
            ({"split": "low", "threshold": 0.8}, 2),
        )
        for options, nan_entries in cases:
            selected = _corr_selector(self.df_data_corr.corrwith(self.target), **options)
            self.assertEqual(selected.isna().sum(), nan_entries)


class Test__drop_duplicates(unittest.TestCase):
    """Tests for ``_drop_duplicates``."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build the shared fixture: 7 rows, of which 3 repeat an earlier row.

        Rows 2 and 3 repeat row 1, and row 6 repeats the all-NA row 0, so
        4 distinct rows remain once duplicates are removed.
        """
        cls.data_dupl_df = pd.DataFrame(
            [
                [pd.NA, pd.NA, pd.NA, pd.NA],
                [1, 2, 3, 4],
                [1, 2, 3, 4],
                [1, 2, 3, 4],
                [2, 3, 4, 5],
                [1, 2, 3, pd.NA],
                [pd.NA, pd.NA, pd.NA, pd.NA],
            ]
        )

    def test__drop_dupl(self):
        """Check the deduplicated frame and the reported duplicates."""
        result = _drop_duplicates(self.data_dupl_df)
        deduplicated, duplicates = result[0], result[1]

        # Test dropping of duplicate rows. Shapes are tuples, so they need an
        # exact comparison: assertAlmostEqual cannot subtract tuples and would
        # raise TypeError instead of reporting a mismatch.
        self.assertEqual(deduplicated.shape, (4, 4))
        # Test if the resulting DataFrame is equal to using the pandas method
        self.assertTrue(deduplicated.equals(self.data_dupl_df.drop_duplicates()))
        # Test number of duplicates
        self.assertEqual(len(duplicates), 3)


class Test__missing_vals(unittest.TestCase):
    """Tests for the entries of the mapping returned by ``_missing_vals``."""

    @classmethod
    def setUpClass(cls):
        """Build the same 4x4 fixture as a nested list, a DataFrame and an ndarray.

        The data holds five missing markers in total (np.nan, two None,
        pd.NA and pd.NaT).
        """
        cls.data_mv_list = [
            [1, np.nan, 3, 4],
            [None, 4, 5, None],
            ["a", "b", pd.NA, "d"],
            [True, False, 7, pd.NaT],
        ]

        cls.data_mv_df = pd.DataFrame(cls.data_mv_list)

        cls.data_mv_array = np.array(cls.data_mv_list)

    def test_mv_total(self):
        """The total count must be the same for DataFrame, ndarray and list input."""
        # Test total missing values
        for data in (self.data_mv_df, self.data_mv_array, self.data_mv_list):
            self.assertAlmostEqual(_missing_vals(data)["mv_total"], 5)

    def test_mv_rows(self):
        """Check the missing-value count of every row."""
        # Test missing values for each row
        mv_rows = _missing_vals(self.data_mv_df)["mv_rows"]
        for i, expected in enumerate([1, 2, 1, 1]):
            self.assertAlmostEqual(mv_rows[i], expected)

    def test_mv_cols(self):
        """Check the missing-value count of every column."""
        # Test missing values for each column
        mv_cols = _missing_vals(self.data_mv_df)["mv_cols"]
        for i, expected in enumerate([1, 1, 1, 2]):
            self.assertAlmostEqual(mv_cols[i], expected)

    def test_mv_rows_ratio(self):
        """Check the missing-value ratio of every row and that it lies in [0, 1]."""
        mv_rows_ratio = _missing_vals(self.data_mv_df)["mv_rows_ratio"]

        # Test missing values ratio for each row
        for i, expected in enumerate([0.25, 0.5, 0.25, 0.25]):
            self.assertAlmostEqual(mv_rows_ratio[i], expected)

        # Test if missing value ratio is between 0 and 1
        for i in range(self.data_mv_df.shape[0]):
            self.assertTrue(0 <= mv_rows_ratio[i] <= 1)

    def test_mv_cols_ratio(self):
        """Check the missing-value ratio of every column and that it lies in [0, 1]."""
        mv_cols_ratio = _missing_vals(self.data_mv_df)["mv_cols_ratio"]

        # Test missing values ratio for each column
        for i, expected in enumerate([1 / 4, 0.25, 0.25, 0.5]):
            self.assertAlmostEqual(mv_cols_ratio[i], expected)

        # Test if missing value ratio is between 0 and 1. Iterate over the
        # number of columns (shape[1]), not the number of rows, so the check
        # stays correct if the fixture ever becomes non-square.
        for i in range(self.data_mv_df.shape[1]):
            self.assertTrue(0 <= mv_cols_ratio[i] <= 1)


class Test__validate_input(unittest.TestCase):
    """Each ``_validate_input_*`` helper must reject the invalid arguments listed here."""

    def test__validate_input_bool(self):
        """Values that are not booleans are expected to raise TypeError."""
        for not_a_bool in ("True", None, 1):
            with self.assertRaises(TypeError):
                _validate_input_bool(not_a_bool, None)

    def test__validate_input_int(self):
        """Values that are not integers are expected to raise TypeError."""
        for not_an_int in (1.1, [1], "1"):
            with self.assertRaises(TypeError):
                _validate_input_int(not_an_int, None)

    def test__validate_input_smaller(self):
        """A first value larger than the second is expected to raise ValueError."""
        for first, second in ((0.3, 0.2), (3, 2), (5, -3)):
            with self.assertRaises(ValueError):
                _validate_input_smaller(first, second, None)

    def test__validate_input_range(self):
        """Values outside 0..1 raise ValueError; a string value raises TypeError."""
        cases = (
            (ValueError, -0.1, "value -0.1"),
            (ValueError, 1.1, "value 1.1"),
            (TypeError, "1", "value string"),
        )
        for expected_error, value, label in cases:
            with self.assertRaises(expected_error):
                _validate_input_range(value, label, 0, 1)

    def test__validate_input_sum_smaller(self):
        """Each argument set below is expected to raise ValueError."""
        rejected = (
            (1, "Test Sum <= 1", 1.01),
            (1, "Test Sum <= 1", 0.3, 0.2, 0.4, 0.5),
            (-1, "Test Sum <= -1", -0.2, -0.7),
            (10, "Test Sum <= 10", 20, -11, 2),
        )
        for arguments in rejected:
            with self.assertRaises(ValueError):
                _validate_input_sum_smaller(*arguments)

    def test__validate_input_sum_larger(self):
        """Each argument set below is expected to raise ValueError."""
        rejected = (
            (1, "Test Sum >= 1", 0.99),
            (1, "Test Sum >= 1", 0.9, 0.05),
            (-2, "Test Sum >=-2", -3),
            (7, "Test Sum >= 7", 1, 2, 3),
        )
        for arguments in rejected:
            with self.assertRaises(ValueError):
                _validate_input_sum_larger(*arguments)


1			import numpy as np
2			import pandas as pd
3			import unittest
4			from ..utils import (
5			_corr_selector,
6			_drop_duplicates,
7			_missing_vals,
8			_validate_input_bool,
9			_validate_input_int,
10			_validate_input_range,
11			_validate_input_smaller,
12			_validate_input_sum_smaller,
13			_validate_input_sum_larger,
14			)
15
16
17			class Test__corr_selector(unittest.TestCase):
18			@classmethod
19			def setUpClass(cls):
20			cls.df_data_corr = pd.DataFrame(
21			[
22			[1, 7, 2, 2, 4, 7],
23			[3, 8, 3, 3, 7, 1],
24			[5, 7, 9, 5, 1, 4],
25			[1, 7, 8, 6, 1, 8],
26			[1, 7, 5, 6, 2, 6],
27			[2, 7, 3, 3, 5, 3],
28			]
29			)
30
31			cls.target = pd.Series([1, 2, 4, 7, 4, 2])
32
33			def test__corr_selector_matrix(self):
34			self.assertEqual(_corr_selector(self.df_data_corr.corr()).shape, (6, 6))
35			self.assertEqual(_corr_selector(self.df_data_corr.corr(), split="pos").isna().sum().sum(), 18)
36			self.assertEqual(
37			_corr_selector(self.df_data_corr.corr(), split="pos", threshold=0.5).isna().sum().sum(), 26
38			)
39			self.assertEqual(
40			_corr_selector(self.df_data_corr.corr(), split="neg", threshold=-0.75).isna().sum().sum(), 32
41			)
42			self.assertEqual(
43			_corr_selector(self.df_data_corr.corr(), split="high", threshold=0.15).isna().sum().sum(), 4
44			)
45			self.assertEqual(
46			_corr_selector(self.df_data_corr.corr(), split="low", threshold=0.85).isna().sum().sum(), 6
47			)
48
49			def test__corr_selector_label(self):
50			self.assertEqual(_corr_selector(self.df_data_corr.corrwith(self.target)).shape, (6,))
51			self.assertEqual(_corr_selector(self.df_data_corr.corrwith(self.target), split="pos").isna().sum(), 3)
52			self.assertEqual(
53			_corr_selector(self.df_data_corr.corrwith(self.target), split="pos", threshold=0.8).isna().sum(),
54			4,
55			)
56			self.assertEqual(
57			_corr_selector(self.df_data_corr.corrwith(self.target), split="neg", threshold=-0.7).isna().sum(),
58			5,
59			)
60			self.assertEqual(
61			_corr_selector(self.df_data_corr.corrwith(self.target), split="high", threshold=0.2).isna().sum(),
62			1,
63			)
64			self.assertEqual(
65			_corr_selector(self.df_data_corr.corrwith(self.target), split="low", threshold=0.8).isna().sum(),
66			2,
67			)
68
69
70			class Test__drop_duplicates(unittest.TestCase):
71			@classmethod
72			def setUpClass(cls: pd.DataFrame) -> pd.DataFrame:
73			cls.data_dupl_df = pd.DataFrame(
74			[
75			[pd.NA, pd.NA, pd.NA, pd.NA],
76			[1, 2, 3, 4],
77			[1, 2, 3, 4],
78			[1, 2, 3, 4],
79			[2, 3, 4, 5],
80			[1, 2, 3, pd.NA],
81			[pd.NA, pd.NA, pd.NA, pd.NA],
82			]
83			)
84
85			def test__drop_dupl(self):
86			# Test dropping of duplicate rows
87			self.assertAlmostEqual(_drop_duplicates(self.data_dupl_df)[0].shape, (4, 4))
88			# Test if the resulting DataFrame is equal to using the pandas method
89			self.assertTrue(_drop_duplicates(self.data_dupl_df)[0].equals(self.data_dupl_df.drop_duplicates()))
90			# Test number of duplicates
91			self.assertEqual(len(_drop_duplicates(self.data_dupl_df)[1]), 3)
92
93
94			class Test__missing_vals(unittest.TestCase):
95			@classmethod
96			def setUpClass(cls):
97			cls.data_mv_list = [
98			[1, np.nan, 3, 4],
99			[None, 4, 5, None],
100			["a", "b", pd.NA, "d"],
101			[True, False, 7, pd.NaT],
102			]
103
104			cls.data_mv_df = pd.DataFrame(cls.data_mv_list)
105
106			cls.data_mv_array = np.array(cls.data_mv_list)
107
108			def test_mv_total(self):
109			# Test total missing values
110			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_total"], 5)
111			self.assertAlmostEqual(_missing_vals(self.data_mv_array)["mv_total"], 5)
112			self.assertAlmostEqual(_missing_vals(self.data_mv_list)["mv_total"], 5)
113
114			def test_mv_rows(self):
115			# Test missing values for each row
116			expected_results = [1, 2, 1, 1]
117			for i, _ in enumerate(expected_results):
118			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_rows"][i], expected_results[i])
119
120			def test_mv_cols(self):
121			# Test missing values for each column
122			expected_results = [1, 1, 1, 2]
123			for i, _ in enumerate(expected_results):
124			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_cols"][i], expected_results[i])
125
126			def test_mv_rows_ratio(self):
127			# Test missing values ratio for each row
128			expected_results = [0.25, 0.5, 0.25, 0.25]
129			for i, _ in enumerate(expected_results):
130			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_rows_ratio"][i], expected_results[i])
131
132			# Test if missing value ratio is between 0 and 1
133			for i in range(len(self.data_mv_df)):
134			self.assertTrue(0 <= _missing_vals(self.data_mv_df)["mv_rows_ratio"][i] <= 1)
135
136			def test_mv_cols_ratio(self):
137			# Test missing values ratio for each column
138			expected_results = [1 / 4, 0.25, 0.25, 0.5]
139			for i, _ in enumerate(expected_results):
140			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_cols_ratio"][i], expected_results[i])
141
142			# Test if missing value ratio is between 0 and 1
143			for i in range(len(self.data_mv_df)):
144			self.assertTrue(0 <= _missing_vals(self.data_mv_df)["mv_cols_ratio"][i] <= 1)
145
146
147			class Test__validate_input(unittest.TestCase):
148			def test__validate_input_bool(self):
149			# Raises an exception if the input is not boolean
150			with self.assertRaises(TypeError):
151			_validate_input_bool("True", None)
152			with self.assertRaises(TypeError):
153			_validate_input_bool(None, None)
154			with self.assertRaises(TypeError):
155			_validate_input_bool(1, None)
156
157			def test__validate_input_int(self):
158			# Raises an exception if the input is not an integer
159			with self.assertRaises(TypeError):
160			_validate_input_int(1.1, None)
161			with self.assertRaises(TypeError):
162			_validate_input_int([1], None)
163			with self.assertRaises(TypeError):
164			_validate_input_int("1", None)
165
166			def test__validate_input_smaller(self):
167			# Raises an exception if the first value is larger than the second
168			with self.assertRaises(ValueError):
169			_validate_input_smaller(0.3, 0.2, None)
170			with self.assertRaises(ValueError):
171			_validate_input_smaller(3, 2, None)
172			with self.assertRaises(ValueError):
173			_validate_input_smaller(5, -3, None)
174
175			def test__validate_input_range(self):
176			with self.assertRaises(ValueError):
177			_validate_input_range(-0.1, "value -0.1", 0, 1)
178
179			with self.assertRaises(ValueError):
180			_validate_input_range(1.1, "value 1.1", 0, 1)
181
182			with self.assertRaises(TypeError):
183			_validate_input_range("1", "value string", 0, 1)
184
185			def test__validate_input_sum_smaller(self):
186			with self.assertRaises(ValueError):
187			_validate_input_sum_smaller(1, "Test Sum <= 1", 1.01)
188			with self.assertRaises(ValueError):
189			_validate_input_sum_smaller(1, "Test Sum <= 1", 0.3, 0.2, 0.4, 0.5)
190			with self.assertRaises(ValueError):
191			_validate_input_sum_smaller(-1, "Test Sum <= -1", -0.2, -0.7)
192			with self.assertRaises(ValueError):
193			_validate_input_sum_smaller(10, "Test Sum <= 10", 20, -11, 2)
194
195			def test__validate_input_sum_larger(self):
196			with self.assertRaises(ValueError):
197			_validate_input_sum_larger(1, "Test Sum >= 1", 0.99)
198			with self.assertRaises(ValueError):
199			_validate_input_sum_larger(1, "Test Sum >= 1", 0.9, 0.05)
200			with self.assertRaises(ValueError):
201			_validate_input_sum_larger(-2, "Test Sum >=-2", -3)
202			with self.assertRaises(ValueError):
203			_validate_input_sum_larger(7, "Test Sum >= 7", 1, 2, 3)
204

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 5a4fe9...c92c0e )

Test__drop_duplicates.test__drop_dupl() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like