tests.test_util.Test__missing_vals.test_mv_cols_ratio() - Code Metrics - Inspection of "Reorganize package" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — main (#17)

by Andreas

created 2021-12-26 12:27 UTC

Test__missing_vals.test_mv_cols_ratio() A

↳ Parent: tests.test_util

Complexity

Conditions

Size

Total Lines	12
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	8
nop	1
dl	0
loc	12
rs	10
c	0
b	0
f	0

import unittest

import numpy as np
import pandas as pd

from klib.utils import (
    _corr_selector,
    _drop_duplicates,
    _missing_vals,
    _validate_input_bool,
    _validate_input_int,
    _validate_input_range,
    _validate_input_smaller,
    _validate_input_sum_larger,
    _validate_input_sum_smaller,
)


class Test__corr_selector(unittest.TestCase):
    """Exercise ``_corr_selector`` on a correlation matrix and on label correlations."""

    @classmethod
    def setUpClass(cls):
        # Shared fixture: a 6x6 integer frame plus a six-element target
        # series that the columns are correlated against.
        feature_rows = [
            [1, 7, 2, 2, 4, 7],
            [3, 8, 3, 3, 7, 1],
            [5, 7, 9, 5, 1, 4],
            [1, 7, 8, 6, 1, 8],
            [1, 7, 5, 6, 2, 6],
            [2, 7, 3, 3, 5, 3],
        ]
        cls.df_data_corr = pd.DataFrame(feature_rows)
        cls.target = pd.Series([1, 2, 4, 7, 4, 2])

    def test__corr_selector_matrix(self):
        # Called with defaults, the result keeps the 6x6 matrix shape.
        default_result = _corr_selector(self.df_data_corr.corr())
        self.assertEqual(default_result.shape, (6, 6))

        # Each entry pairs the keyword arguments passed to _corr_selector
        # with the number of NaN cells expected in the returned matrix.
        scenarios = (
            ({"split": "pos"}, 18),
            ({"split": "pos", "threshold": 0.5}, 26),
            ({"split": "neg", "threshold": -0.75}, 32),
            ({"split": "high", "threshold": 0.15}, 4),
            ({"split": "low", "threshold": 0.85}, 6),
        )
        for options, nan_cells in scenarios:
            # A fresh correlation matrix is computed for every call.
            selected = _corr_selector(self.df_data_corr.corr(), **options)
            self.assertEqual(selected.isna().sum().sum(), nan_cells)

    def test__corr_selector_label(self):
        # Called with defaults, the result keeps one entry per column.
        default_result = _corr_selector(self.df_data_corr.corrwith(self.target))
        self.assertEqual(default_result.shape, (6,))

        # Each entry pairs the keyword arguments passed to _corr_selector
        # with the number of NaN entries expected in the returned series.
        scenarios = (
            ({"split": "pos"}, 3),
            ({"split": "pos", "threshold": 0.8}, 4),
            ({"split": "neg", "threshold": -0.7}, 5),
            ({"split": "high", "threshold": 0.2}, 1),
            ({"split": "low", "threshold": 0.8}, 2),
        )
        for options, nan_entries in scenarios:
            label_corr = self.df_data_corr.corrwith(self.target)
            selected = _corr_selector(label_corr, **options)
            self.assertEqual(selected.isna().sum(), nan_entries)


class Test__drop_duplicates(unittest.TestCase):
    """Check ``_drop_duplicates`` against a frame that contains repeated rows."""

    @classmethod
    def setUpClass(cls) -> None:
        # Seven rows, four of them distinct: the all-NA row appears twice and
        # the row [1, 2, 3, 4] appears three times.
        cls.data_dupl_df = pd.DataFrame(
            [
                [pd.NA, pd.NA, pd.NA, pd.NA],
                [1, 2, 3, 4],
                [1, 2, 3, 4],
                [1, 2, 3, 4],
                [2, 3, 4, 5],
                [1, 2, 3, pd.NA],
                [pd.NA, pd.NA, pd.NA, pd.NA],
            ]
        )

    def test__drop_dupl(self):
        # Index 0 of the result is the de-duplicated frame and index 1 is the
        # collection of dropped duplicates, as the assertions below rely on.
        result = _drop_duplicates(self.data_dupl_df)
        deduplicated = result[0]
        dropped = result[1]

        # Test dropping of duplicate rows. Shapes are tuples, so an exact
        # comparison is required: assertAlmostEqual would try to subtract the
        # tuples on a mismatch and raise TypeError instead of failing cleanly.
        self.assertEqual(deduplicated.shape, (4, 4))
        # Test if the resulting DataFrame is equal to using the pandas method
        expected = self.data_dupl_df.drop_duplicates().reset_index(drop=True)
        self.assertTrue(deduplicated.equals(expected))
        # Test number of duplicates
        self.assertEqual(len(dropped), 3)


class Test__missing_vals(unittest.TestCase):
    """Check the counts and ratios reported by ``_missing_vals``."""

    @classmethod
    def setUpClass(cls):
        # A 4x4 grid holding five missing markers of different kinds
        # (np.nan, None, pd.NA, pd.NaT), provided as a list, a DataFrame
        # and a NumPy array.
        cls.data_mv_list = [
            [1, np.nan, 3, 4],
            [None, 4, 5, None],
            ["a", "b", pd.NA, "d"],
            [True, False, 7, pd.NaT],
        ]

        cls.data_mv_df = pd.DataFrame(cls.data_mv_list)

        cls.data_mv_array = np.array(cls.data_mv_list)

    def test_mv_total(self):
        # Test total missing values for every supported input container
        for data in (self.data_mv_df, self.data_mv_array, self.data_mv_list):
            self.assertAlmostEqual(_missing_vals(data)["mv_total"], 5)

    def test_mv_rows(self):
        # Test missing values for each row
        mv_rows = _missing_vals(self.data_mv_df)["mv_rows"]
        for i, expected in enumerate([1, 2, 1, 1]):
            self.assertAlmostEqual(mv_rows[i], expected)

    def test_mv_cols(self):
        # Test missing values for each column
        mv_cols = _missing_vals(self.data_mv_df)["mv_cols"]
        for i, expected in enumerate([1, 1, 1, 2]):
            self.assertAlmostEqual(mv_cols[i], expected)

    def test_mv_rows_ratio(self):
        # Test missing values ratio for each row
        rows_ratio = _missing_vals(self.data_mv_df)["mv_rows_ratio"]
        for i, expected in enumerate([0.25, 0.5, 0.25, 0.25]):
            self.assertAlmostEqual(rows_ratio[i], expected)

        # Test if missing value ratio is between 0 and 1. Iterate over the
        # row count explicitly: iterating the DataFrame itself yields column
        # labels, which only matched the rows because the fixture is square.
        for i in range(len(self.data_mv_df)):
            self.assertTrue(0 <= rows_ratio[i] <= 1)

    def test_mv_cols_ratio(self):
        # Test missing values ratio for each column
        cols_ratio = _missing_vals(self.data_mv_df)["mv_cols_ratio"]
        for i, expected in enumerate([1 / 4, 0.25, 0.25, 0.5]):
            self.assertAlmostEqual(cols_ratio[i], expected)

        # Test if missing value ratio is between 0 and 1, once per column
        for i in range(self.data_mv_df.shape[1]):
            self.assertTrue(0 <= cols_ratio[i] <= 1)


class Test__validate_input(unittest.TestCase):
    """Each ``_validate_input_*`` helper is expected to raise on invalid arguments."""

    def test__validate_input_bool(self):
        # Raises an exception if the input is not boolean
        for candidate in ("True", None, 1):
            with self.assertRaises(TypeError):
                _validate_input_bool(candidate, None)

    def test__validate_input_int(self):
        # Raises an exception if the input is not an integer
        for candidate in (1.1, [1], "1"):
            with self.assertRaises(TypeError):
                _validate_input_int(candidate, None)

    def test__validate_input_smaller(self):
        # Raises an exception if the first value is larger than the second
        ordered_pairs = ((0.3, 0.2), (3, 2), (5, -3))
        for first, second in ordered_pairs:
            with self.assertRaises(ValueError):
                _validate_input_smaller(first, second, None)

    def test__validate_input_range(self):
        # (expected exception, value, label) checked against the range 0..1
        cases = (
            (ValueError, -0.1, "value -0.1"),
            (ValueError, 1.1, "value 1.1"),
            (TypeError, "1", "value string"),
        )
        for expected_error, value, label in cases:
            with self.assertRaises(expected_error):
                _validate_input_range(value, label, 0, 1)

    def test__validate_input_sum_smaller(self):
        # Every argument tuple below is expected to raise ValueError
        rejected_calls = (
            (1, "Test Sum <= 1", 1.01),
            (1, "Test Sum <= 1", 0.3, 0.2, 0.4, 0.5),
            (-1, "Test Sum <= -1", -0.2, -0.7),
            (10, "Test Sum <= 10", 20, -11, 2),
        )
        for call_args in rejected_calls:
            with self.assertRaises(ValueError):
                _validate_input_sum_smaller(*call_args)

    def test__validate_input_sum_larger(self):
        # Every argument tuple below is expected to raise ValueError
        rejected_calls = (
            (1, "Test Sum >= 1", 0.99),
            (1, "Test Sum >= 1", 0.9, 0.05),
            (-2, "Test Sum >=-2", -3),
            (7, "Test Sum >= 7", 1, 2, 3),
        )
        for call_args in rejected_calls:
            with self.assertRaises(ValueError):
                _validate_input_sum_larger(*call_args)


1			import unittest
2
3			import numpy as np
4			import pandas as pd
5
6			from klib.utils import (
7			_corr_selector,
8			_drop_duplicates,
9			_missing_vals,
10			_validate_input_bool,
11			_validate_input_int,
12			_validate_input_range,
13			_validate_input_smaller,
14			_validate_input_sum_larger,
15			_validate_input_sum_smaller,
16			)
17
18
19			class Test__corr_selector(unittest.TestCase):
20			@classmethod
21			def setUpClass(cls):
22			cls.df_data_corr = pd.DataFrame(
23			[
24			[1, 7, 2, 2, 4, 7],
25			[3, 8, 3, 3, 7, 1],
26			[5, 7, 9, 5, 1, 4],
27			[1, 7, 8, 6, 1, 8],
28			[1, 7, 5, 6, 2, 6],
29			[2, 7, 3, 3, 5, 3],
30			]
31			)
32
33			cls.target = pd.Series([1, 2, 4, 7, 4, 2])
34
35			def test__corr_selector_matrix(self):
36			self.assertEqual(_corr_selector(self.df_data_corr.corr()).shape, (6, 6))
37			self.assertEqual(
38			_corr_selector(self.df_data_corr.corr(), split="pos").isna().sum().sum(), 18
39			)
40			self.assertEqual(
41			_corr_selector(self.df_data_corr.corr(), split="pos", threshold=0.5)
42			.isna()
43			.sum()
44			.sum(),
45			26,
46			)
47			self.assertEqual(
48			_corr_selector(self.df_data_corr.corr(), split="neg", threshold=-0.75)
49			.isna()
50			.sum()
51			.sum(),
52			32,
53			)
54			self.assertEqual(
55			_corr_selector(self.df_data_corr.corr(), split="high", threshold=0.15)
56			.isna()
57			.sum()
58			.sum(),
59			4,
60			)
61			self.assertEqual(
62			_corr_selector(self.df_data_corr.corr(), split="low", threshold=0.85)
63			.isna()
64			.sum()
65			.sum(),
66			6,
67			)
68
69			def test__corr_selector_label(self):
70			self.assertEqual(
71			_corr_selector(self.df_data_corr.corrwith(self.target)).shape, (6,)
72			)
73			self.assertEqual(
74			_corr_selector(self.df_data_corr.corrwith(self.target), split="pos")
75			.isna()
76			.sum(),
77			3,
78			)
79			self.assertEqual(
80			_corr_selector(
81			self.df_data_corr.corrwith(self.target), split="pos", threshold=0.8
82			)
83			.isna()
84			.sum(),
85			4,
86			)
87			self.assertEqual(
88			_corr_selector(
89			self.df_data_corr.corrwith(self.target), split="neg", threshold=-0.7
90			)
91			.isna()
92			.sum(),
93			5,
94			)
95			self.assertEqual(
96			_corr_selector(
97			self.df_data_corr.corrwith(self.target), split="high", threshold=0.2
98			)
99			.isna()
100			.sum(),
101			1,
102			)
103			self.assertEqual(
104			_corr_selector(
105			self.df_data_corr.corrwith(self.target), split="low", threshold=0.8
106			)
107			.isna()
108			.sum(),
109			2,
110			)
111
112
113			class Test__drop_duplicates(unittest.TestCase):
114			@classmethod
115			def setUpClass(cls: pd.DataFrame) -> pd.DataFrame:
116			cls.data_dupl_df = pd.DataFrame(
117			[
118			[pd.NA, pd.NA, pd.NA, pd.NA],
119			[1, 2, 3, 4],
120			[1, 2, 3, 4],
121			[1, 2, 3, 4],
122			[2, 3, 4, 5],
123			[1, 2, 3, pd.NA],
124			[pd.NA, pd.NA, pd.NA, pd.NA],
125			]
126			)
127
128			def test__drop_dupl(self):
129			# Test dropping of duplicate rows
130			self.assertAlmostEqual(_drop_duplicates(self.data_dupl_df)[0].shape, (4, 4))
131			# Test if the resulting DataFrame is equal to using the pandas method
132			self.assertTrue(
133			_drop_duplicates(self.data_dupl_df)[0].equals(
134			self.data_dupl_df.drop_duplicates().reset_index(drop=True)
135			)
136			)
137			# Test number of duplicates
138			self.assertEqual(len(_drop_duplicates(self.data_dupl_df)[1]), 3)
139
140
141			class Test__missing_vals(unittest.TestCase):
142			@classmethod
143			def setUpClass(cls):
144			cls.data_mv_list = [
145			[1, np.nan, 3, 4],
146			[None, 4, 5, None],
147			["a", "b", pd.NA, "d"],
148			[True, False, 7, pd.NaT],
149			]
150
151			cls.data_mv_df = pd.DataFrame(cls.data_mv_list)
152
153			cls.data_mv_array = np.array(cls.data_mv_list)
154
155			def test_mv_total(self):
156			# Test total missing values
157			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_total"], 5)
158			self.assertAlmostEqual(_missing_vals(self.data_mv_array)["mv_total"], 5)
159			self.assertAlmostEqual(_missing_vals(self.data_mv_list)["mv_total"], 5)
160
161			def test_mv_rows(self):
162			# Test missing values for each row
163			expected_results = [1, 2, 1, 1]
164			for i, result in enumerate(expected_results):
165			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_rows"][i], result)
166
167			def test_mv_cols(self):
168			# Test missing values for each column
169			expected_results = [1, 1, 1, 2]
170			for i, result in enumerate(expected_results):
171			self.assertAlmostEqual(_missing_vals(self.data_mv_df)["mv_cols"][i], result)
172
173			def test_mv_rows_ratio(self):
174			# Test missing values ratio for each row
175			expected_results = [0.25, 0.5, 0.25, 0.25]
176			for i, result in enumerate(expected_results):
177			self.assertAlmostEqual(
178			_missing_vals(self.data_mv_df)["mv_rows_ratio"][i], result
179			)
180
181			# Test if missing value ratio is between 0 and 1
182			for i, _ in enumerate(self.data_mv_df):
183			self.assertTrue(
184			0 <= _missing_vals(self.data_mv_df)["mv_rows_ratio"][i] <= 1
185			)
186
187			def test_mv_cols_ratio(self):
188			# Test missing values ratio for each column
189			expected_results = [1 / 4, 0.25, 0.25, 0.5]
190			for i, result in enumerate(expected_results):
191			self.assertAlmostEqual(
192			_missing_vals(self.data_mv_df)["mv_cols_ratio"][i], result
193			)
194
195			# Test if missing value ratio is between 0 and 1
196			for i, _ in enumerate(self.data_mv_df):
197			self.assertTrue(
198			0 <= _missing_vals(self.data_mv_df)["mv_cols_ratio"][i] <= 1
199			)
200
201
202			class Test__validate_input(unittest.TestCase):
203			def test__validate_input_bool(self):
204			# Raises an exception if the input is not boolean
205			with self.assertRaises(TypeError):
206			_validate_input_bool("True", None)
207			with self.assertRaises(TypeError):
208			_validate_input_bool(None, None)
209			with self.assertRaises(TypeError):
210			_validate_input_bool(1, None)
211
212			def test__validate_input_int(self):
213			# Raises an exception if the input is not an integer
214			with self.assertRaises(TypeError):
215			_validate_input_int(1.1, None)
216			with self.assertRaises(TypeError):
217			_validate_input_int([1], None)
218			with self.assertRaises(TypeError):
219			_validate_input_int("1", None)
220
221			def test__validate_input_smaller(self):
222			# Raises an exception if the first value is larger than the second
223			with self.assertRaises(ValueError):
224			_validate_input_smaller(0.3, 0.2, None)
225			with self.assertRaises(ValueError):
226			_validate_input_smaller(3, 2, None)
227			with self.assertRaises(ValueError):
228			_validate_input_smaller(5, -3, None)
229
230			def test__validate_input_range(self):
231			with self.assertRaises(ValueError):
232			_validate_input_range(-0.1, "value -0.1", 0, 1)
233
234			with self.assertRaises(ValueError):
235			_validate_input_range(1.1, "value 1.1", 0, 1)
236
237			with self.assertRaises(TypeError):
238			_validate_input_range("1", "value string", 0, 1)
239
240			def test__validate_input_sum_smaller(self):
241			with self.assertRaises(ValueError):
242			_validate_input_sum_smaller(1, "Test Sum <= 1", 1.01)
243			with self.assertRaises(ValueError):
244			_validate_input_sum_smaller(1, "Test Sum <= 1", 0.3, 0.2, 0.4, 0.5)
245			with self.assertRaises(ValueError):
246			_validate_input_sum_smaller(-1, "Test Sum <= -1", -0.2, -0.7)
247			with self.assertRaises(ValueError):
248			_validate_input_sum_smaller(10, "Test Sum <= 10", 20, -11, 2)
249
250			def test__validate_input_sum_larger(self):
251			with self.assertRaises(ValueError):
252			_validate_input_sum_larger(1, "Test Sum >= 1", 0.99)
253			with self.assertRaises(ValueError):
254			_validate_input_sum_larger(1, "Test Sum >= 1", 0.9, 0.05)
255			with self.assertRaises(ValueError):
256			_validate_input_sum_larger(-2, "Test Sum >=-2", -3)
257			with self.assertRaises(ValueError):
258			_validate_input_sum_larger(7, "Test Sum >= 7", 1, 2, 3)
259

akanz1 / klib

GitHub Access Token became invalid

Pull Request — main (#17)

Test__missing_vals.test_mv_cols_ratio() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like