tests.test_util.Test__missing_vals.test_mv_cols_ratio() - Code Metrics - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test__missing_vals.test_mv_cols_ratio() A
last analyzed 2025-11-06 11:08 UTC

↳ Parent: tests.test_util

Complexity

Conditions

Size

Total Lines	9
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	6
nop	1
dl	0
loc	9
rs	10
c	0
b	0
f	0

import unittest

import numpy as np
import pandas as pd
import pytest

from klib.utils import _corr_selector
from klib.utils import _drop_duplicates
from klib.utils import _missing_vals
from klib.utils import _validate_input_bool
from klib.utils import _validate_input_int
from klib.utils import _validate_input_num_data
from klib.utils import _validate_input_range
from klib.utils import _validate_input_smaller
from klib.utils import _validate_input_sum_larger
from klib.utils import _validate_input_sum_smaller


class Test__corr_selector(unittest.TestCase):
    """Exercise ``_corr_selector`` on a correlation matrix and on label correlations."""

    @classmethod
    def setUpClass(cls) -> None:
        # Six rows by six columns of integers, shared by every test in the class.
        feature_rows = [
            [1, 7, 2, 2, 4, 7],
            [3, 8, 3, 3, 7, 1],
            [5, 7, 9, 5, 1, 4],
            [1, 7, 8, 6, 1, 8],
            [1, 7, 5, 6, 2, 6],
            [2, 7, 3, 3, 5, 3],
        ]
        cls.df_data_corr = pd.DataFrame(feature_rows)

        cls.target = pd.Series([1, 2, 4, 7, 4, 2])

    def test__corr_selector_matrix(self):
        # With default arguments the result keeps the 6x6 shape of the matrix.
        unfiltered = _corr_selector(self.df_data_corr.corr())
        assert unfiltered.shape == (6, 6)

        # (keyword arguments, expected number of NaN cells in the result)
        cases = [
            ({"split": "pos"}, 18),
            ({"split": "pos", "threshold": 0.5}, 26),
            ({"split": "neg", "threshold": -0.75}, 32),
            ({"split": "high", "threshold": 0.15}, 4),
            ({"split": "low", "threshold": 0.85}, 6),
        ]
        for kwargs, expected_nans in cases:
            # The correlation matrix is rebuilt for every case on purpose, so
            # each call receives a fresh object.
            selected = _corr_selector(self.df_data_corr.corr(), **kwargs)
            nan_cells = selected.isna().sum().sum()
            assert nan_cells == expected_nans

    def test__corr_selector_label(self):
        # With default arguments the result has one entry per column.
        unfiltered = _corr_selector(self.df_data_corr.corrwith(self.target))
        assert unfiltered.shape == (6,)

        # (keyword arguments, expected number of NaN entries in the result)
        cases = [
            ({"split": "pos"}, 3),
            ({"split": "pos", "threshold": 0.8}, 4),
            ({"split": "neg", "threshold": -0.7}, 5),
            ({"split": "high", "threshold": 0.2}, 1),
            ({"split": "low", "threshold": 0.8}, 2),
        ]
        for kwargs, expected_nans in cases:
            label_corr = self.df_data_corr.corrwith(self.target)
            selected = _corr_selector(label_corr, **kwargs)
            nan_entries = selected.isna().sum()
            assert nan_entries == expected_nans


class Test__drop_duplicates(unittest.TestCase):
    """Compare ``_drop_duplicates`` against ``DataFrame.drop_duplicates`` on a small frame."""

    @classmethod
    def setUpClass(cls) -> None:
        # Seven rows: three identical [1, 2, 3, 4] rows, two all-NA rows,
        # and two rows that occur only once.
        all_missing = [pd.NA, pd.NA, pd.NA, pd.NA]
        repeated = [1, 2, 3, 4]
        cls.data_dupl_df = pd.DataFrame(
            [
                list(all_missing),
                list(repeated),
                list(repeated),
                list(repeated),
                [2, 3, 4, 5],
                [1, 2, 3, pd.NA],
                list(all_missing),
            ],
        )

    def test__drop_dupl(self) -> None:
        # Four distinct rows remain once duplicates are removed.
        deduplicated = _drop_duplicates(self.data_dupl_df)[0]
        assert deduplicated.shape == (4, 4)

        # The first returned element must equal what pandas itself produces.
        deduplicated = _drop_duplicates(self.data_dupl_df)[0]
        pandas_reference = self.data_dupl_df.drop_duplicates().reset_index(drop=True)
        assert deduplicated.equals(pandas_reference)

        # The second returned element has one entry per dropped row.
        dropped = _drop_duplicates(self.data_dupl_df)[1]
        assert len(dropped) == 3


class Test__missing_vals(unittest.TestCase):
    """Check the entries returned by ``_missing_vals`` for one 4x4 fixture.

    The fixture is kept as a nested list, a DataFrame and a NumPy array so the
    total count can be checked for each input form.
    """

    @classmethod
    def setUpClass(cls) -> None:
        # Mixed-type 4x4 data with five missing entries:
        # np.nan, None (twice), pd.NA and pd.NaT.
        cls.data_mv_list = [
            [1, np.nan, 3, 4],
            [None, 4, 5, None],
            ["a", "b", pd.NA, "d"],
            [True, False, 7, pd.NaT],
        ]

        cls.data_mv_df = pd.DataFrame(cls.data_mv_list)

        cls.data_mv_array = np.array(cls.data_mv_list)

    def test_mv_total(self) -> None:
        # Test total missing values for every supported input form
        assert _missing_vals(self.data_mv_df)["mv_total"] == 5
        assert _missing_vals(self.data_mv_array)["mv_total"] == 5
        assert _missing_vals(self.data_mv_list)["mv_total"] == 5

    def test_mv_rows(self) -> None:
        # Test missing values for each row
        # NOTE(review): the result is computed once per test; this assumes
        # _missing_vals has no side effects on its input - confirm.
        mv_rows = _missing_vals(self.data_mv_df)["mv_rows"]
        expected_results = [1, 2, 1, 1]
        for i, result in enumerate(expected_results):
            assert mv_rows[i] == result

    def test_mv_cols(self) -> None:
        # Test missing values for each column
        mv_cols = _missing_vals(self.data_mv_df)["mv_cols"]
        expected_results = [1, 1, 1, 2]
        for i, result in enumerate(expected_results):
            assert mv_cols[i] == result

    def test_mv_rows_ratio(self) -> None:
        # Test missing values ratio for each row
        mv_rows_ratio = _missing_vals(self.data_mv_df)["mv_rows_ratio"]
        expected_results = [0.25, 0.5, 0.25, 0.25]
        for i, result in enumerate(expected_results):
            assert mv_rows_ratio[i] == result

        # Test if missing value ratio is between 0 and 1.
        # Iterate the row labels: iterating the DataFrame itself yields column
        # labels, which only lines up with the rows while the fixture is square.
        for row_label in self.data_mv_df.index:
            assert 0 <= mv_rows_ratio[row_label] <= 1

    def test_mv_cols_ratio(self) -> None:
        # Test missing values ratio for each column
        mv_cols_ratio = _missing_vals(self.data_mv_df)["mv_cols_ratio"]
        expected_results = [0.25, 0.25, 0.25, 0.5]
        for i, result in enumerate(expected_results):
            assert mv_cols_ratio[i] == result

        # Test if missing value ratio is between 0 and 1
        for col_label in self.data_mv_df.columns:
            assert 0 <= mv_cols_ratio[col_label] <= 1


class Test__validate_input(unittest.TestCase):
    """Feed invalid arguments to each ``_validate_input_*`` helper and expect it to raise."""

    def test__validate_input_bool(self) -> None:
        # Each of these non-boolean values must raise a TypeError
        for not_a_bool in ("True", None, 1):
            with pytest.raises(TypeError):
                _validate_input_bool(not_a_bool, "No description")

    def test__validate_input_int(self) -> None:
        # Each of these non-integer values must raise a TypeError
        for not_an_int in (1.1, [1], "1"):
            with pytest.raises(TypeError):
                _validate_input_int(not_an_int, "No description")

    def test__validate_input_smaller(self) -> None:
        # A first value larger than the second must raise a ValueError
        expected_message = "The first input for 'some check' should"
        for first, second in ((0.3, 0.2), (3, 2), (5, -3)):
            with pytest.raises(ValueError, match=expected_message):
                _validate_input_smaller(first, second, "some check")

    def test__validate_input_range(self) -> None:
        # (value outside [0, 1], pattern the error message has to match)
        out_of_range = [
            (-0.1, "'actual' = -0.1 but should be 0 <= 'actual' <= 1."),
            (1.1, "'actual' = 1.1 but should be 0 <= 'actual' <= 1."),
        ]
        for value, expected_message in out_of_range:
            with pytest.raises(ValueError, match=expected_message):
                _validate_input_range(value, "actual", 0, 1)

        # A string value is a type error, not a range error
        with pytest.raises(TypeError):
            _validate_input_range("1", "value string", 0, 1)

    def test__validate_input_sum_smaller(self) -> None:
        # (limit, description, values whose sum exceeds the limit, message pattern)
        cases = [
            (
                1,
                "Test Sum <= 1",
                (1.01,),
                "The sum of input values for 'Test Sum <= 1' should be less or equal to 1.",
            ),
            (
                1,
                "Test Sum <= 1",
                (0.3, 0.2, 0.4, 0.5),
                "The sum of input values for 'Test Sum <= 1' should be less or equal to 1.",
            ),
            (
                -1,
                "Test Sum <= -1",
                (-0.2, -0.7),
                "The sum of input values for 'Test Sum <= -1' should be less or equal to -1.",
            ),
            (
                10,
                "Test Sum <= 10",
                (20, -11, 2),
                "The sum of input values for 'Test Sum <= 10' should be less or equal to 10.",
            ),
        ]
        for limit, description, values, expected_message in cases:
            with pytest.raises(ValueError, match=expected_message):
                _validate_input_sum_smaller(limit, description, *values)

    def test__validate_input_sum_larger(self) -> None:
        # (limit, description, values whose sum falls short of the limit, message pattern)
        cases = [
            (
                1,
                "Test Sum >= 1",
                (0.99,),
                "The sum of input values for 'Test Sum >= 1' should be larger/equal to 1.",
            ),
            (
                1,
                "Test Sum >= 1",
                (0.9, 0.05),
                "The sum of input values for 'Test Sum >= 1' should be larger/equal to 1.",
            ),
            (
                -2,
                "Test Sum >=-2",
                (-3,),
                "The sum of input values for 'Test Sum >=-2' should be larger/equal to -2.",
            ),
            (
                7,
                "Test Sum >= 7",
                (1, 2, 3),
                "The sum of input values for 'Test Sum >= 7' should be larger/equal to 7.",
            ),
        ]
        for limit, description, values, expected_message in cases:
            with pytest.raises(ValueError, match=expected_message):
                _validate_input_sum_larger(limit, description, *values)

    def test__validate_input_num_data(self) -> None:
        # A frame holding only strings must be rejected
        text_frame = pd.DataFrame({"col1": ["a", "b", "c"]})
        with pytest.raises(TypeError):
            _validate_input_num_data(text_frame, "No description")

        # A frame holding integers must pass
        numeric_frame = pd.DataFrame({"col1": [1, 2, 3]})
        _validate_input_num_data(numeric_frame, "No description")  # No exception


1			import unittest
2
3			import numpy as np
4			import pandas as pd
5			import pytest
6
7			from klib.utils import _corr_selector
8			from klib.utils import _drop_duplicates
9			from klib.utils import _missing_vals
10			from klib.utils import _validate_input_bool
11			from klib.utils import _validate_input_int
12			from klib.utils import _validate_input_num_data
13			from klib.utils import _validate_input_range
14			from klib.utils import _validate_input_smaller
15			from klib.utils import _validate_input_sum_larger
16			from klib.utils import _validate_input_sum_smaller
17
18
19			class Test__corr_selector(unittest.TestCase):
20			@classmethod
21			def setUpClass(cls) -> None:
22			cls.df_data_corr = pd.DataFrame(
23			[
24			[1, 7, 2, 2, 4, 7],
25			[3, 8, 3, 3, 7, 1],
26			[5, 7, 9, 5, 1, 4],
27			[1, 7, 8, 6, 1, 8],
28			[1, 7, 5, 6, 2, 6],
29			[2, 7, 3, 3, 5, 3],
30			],
31			)
32
33			cls.target = pd.Series([1, 2, 4, 7, 4, 2])
34
35			def test__corr_selector_matrix(self):
36			assert _corr_selector(self.df_data_corr.corr()).shape == (6, 6)
37			assert _corr_selector(self.df_data_corr.corr(), split="pos").isna().sum().sum() == 18
38			assert (
39			_corr_selector(self.df_data_corr.corr(), split="pos", threshold=0.5).isna().sum().sum()
40			== 26
41			)
42			assert (
43			_corr_selector(self.df_data_corr.corr(), split="neg", threshold=-0.75)
44			.isna()
45			.sum()
46			.sum()
47			== 32
48			)
49			assert (
50			_corr_selector(self.df_data_corr.corr(), split="high", threshold=0.15)
51			.isna()
52			.sum()
53			.sum()
54			== 4
55			)
56			assert (
57			_corr_selector(self.df_data_corr.corr(), split="low", threshold=0.85).isna().sum().sum()
58			== 6
59			)
60
61			def test__corr_selector_label(self):
62			assert _corr_selector(self.df_data_corr.corrwith(self.target)).shape == (6,)
63			assert (
64			_corr_selector(self.df_data_corr.corrwith(self.target), split="pos").isna().sum() == 3
65			)
66			assert (
67			_corr_selector(
68			self.df_data_corr.corrwith(self.target),
69			split="pos",
70			threshold=0.8,
71			)
72			.isna()
73			.sum()
74			== 4
75			)
76			assert (
77			_corr_selector(
78			self.df_data_corr.corrwith(self.target),
79			split="neg",
80			threshold=-0.7,
81			)
82			.isna()
83			.sum()
84			== 5
85			)
86			assert (
87			_corr_selector(
88			self.df_data_corr.corrwith(self.target),
89			split="high",
90			threshold=0.2,
91			)
92			.isna()
93			.sum()
94			== 1
95			)
96			assert (
97			_corr_selector(
98			self.df_data_corr.corrwith(self.target),
99			split="low",
100			threshold=0.8,
101			)
102			.isna()
103			.sum()
104			== 2
105			)
106
107
108			class Test__drop_duplicates(unittest.TestCase):
109			@classmethod
110			def setUpClass(cls) -> None:
111			cls.data_dupl_df = pd.DataFrame(
112			[
113			[pd.NA, pd.NA, pd.NA, pd.NA],
114			[1, 2, 3, 4],
115			[1, 2, 3, 4],
116			[1, 2, 3, 4],
117			[2, 3, 4, 5],
118			[1, 2, 3, pd.NA],
119			[pd.NA, pd.NA, pd.NA, pd.NA],
120			],
121			)
122
123			def test__drop_dupl(self) -> None:
124			# Test dropping of duplicate rows
125			assert _drop_duplicates(self.data_dupl_df)[0].shape == (4, 4)
126			# Test if the resulting DataFrame is equal to using the pandas method
127			assert _drop_duplicates(self.data_dupl_df)[0].equals(
128			self.data_dupl_df.drop_duplicates().reset_index(drop=True),
129			)
130			# Test number of duplicates
131			assert len(_drop_duplicates(self.data_dupl_df)[1]) == 3
132
133
134			class Test__missing_vals(unittest.TestCase):
135			@classmethod
136			def setUpClass(cls) -> None:
137			cls.data_mv_list = [
138			[1, np.nan, 3, 4],
139			[None, 4, 5, None],
140			["a", "b", pd.NA, "d"],
141			[True, False, 7, pd.NaT],
142			]
143
144			cls.data_mv_df = pd.DataFrame(cls.data_mv_list)
145
146			cls.data_mv_array = np.array(cls.data_mv_list)
147
148			def test_mv_total(self) -> None:
149			# Test total missing values
150			assert _missing_vals(self.data_mv_df)["mv_total"] == 5
151			assert _missing_vals(self.data_mv_array)["mv_total"] == 5
152			assert _missing_vals(self.data_mv_list)["mv_total"] == 5
153
154			def test_mv_rows(self) -> None:
155			# Test missing values for each row
156			expected_results = [1, 2, 1, 1]
157			for i, result in enumerate(expected_results):
158			assert _missing_vals(self.data_mv_df)["mv_rows"][i] == result
159
160			def test_mv_cols(self) -> None:
161			# Test missing values for each column
162			expected_results = [1, 1, 1, 2]
163			for i, result in enumerate(expected_results):
164			assert _missing_vals(self.data_mv_df)["mv_cols"][i] == result
165
166			def test_mv_rows_ratio(self) -> None:
167			# Test missing values ratio for each row
168			expected_results = [0.25, 0.5, 0.25, 0.25]
169			for i, result in enumerate(expected_results):
170			assert _missing_vals(self.data_mv_df)["mv_rows_ratio"][i] == result
171
172			# Test if missing value ratio is between 0 and 1
173			for i, _ in enumerate(self.data_mv_df):
174			assert 0 <= _missing_vals(self.data_mv_df)["mv_rows_ratio"][i] <= 1
175
176			def test_mv_cols_ratio(self) -> None:
177			# Test missing values ratio for each column
178			expected_results = [1 / 4, 0.25, 0.25, 0.5]
179			for i, result in enumerate(expected_results):
180			assert _missing_vals(self.data_mv_df)["mv_cols_ratio"][i] == result
181
182			# Test if missing value ratio is between 0 and 1
183			for i, _ in enumerate(self.data_mv_df):
184			assert 0 <= _missing_vals(self.data_mv_df)["mv_cols_ratio"][i] <= 1
185
186
187			class Test__validate_input(unittest.TestCase):
188			def test__validate_input_bool(self) -> None:
189			# Raises an exception if the input is not boolean
190			with pytest.raises(TypeError):
191			_validate_input_bool("True", "No description")
192			with pytest.raises(TypeError):
193			_validate_input_bool(None, "No description")
194			with pytest.raises(TypeError):
195			_validate_input_bool(1, "No description")
196
197			def test__validate_input_int(self) -> None:
198			# Raises an exception if the input is not an integer
199			with pytest.raises(TypeError):
200			_validate_input_int(1.1, "No description")
201			with pytest.raises(TypeError):
202			_validate_input_int([1], "No description")
203			with pytest.raises(TypeError):
204			_validate_input_int("1", "No description")
205
206			def test__validate_input_smaller(self) -> None:
207			# Raises an exception if the first value is larger than the second
208			with pytest.raises(ValueError, match="The first input for 'some check' should"):
209			_validate_input_smaller(0.3, 0.2, "some check")
210			with pytest.raises(ValueError, match="The first input for 'some check' should"):
211			_validate_input_smaller(3, 2, "some check")
212			with pytest.raises(ValueError, match="The first input for 'some check' should"):
213			_validate_input_smaller(5, -3, "some check")
214
215			def test__validate_input_range(self) -> None:
216			with pytest.raises(
217			ValueError,
218			match="'actual' = -0.1 but should be 0 <= 'actual' <= 1.",
219			):
220			_validate_input_range(-0.1, "actual", 0, 1)
221
222			with pytest.raises(
223			ValueError,
224			match="'actual' = 1.1 but should be 0 <= 'actual' <= 1.",
225			):
226			_validate_input_range(1.1, "actual", 0, 1)
227
228			with pytest.raises(TypeError):
229			_validate_input_range("1", "value string", 0, 1)
230
231			def test__validate_input_sum_smaller(self) -> None:
232			with pytest.raises(
233			ValueError,
234			match="The sum of input values for 'Test Sum <= 1' should be less or equal to 1.",
235			):
236			_validate_input_sum_smaller(1, "Test Sum <= 1", 1.01)
237			with pytest.raises(
238			ValueError,
239			match="The sum of input values for 'Test Sum <= 1' should be less or equal to 1.",
240			):
241			_validate_input_sum_smaller(1, "Test Sum <= 1", 0.3, 0.2, 0.4, 0.5)
242			with pytest.raises(
243			ValueError,
244			match="The sum of input values for 'Test Sum <= -1' should be less or equal to -1.",
245			):
246			_validate_input_sum_smaller(-1, "Test Sum <= -1", -0.2, -0.7)
247			with pytest.raises(
248			ValueError,
249			match="The sum of input values for 'Test Sum <= 10' should be less or equal to 10.",
250			):
251			_validate_input_sum_smaller(10, "Test Sum <= 10", 20, -11, 2)
252
253			def test__validate_input_sum_larger(self) -> None:
254			with pytest.raises(
255			ValueError,
256			match="The sum of input values for 'Test Sum >= 1' should be larger/equal to 1.",
257			):
258			_validate_input_sum_larger(1, "Test Sum >= 1", 0.99)
259			with pytest.raises(
260			ValueError,
261			match="The sum of input values for 'Test Sum >= 1' should be larger/equal to 1.",
262			):
263			_validate_input_sum_larger(1, "Test Sum >= 1", 0.9, 0.05)
264			with pytest.raises(
265			ValueError,
266			match="The sum of input values for 'Test Sum >=-2' should be larger/equal to -2.",
267			):
268			_validate_input_sum_larger(-2, "Test Sum >=-2", -3)
269			with pytest.raises(
270			ValueError,
271			match="The sum of input values for 'Test Sum >= 7' should be larger/equal to 7.",
272			):
273			_validate_input_sum_larger(7, "Test Sum >= 7", 1, 2, 3)
274
275			def test__validate_input_num_data(self) -> None:
276			with pytest.raises(TypeError):
277			_validate_input_num_data(
278			pd.DataFrame({"col1": ["a", "b", "c"]}),
279			"No description",
280			)
281
282			_validate_input_num_data(
283			pd.DataFrame({"col1": [1, 2, 3]}),
284			"No description",
285			) # No exception
286

akanz1 / klib

GitHub Access Token became invalid

Test__missing_vals.test_mv_cols_ratio() A last analyzed 2025-11-06 11:08 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

Test__missing_vals.test_mv_cols_ratio() A
last analyzed 2025-11-06 11:08 UTC