from __future__ import annotations

import io
import sys
import unittest

import numpy as np
import pandas as pd

from klib.clean import clean_column_names
from klib.clean import convert_datatypes
from klib.clean import data_cleaning
from klib.clean import drop_missing
from klib.clean import pool_duplicate_subsets


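# Column-name cleaning: special characters and umlauts are transliterated,
# duplicate names are suffixed with their column index, and hints about
# renamed or overly long columns are printed.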
class Test_clean_column_names(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asdäöüß"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-ä-__________!?:;some/(... \n ..))(++$%/name/ -.....": [2, 3, 7],
            },
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
                "< #total@": [2, 6, 4],
                "count >= 10": [6, 3, 2],
            },
        )
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self) -> None:
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_hash_9",
            "asdaeoeuess",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
            "smaller_hash_total_at",
            "count_larger_equal_10",
        ]
        for i, _ in enumerate(expected_results):
            assert clean_column_names(self.df_clean_column_names).columns[i] == expected_results[i]
        for i, _ in enumerate(expected_results):
            assert (
                clean_column_names(self.df_clean_column_names, hints=False).columns[i]
                == expected_results[i]
            )

    def test_clean_column_names_prints(self) -> None:
        captured_output = io.StringIO()
        sys.stdout = captured_output
        clean_column_names(self.df_clean_column_names, hints=True)
        sys.stdout = sys.__stdout__
        assert captured_output.getvalue() == (
            "(\"Duplicate column names detected! Columns with index [7, 8] and names ['dupli', 'also'] have been renamed to ['dupli_7', 'also_8'].\", \"Long column names detected (>25 characters). Consider renaming the following columns ['ae_some_plus_plus_dollar_percent_name', 'verylong_column_namesare_hardto_read'].\")\n"
        )


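# Missing-value handling: these tests check that drop_missing removes fully
# empty rows and columns, and that drop_threshold_cols, drop_threshold_rows,
# and col_exclude control how much additional missing data is tolerated.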
class Test_drop_missing(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self) -> None:
        assert drop_missing(self.df_data_drop).shape == (4, 4)

        # Drop further columns based on threshold
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape == (4, 3)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_cols=0.5,
            col_exclude=["c1"],
        ).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape == (4, 2)
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0).shape == (0, 0)

        # Drop further rows based on threshold
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape == (3, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape == (3, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape == (2, 4)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.24,
            col_exclude=["c1"],
        ).shape == (2, 5)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.24,
            col_exclude=["c2"],
        ).shape == (2, 4)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.51,
            col_exclude=["c1"],
        ).shape == (3, 5)


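# Full pipeline: these tests check that data_cleaning drops empty and
# single-valued columns as well as duplicate rows, and cover col_exclude,
# the show / clean_col_names / drop_duplicates flags, and the convert_dtypes
# toggle.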
class Test_data_cleaning(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan, 1],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, 1],
                [pd.NA, "b", 6, "d", "e", 1],
                [pd.NA, "b", 7, 8, 9, 1],
                [pd.NA, "c", 3, 4, pd.NA, 1],
                [pd.NA, "d", 7, pd.NA, pd.NA, 1],
            ],
            columns=["c1", "c2", "c3", "c 4", "c5", "c6"],
        )

    def test_data_cleaning(self) -> None:
        assert data_cleaning(self.df_data_cleaning, show="all").shape == (5, 4)
        assert data_cleaning(self.df_data_cleaning, show=None).shape == (5, 4)

        assert data_cleaning(self.df_data_cleaning, col_exclude=["c6"]).shape == (5, 5)

        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        expected_results = ["string", "float32", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["O", "O", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["O", "O", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
                == expected_results[i]
            )


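# Dtype conversion: these tests check that convert_datatypes downcasts
# numeric columns (int8, float32), converts object columns to the pandas
# string dtype, and uses category / cat_threshold / cat_exclude to decide
# which columns become categorical.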
class Test_convert_dtypes(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ],
        )

    def test_convert_dtypes(self) -> None:
        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "category",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i]
                == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "object",
            "string",
        ]
        for i, _ in enumerate(expected_results):
            assert convert_datatypes(self.df_data_convert).dtypes[i] == expected_results[i]

        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    cat_threshold=0.5,
                    cat_exclude=[4],
                ).dtypes[i]
                == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "category",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    category=False,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i]
                == expected_results[i]
            )


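# Duplicate pooling: these tests check that pool_duplicate_subsets collapses
# the subset of columns with sufficiently many duplicate rows into a single
# pooled column, that col_dupl_thresh, subset_thresh, and exclude steer the
# pooling, and that return_details also reports the pooled column names.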
class Test_pool_duplicate_subsets(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ],
            columns=["c1", "c2", "c3", "c4", "c5", "c6"],
        )

    def test_pool_duplicate_subsets(self) -> None:
        assert pool_duplicate_subsets(self.df_data_subsets).shape == (6, 3)
        assert pool_duplicate_subsets(
            self.df_data_subsets,
            col_dupl_thresh=1,
        ).shape == (6, 6)

        assert pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape == (
            6,
            2,
        )

        assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[0].shape == (6, 3)
        assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[1] == [
            "c1",
            "c2",
            "c3",
            "c5",
        ]

        assert pool_duplicate_subsets(self.df_data_subsets, exclude=["c1"]).shape == (
            6,
            4,
        )

        assert pool_duplicate_subsets(
            self.df_data_subsets,
            exclude=["c1"],
            return_details=True,
        )[1] == ["c2", "c5", "c6"]
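
# Convenience entry point (assumed addition, not necessarily part of the
# upstream test module): lets the suite run directly via `python test_clean.py`
# in addition to pytest/unittest discovery.
if __name__ == "__main__":
    unittest.main()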