tests.test_clean.Test_drop_missing.test_drop_missing() - Code Metrics - Inspection of "ci and dependency updates (#98)" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — main ( 38f5af...23f36e )

by Andreas

created 2023-06-03 13:41 UTC

Test_drop_missing.test_drop_missing() A

↳ Parent: tests.test_clean

Complexity

Conditions

Size

Total Lines	34
Code Lines	30

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	30
nop	1
dl	0
loc	34
rs	9.16
c	0
b	0
f	0

from __future__ import annotations

import contextlib
import io
import sys
import unittest

import numpy as np
import pandas as pd
from klib.clean import clean_column_names
from klib.clean import convert_datatypes
from klib.clean import data_cleaning
from klib.clean import drop_missing
from klib.clean import pool_duplicate_subsets


class Test_clean_column_names(unittest.TestCase):
    """Tests for ``clean_column_names``: renaming results and printed hints."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build a 12-column frame with symbols, umlauts and duplicate names."""
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asdäöüß"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-ä-__________!?:;some/(... \n ..))(++$%/name/    -.....": [2, 3, 7],
            },
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
                "< #total@": [2, 6, 4],
                "count >= 10": [6, 3, 2],
            },
        )
        # Side-by-side concatenation makes "dupli" and "also" occur twice:
        # at positions 4/5 (from df1) and 7/8 (from df2).
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self) -> None:
        """Cleaned names must match exactly, with and without hints."""
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_hash_9",
            "asdaeoeuess",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
            "smaller_hash_total_at",
            "count_larger_equal_10",
        ]
        # Clean once per configuration and compare the complete list, so a
        # surplus or missing column is detected as well as a wrong name.
        for options in ({}, {"hints": False}):
            cleaned = clean_column_names(self.df_clean_column_names, **options)
            assert cleaned.columns.tolist() == expected_results

    def test_clean_column_names_prints(self) -> None:
        """With ``hints=True`` the duplicate/long-name hints go to stdout."""
        captured_output = io.StringIO()
        # redirect_stdout restores the previously active stream even when the
        # call raises, and does not clobber a capture installed by the runner.
        with contextlib.redirect_stdout(captured_output):
            clean_column_names(self.df_clean_column_names, hints=True)
        assert captured_output.getvalue() == (
            "(\"Duplicate column names detected! Columns with index [7, 8] and names ['dupli', 'also'] have been renamed to ['dupli_7', 'also_8'].\", \"Long column names detected (>25 characters). Consider renaming the following columns ['ae_some_plus_plus_dollar_percent_name', 'verylong_column_namesare_hardto_read'].\")\n"
        )


class Test_drop_missing(unittest.TestCase):
    """Shape checks for ``drop_missing`` under different thresholds."""

    @classmethod
    def setUpClass(cls):
        """Create the shared 6x5 fixture.

        Rows 0 and 1 are entirely missing and column ``c1`` is entirely
        missing; the remaining rows have 0, 0, 1 and 2 missing values in
        ``c2``..``c5``.
        """
        all_nan_row = [np.nan] * 5
        all_na_row = [pd.NA] * 5
        rows = [
            all_nan_row,
            all_na_row,
            [pd.NA, "b", "c", "d", "e"],
            [pd.NA, 6, 7, 8, 9],
            [pd.NA, 2, 3, 4, pd.NA],
            [pd.NA, 6, 7, pd.NA, pd.NA],
        ]
        cls.df_data_drop = pd.DataFrame(
            rows,
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        """Compare the resulting shape for each option combination."""

        def shape_after(**options):
            # Every case runs against the same class-level fixture.
            return drop_missing(self.df_data_drop, **options).shape

        # Defaults
        assert shape_after() == (4, 4)

        # Drop further columns based on threshold
        assert shape_after(drop_threshold_cols=0.5) == (4, 3)
        assert shape_after(drop_threshold_cols=0.5, col_exclude=["c1"]) == (4, 4)
        assert shape_after(drop_threshold_cols=0.49) == (4, 2)
        assert shape_after(drop_threshold_cols=0) == (0, 0)

        # Drop further rows based on threshold
        assert shape_after(drop_threshold_rows=0.67) == (4, 4)
        assert shape_after(drop_threshold_rows=0.5) == (4, 4)
        assert shape_after(drop_threshold_rows=0.49) == (3, 4)
        assert shape_after(drop_threshold_rows=0.25) == (3, 4)
        assert shape_after(drop_threshold_rows=0.24) == (2, 4)
        assert shape_after(drop_threshold_rows=0.24, col_exclude=["c1"]) == (2, 5)
        assert shape_after(drop_threshold_rows=0.24, col_exclude=["c2"]) == (2, 4)
        assert shape_after(drop_threshold_rows=0.51, col_exclude=["c1"]) == (3, 5)


class Test_data_cleaning(unittest.TestCase):
    """Tests for ``data_cleaning``: resulting shape, column names and dtypes."""

    @classmethod
    def setUpClass(cls) -> None:
        """Create the shared 6x6 fixture.

        ``c1`` is entirely missing, ``c6`` holds the constant 1, rows 0 and 1
        are missing everywhere except ``c6``, and one column name ("c 4")
        contains a space.
        """
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan, 1],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, 1],
                [pd.NA, "b", 6, "d", "e", 1],
                [pd.NA, "b", 7, 8, 9, 1],
                [pd.NA, "c", 3, 4, pd.NA, 1],
                [pd.NA, "d", 7, pd.NA, pd.NA, 1],
            ],
            columns=["c1", "c2", "c3", "c 4", "c5", "c6"],
        )

    def test_data_cleaning(self) -> None:
        """Check shapes, surviving column names and dtypes after cleaning."""
        assert data_cleaning(self.df_data_cleaning, show="all").shape == (5, 4)
        assert data_cleaning(self.df_data_cleaning, show=None).shape == (5, 4)

        assert data_cleaning(self.df_data_cleaning, col_exclude=["c6"]).shape == (5, 5)

        # With column-name cleaning disabled the original labels must survive.
        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        dtype_cases = (
            (True, ["string", "float32", "O", "O"]),
            (False, ["O", "O", "O", "O"]),
        )
        for convert, expected_results in dtype_cases:
            # Clean once per configuration instead of once per compared dtype.
            dtypes = data_cleaning(
                self.df_data_cleaning,
                convert_dtypes=convert,
            ).dtypes
            for position, expected in enumerate(expected_results):
                # The dtypes Series is labelled by column name, so positional
                # access has to be explicit; ``dtypes[position]`` relies on
                # the deprecated integer-key fallback.
                assert dtypes.iloc[position] == expected


class Test_convert_dtypes(unittest.TestCase):
    """Tests for ``convert_datatypes`` with different category settings."""

    @classmethod
    def setUpClass(cls) -> None:
        """Create the shared 6x6 fixture with default integer column labels.

        Columns by position: 0 small ints, 1 floats, 2 and 3 strings,
        4 entirely ``pd.NA``, 5 strings with two distinct values.
        """
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ],
        )

    def _assert_dtypes(self, expected_results: list[str], **options: object) -> None:
        """Convert the fixture once with ``options`` and compare dtypes by position."""
        dtypes = convert_datatypes(self.df_data_convert, **options).dtypes
        for position, expected in enumerate(expected_results):
            assert dtypes.iloc[position] == expected

    def test_convert_dtypes(self) -> None:
        """Check the resulting dtypes for each category configuration."""
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "category", "category"],
            cat_threshold=0.4,
        )

        # Default settings
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "string"],
        )

        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "category"],
            cat_threshold=0.5,
            cat_exclude=[4],
        )

        self._assert_dtypes(
            ["int8", "float32", "string", "category", "object", "category"],
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )

        # category=False: no column is expected to come back as "category".
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "string"],
            category=False,
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )


class Test_pool_duplicate_subsets(unittest.TestCase):
    """Shape and detail checks for ``pool_duplicate_subsets``."""

    @classmethod
    def setUpClass(cls):
        """Create the shared 6x6 fixture; column ``c5`` is entirely ``pd.NA``."""
        column_names = ["c1", "c2", "c3", "c4", "c5", "c6"]
        records = [
            [1, 7, "d", "x", pd.NA, "v"],
            [1, 8, "d", "e", pd.NA, "v"],
            [2, 7, "g", "z", pd.NA, "v"],
            [1, 7, "u", "f", pd.NA, "p"],
            [1, 7, "u", "z", pd.NA, "p"],
            [2, 7, "g", "z", pd.NA, "p"],
        ]
        cls.df_data_subsets = pd.DataFrame(records, columns=column_names)

    def test_pool_duplicate_subsets(self):
        """Compare the pooled shape and the reported subset columns."""
        frame = self.df_data_subsets

        # Defaults
        default_shape = pool_duplicate_subsets(frame).shape
        assert default_shape == (6, 3)

        # Thresholds
        strict_dupl_shape = pool_duplicate_subsets(frame, col_dupl_thresh=1).shape
        assert strict_dupl_shape == (6, 6)

        zero_subset_shape = pool_duplicate_subsets(frame, subset_thresh=0).shape
        assert zero_subset_shape == (6, 2)

        # return_details=True: item 0 is the frame, item 1 the column list
        pooled_frame = pool_duplicate_subsets(frame, return_details=True)[0]
        assert pooled_frame.shape == (6, 3)

        pooled_columns = pool_duplicate_subsets(frame, return_details=True)[1]
        assert pooled_columns == ["c1", "c2", "c3", "c5"]

        # Excluding a column
        excluded_shape = pool_duplicate_subsets(frame, exclude=["c1"]).shape
        assert excluded_shape == (6, 4)

        excluded_columns = pool_duplicate_subsets(
            frame,
            exclude=["c1"],
            return_details=True,
        )[1]
        assert excluded_columns == ["c2", "c5", "c6"]


1			from __future__ import annotations
2
3			import io
4			import sys
5			import unittest
6
7			import numpy as np
8			import pandas as pd
9			from klib.clean import clean_column_names
10			from klib.clean import convert_datatypes
11			from klib.clean import data_cleaning
12			from klib.clean import drop_missing
13			from klib.clean import pool_duplicate_subsets
14
15
16			class Test_clean_column_names(unittest.TestCase):
17			@classmethod
18			def setUpClass(cls) -> None:
19			cls.df1 = pd.DataFrame(
20			{
21			"Asd 5$ & (3€)": [1, 2, 3],
22			"3+3": [2, 3, 4],
23			"AsdFer #9": [3, 4, 5],
24			'"asdäöüß"': [5, 6, 7],
25			"dupli": [5, 6, 8],
26			"also": [9, 2, 7],
27			"-ä-__________!?:;some/(... \n ..))(++$%/name/ -.....": [2, 3, 7],
28			},
29			)
30			cls.df2 = pd.DataFrame(
31			{
32			"dupli": [3, 2, 1],
33			"also": [4, 5, 7],
34			"verylongColumnNamesareHardtoRead": [9, 2, 7],
35			"< #total@": [2, 6, 4],
36			"count >= 10": [6, 3, 2],
37			},
38			)
39			cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)
40
41			def test_clean_column_names(self):
42			expected_results = [
43			"asd_5_dollar_and_3_euro",
44			"3_plus_3",
45			"asd_fer_hash_9",
46			"asdaeoeuess",
47			"dupli",
48			"also",
49			"ae_some_plus_plus_dollar_percent_name",
50			"dupli_7",
51			"also_8",
52			"verylong_column_namesare_hardto_read",
53			"smaller_hash_total_at",
54			"count_larger_equal_10",
55			]
56			for i, _ in enumerate(expected_results):
57			assert (
58			clean_column_names(self.df_clean_column_names).columns[i]
59			== expected_results[i]
60			)
61			for i, _ in enumerate(expected_results):
62			assert (
63			clean_column_names(self.df_clean_column_names, hints=False).columns[i]
64			== expected_results[i]
65			)
66
67			def test_clean_column_names_prints(self):
68			captured_output = io.StringIO()
69			sys.stdout = captured_output
70			clean_column_names(self.df_clean_column_names, hints=True)
71			sys.stdout = sys.__stdout__
72			assert captured_output.getvalue() == (
73			"(\"Duplicate column names detected! Columns with index [7, 8] and names ['dupli', 'also'] have been renamed to ['dupli_7', 'also_8'].\", \"Long column names detected (>25 characters). Consider renaming the following columns ['ae_some_plus_plus_dollar_percent_name', 'verylong_column_namesare_hardto_read'].\")\n"
74			)
75
76
77			class Test_drop_missing(unittest.TestCase):
78			@classmethod
79			def setUpClass(cls):
80			cls.df_data_drop = pd.DataFrame(
81			[
82			[np.nan, np.nan, np.nan, np.nan, np.nan],
83			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
84			[pd.NA, "b", "c", "d", "e"],
85			[pd.NA, 6, 7, 8, 9],
86			[pd.NA, 2, 3, 4, pd.NA],
87			[pd.NA, 6, 7, pd.NA, pd.NA],
88			],
89			columns=["c1", "c2", "c3", "c4", "c5"],
90			)
91
92			def test_drop_missing(self):
93			assert drop_missing(self.df_data_drop).shape == (4, 4)
94
95			# Drop further columns based on threshold
96			assert drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape == (4, 3)
97			assert drop_missing(
98			self.df_data_drop,
99			drop_threshold_cols=0.5,
100			col_exclude=["c1"],
101			).shape == (4, 4)
102			assert drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape == (4, 2)
103			assert drop_missing(self.df_data_drop, drop_threshold_cols=0).shape == (0, 0)
104
105			# Drop further rows based on threshold
106			assert drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape == (4, 4)
107			assert drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape == (4, 4)
108			assert drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape == (3, 4)
109			assert drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape == (3, 4)
110			assert drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape == (2, 4)
111			assert drop_missing(
112			self.df_data_drop,
113			drop_threshold_rows=0.24,
114			col_exclude=["c1"],
115			).shape == (2, 5)
116			assert drop_missing(
117			self.df_data_drop,
118			drop_threshold_rows=0.24,
119			col_exclude=["c2"],
120			).shape == (2, 4)
121			assert drop_missing(
122			self.df_data_drop,
123			drop_threshold_rows=0.51,
124			col_exclude=["c1"],
125			).shape == (3, 5)
126
127
128			class Test_data_cleaning(unittest.TestCase):
129			@classmethod
130			def setUpClass(cls):
131			cls.df_data_cleaning = pd.DataFrame(
132			[
133			[np.nan, np.nan, np.nan, np.nan, np.nan, 1],
134			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, 1],
135			[pd.NA, "b", 6, "d", "e", 1],
136			[pd.NA, "b", 7, 8, 9, 1],
137			[pd.NA, "c", 3, 4, pd.NA, 1],
138			[pd.NA, "d", 7, pd.NA, pd.NA, 1],
139			],
140			columns=["c1", "c2", "c3", "c 4", "c5", "c6"],
141			)
142
143			def test_data_cleaning(self):
144			assert data_cleaning(self.df_data_cleaning, show="all").shape == (5, 4)
145			assert data_cleaning(self.df_data_cleaning, show=None).shape == (5, 4)
146
147			assert data_cleaning(self.df_data_cleaning, col_exclude=["c6"]).shape == (5, 5)
148
149			assert data_cleaning(
150			self.df_data_cleaning,
151			show="changes",
152			clean_col_names=False,
153			drop_duplicates=False,
154			).columns.tolist() == ["c2", "c3", "c 4", "c5"]
155
156			assert data_cleaning(
157			self.df_data_cleaning,
158			show="changes",
159			clean_col_names=False,
160			drop_duplicates=False,
161			).columns.tolist() == ["c2", "c3", "c 4", "c5"]
162
163			expected_results = ["string", "float32", "O", "O"]
164			for i, _ in enumerate(expected_results):
165			assert (
166			data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i]
167			== expected_results[i]
168			)
169
170			expected_results = ["O", "O", "O", "O"]
171			for i, _ in enumerate(expected_results):
172			assert (
173			data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
174			== expected_results[i]
175			)
176
177			expected_results = ["O", "O", "O", "O"]
178			for i, _ in enumerate(expected_results):
179			assert (
180			data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
181			== expected_results[i]
182			)
183
184
185			class Test_convert_dtypes(unittest.TestCase):
186			@classmethod
187			def setUpClass(cls):
188			cls.df_data_convert = pd.DataFrame(
189			[
190			[1, 7.0, "y", "x", pd.NA, "v"],
191			[3, 8.0, "d", "e", pd.NA, "v"],
192			[5, 7.0, "o", "z", pd.NA, "v"],
193			[1, 7.0, "u", "f", pd.NA, "p"],
194			[1, 7.0, "u", "f", pd.NA, "p"],
195			[2, 7.0, "g", "a", pd.NA, "p"],
196			],
197			)
198
199			def test_convert_dtypes(self):
200			expected_results = [
201			"int8",
202			"float32",
203			"string",
204			"string",
205			"category",
206			"category",
207			]
208			for i, _ in enumerate(expected_results):
209			assert (
210			convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i]
211			== expected_results[i]
212			)
213
214			expected_results = [
215			"int8",
216			"float32",
217			"string",
218			"string",
219			"object",
220			"string",
221			]
222			for i, _ in enumerate(expected_results):
223			assert (
224			convert_datatypes(self.df_data_convert).dtypes[i] == expected_results[i]
225			)
226
227			expected_results = [
228			"int8",
229			"float32",
230			"string",
231			"string",
232			"object",
233			"category",
234			]
235			for i, _ in enumerate(expected_results):
236			assert (
237			convert_datatypes(
238			self.df_data_convert,
239			cat_threshold=0.5,
240			cat_exclude=[4],
241			).dtypes[i]
242			== expected_results[i]
243			)
244
245			expected_results = [
246			"int8",
247			"float32",
248			"string",
249			"category",
250			"object",
251			"category",
252			]
253			for i, _ in enumerate(expected_results):
254			assert (
255			convert_datatypes(
256			self.df_data_convert,
257			cat_threshold=0.95,
258			cat_exclude=[2, 4],
259			).dtypes[i]
260			== expected_results[i]
261			)
262
263			expected_results = ["int8", "float32", "string", "string", "object", "string"]
264			for i, _ in enumerate(expected_results):
265			assert (
266			convert_datatypes(
267			self.df_data_convert,
268			category=False,
269			cat_threshold=0.95,
270			cat_exclude=[2, 4],
271			).dtypes[i]
272			== expected_results[i]
273			)
274
275
276			class Test_pool_duplicate_subsets(unittest.TestCase):
277			@classmethod
278			def setUpClass(cls):
279			cls.df_data_subsets = pd.DataFrame(
280			[
281			[1, 7, "d", "x", pd.NA, "v"],
282			[1, 8, "d", "e", pd.NA, "v"],
283			[2, 7, "g", "z", pd.NA, "v"],
284			[1, 7, "u", "f", pd.NA, "p"],
285			[1, 7, "u", "z", pd.NA, "p"],
286			[2, 7, "g", "z", pd.NA, "p"],
287			],
288			columns=["c1", "c2", "c3", "c4", "c5", "c6"],
289			)
290
291			def test_pool_duplicate_subsets(self):
292			assert pool_duplicate_subsets(self.df_data_subsets).shape == (6, 3)
293			assert pool_duplicate_subsets(
294			self.df_data_subsets,
295			col_dupl_thresh=1,
296			).shape == (6, 6)
297
298			assert pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape == (
299			6,
300			2,
301			)
302
303			assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[
304			0
305			].shape == (6, 3)
306			assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[1] == [
307			"c1",
308			"c2",
309			"c3",
310			"c5",
311			]
312
313			assert pool_duplicate_subsets(self.df_data_subsets, exclude=["c1"]).shape == (
314			6,
315			4,
316			)
317
318			assert pool_duplicate_subsets(
319			self.df_data_subsets,
320			exclude=["c1"],
321			return_details=True,
322			)[1] == ["c2", "c5", "c6"]
323

akanz1 / klib

GitHub Access Token became invalid

Push — main ( 38f5af...23f36e )

Test_drop_missing.test_drop_missing() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like