tests.test_clean - Code Metrics - Inspection of "Reorganize package" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — main (#17)

by Andreas

created 2021-12-26 12:27 UTC

tests.test_clean A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	277
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	211
dl	0
loc	277
rs	10
c	0
b	0
f	0
wmc	18

10 Methods

Rating	Name	Size	Complexity
B	Test_drop_missing.test_drop_missing()	53	1
A	Test_clean_column_names.setUpClass()	23	1
A	Test_convert_dtypes.setUpClass()	10	1
B	Test_convert_dtypes.test_convert_dtypes()	70	6
A	Test_drop_missing.setUpClass()	12	1
A	Test_data_cleaning.test_data_cleaning()	12	2
A	Test_clean_column_names.test_clean_column_names()	24	3
A	Test_pool_duplicate_subsets.test_pool_duplicate_subsets()	8	1
A	Test_pool_duplicate_subsets.setUpClass()	10	1
A	Test_data_cleaning.setUpClass()	12	1

import unittest

import numpy as np
import pandas as pd

from klib.clean import (
    clean_column_names,
    convert_datatypes,
    data_cleaning,
    drop_missing,
    pool_duplicate_subsets,
)


class Test_clean_column_names(unittest.TestCase):
    """Verify the column labels produced by ``clean_column_names``."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build a frame whose labels contain symbols, umlauts and duplicates."""
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asdäöüß"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-Ä-__________!?:;some/(... \n ..))(++$%/name/    -.....": [2, 3, 7],
            }
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
                "< #total@": [2, 6, 4],
                "count >= 10": [6, 3, 2],
            }
        )
        # Side-by-side concatenation leaves "dupli" and "also" present twice.
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        """The cleaned labels are the same with and without hints."""
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_hash_9",
            "asdaeoeuess",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
            "smaller_hash_total_at",
            "count_larger_equal_10",
        ]
        for kwargs in ({}, {"hints": False}):
            # Pass a copy so every configuration starts from the raw labels,
            # in case clean_column_names renames the columns of its argument
            # in place (not visible from this test module).
            cleaned = clean_column_names(self.df_clean_column_names.copy(), **kwargs)
            # One call per configuration; comparing the whole list also
            # catches missing or surplus columns.
            self.assertListEqual(list(cleaned.columns), expected_results)


class Test_drop_missing(unittest.TestCase):
    """Shape checks for ``drop_missing`` across thresholds and exclusions."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x5 frame: c1 and the first two rows hold only missing values."""
        rows = [
            [np.nan, np.nan, np.nan, np.nan, np.nan],
            [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
            [pd.NA, "b", "c", "d", "e"],
            [pd.NA, 6, 7, 8, 9],
            [pd.NA, 2, 3, 4, pd.NA],
            [pd.NA, 6, 7, pd.NA, pd.NA],
        ]
        cls.df_data_drop = pd.DataFrame(rows, columns=["c1", "c2", "c3", "c4", "c5"])

    def test_drop_missing(self):
        """Each keyword combination yields the expected result shape."""
        # (keyword arguments, expected shape), evaluated in this order.
        cases = [
            ({}, (4, 4)),
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for options, expected_shape in cases:
            observed = drop_missing(self.df_data_drop, **options)
            self.assertEqual(observed.shape, expected_shape)


class Test_data_cleaning(unittest.TestCase):
    """Check the shape and dtypes of the frame returned by ``data_cleaning``."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x5 frame: c1 and the first two rows hold only missing values."""
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        """The cleaned frame has shape (4, 4) and the expected column dtypes."""
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 is dropped even though it is listed in col_exclude, because it is
        # single valued
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        # Clean once and reuse the result instead of re-running per column.
        dtypes = data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes
        for position, expected in enumerate(expected_results):
            # The dtypes Series is indexed by column label, so positional
            # access must go through .iloc; plain [int] on a non-integer
            # index is deprecated in recent pandas releases.
            self.assertEqual(dtypes.iloc[position], expected)


class Test_convert_dtypes(unittest.TestCase):
    """Check the dtypes chosen by ``convert_datatypes`` for several settings."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x6 frame with int, float, string and all-missing columns."""
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, expected_results, **kwargs):
        """Convert the fixture once with ``kwargs`` and compare dtypes by position."""
        dtypes = convert_datatypes(self.df_data_convert, **kwargs).dtypes
        for position, expected in enumerate(expected_results):
            self.assertEqual(
                dtypes.iloc[position],
                expected,
                msg=f"column position {position} with arguments {kwargs}",
            )

    def test_convert_dtypes(self):
        """Each keyword combination yields the expected per-column dtypes."""
        # (keyword arguments, expected dtype per column), checked in order.
        cases = [
            (
                {"cat_threshold": 0.4},
                ["int8", "Float32", "string", "string", "category", "category"],
            ),
            (
                {},
                ["int8", "Float32", "string", "string", "object", "string"],
            ),
            (
                {"cat_threshold": 0.5, "cat_exclude": [4]},
                ["int8", "Float32", "string", "string", "object", "category"],
            ),
            (
                {"cat_threshold": 0.95, "cat_exclude": [2, 4]},
                ["int8", "Float32", "string", "category", "object", "category"],
            ),
            (
                {"category": False, "cat_threshold": 0.95, "cat_exclude": [2, 4]},
                ["int8", "Float32", "string", "string", "object", "string"],
            ),
        ]
        for kwargs, expected_results in cases:
            self._assert_dtypes(expected_results, **kwargs)


class Test_pool_duplicate_subsets(unittest.TestCase):
    """Shape checks for ``pool_duplicate_subsets`` under different thresholds."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x6 frame with repeated value combinations across columns."""
        records = [
            [1, 7, "d", "x", pd.NA, "v"],
            [1, 8, "d", "e", pd.NA, "v"],
            [2, 7, "g", "z", pd.NA, "v"],
            [1, 7, "u", "f", pd.NA, "p"],
            [1, 7, "u", "z", pd.NA, "p"],
            [2, 7, "g", "z", pd.NA, "p"],
        ]
        cls.df_data_subsets = pd.DataFrame(records)

    def test_pool_duplicate_subsets(self):
        """Each keyword combination yields the expected result shape."""
        # (keyword arguments, expected shape), evaluated in this order.
        cases = (
            ({}, (6, 3)),
            ({"col_dupl_thresh": 1}, (6, 6)),
            ({"subset_thresh": 0}, (6, 2)),
        )
        for options, expected_shape in cases:
            pooled = pool_duplicate_subsets(self.df_data_subsets, **options)
            self.assertEqual(pooled.shape, expected_shape)


1			import unittest
2
3			import numpy as np
4			import pandas as pd
5
6			from klib.clean import (
7			clean_column_names,
8			convert_datatypes,
9			data_cleaning,
10			drop_missing,
11			pool_duplicate_subsets,
12			)
13
14
15			class Test_clean_column_names(unittest.TestCase):
16			@classmethod
17			def setUpClass(cls) -> None:
18			cls.df1 = pd.DataFrame(
19			{
20			"Asd 5$ & (3€)": [1, 2, 3],
21			"3+3": [2, 3, 4],
22			"AsdFer #9": [3, 4, 5],
23			'"asdäöüß"': [5, 6, 7],
24			"dupli": [5, 6, 8],
25			"also": [9, 2, 7],
26			"-Ä-__________!?:;some/(... \n ..))(++$%/name/ -.....": [2, 3, 7],
27			}
28			)
29			cls.df2 = pd.DataFrame(
30			{
31			"dupli": [3, 2, 1],
32			"also": [4, 5, 7],
33			"verylongColumnNamesareHardtoRead": [9, 2, 7],
34			"< #total@": [2, 6, 4],
35			"count >= 10": [6, 3, 2],
36			}
37			)
38			cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)
39
40			def test_clean_column_names(self):
41			expected_results = [
42			"asd_5_dollar_and_3_euro",
43			"3_plus_3",
44			"asd_fer_hash_9",
45			"asdaeoeuess",
46			"dupli",
47			"also",
48			"ae_some_plus_plus_dollar_percent_name",
49			"dupli_7",
50			"also_8",
51			"verylong_column_namesare_hardto_read",
52			"smaller_hash_total_at",
53			"count_larger_equal_10",
54			]
55			for i, _ in enumerate(expected_results):
56			self.assertEqual(
57			clean_column_names(self.df_clean_column_names).columns[i],
58			expected_results[i],
59			)
60			for i, _ in enumerate(expected_results):
61			self.assertEqual(
62			clean_column_names(self.df_clean_column_names, hints=False).columns[i],
63			expected_results[i],
64			)
65
66
67			class Test_drop_missing(unittest.TestCase):
68			@classmethod
69			def setUpClass(cls):
70			cls.df_data_drop = pd.DataFrame(
71			[
72			[np.nan, np.nan, np.nan, np.nan, np.nan],
73			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
74			[pd.NA, "b", "c", "d", "e"],
75			[pd.NA, 6, 7, 8, 9],
76			[pd.NA, 2, 3, 4, pd.NA],
77			[pd.NA, 6, 7, pd.NA, pd.NA],
78			],
79			columns=["c1", "c2", "c3", "c4", "c5"],
80			)
81
82			def test_drop_missing(self):
83			self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))
84
85			# Drop further columns based on threshold
86			self.assertEqual(
87			drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3)
88			)
89			self.assertEqual(
90			drop_missing(
91			self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]
92			).shape,
93			(4, 4),
94			)
95			self.assertEqual(
96			drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2)
97			)
98			self.assertEqual(
99			drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0)
100			)
101
102			# Drop further rows based on threshold
103			self.assertEqual(
104			drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4)
105			)
106			self.assertEqual(
107			drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4)
108			)
109			self.assertEqual(
110			drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4)
111			)
112			self.assertEqual(
113			drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4)
114			)
115			self.assertEqual(
116			drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4)
117			)
118			self.assertEqual(
119			drop_missing(
120			self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]
121			).shape,
122			(2, 5),
123			)
124			self.assertEqual(
125			drop_missing(
126			self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]
127			).shape,
128			(2, 4),
129			)
130			self.assertEqual(
131			drop_missing(
132			self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]
133			).shape,
134			(3, 5),
135			)
136
137
138			class Test_data_cleaning(unittest.TestCase):
139			@classmethod
140			def setUpClass(cls):
141			cls.df_data_cleaning = pd.DataFrame(
142			[
143			[np.nan, np.nan, np.nan, np.nan, np.nan],
144			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
145			[pd.NA, "b", 6, "d", "e"],
146			[pd.NA, "b", 7, 8, 9],
147			[pd.NA, "c", 3, 4, pd.NA],
148			[pd.NA, "d", 7, pd.NA, pd.NA],
149			],
150			columns=["c1", "c2", "c3", "c4", "c5"],
151			)
152
153			def test_data_cleaning(self):
154			self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
155			# c1 will be dropped despite in col_exclude because it is single valued
156			self.assertEqual(
157			data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
158			)
159
160			expected_results = ["string", "int8", "O", "O"]
161			for i, _ in enumerate(expected_results):
162			self.assertEqual(
163			data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i],
164			expected_results[i],
165			)
166
167
168			class Test_convert_dtypes(unittest.TestCase):
169			@classmethod
170			def setUpClass(cls):
171			cls.df_data_convert = pd.DataFrame(
172			[
173			[1, 7.0, "y", "x", pd.NA, "v"],
174			[3, 8.0, "d", "e", pd.NA, "v"],
175			[5, 7.0, "o", "z", pd.NA, "v"],
176			[1, 7.0, "u", "f", pd.NA, "p"],
177			[1, 7.0, "u", "f", pd.NA, "p"],
178			[2, 7.0, "g", "a", pd.NA, "p"],
179			]
180			)
181
182			def test_convert_dtypes(self):
183			expected_results = [
184			"int8",
185			"Float32",
186			"string",
187			"string",
188			"category",
189			"category",
190			]
191			for i, _ in enumerate(expected_results):
192			self.assertEqual(
193			convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i],
194			expected_results[i],
195			)
196
197			expected_results = [
198			"int8",
199			"Float32",
200			"string",
201			"string",
202			"object",
203			"string",
204			]
205			for i, _ in enumerate(expected_results):
206			self.assertEqual(
207			convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i]
208			)
209
210			expected_results = [
211			"int8",
212			"Float32",
213			"string",
214			"string",
215			"object",
216			"category",
217			]
218			for i, _ in enumerate(expected_results):
219			self.assertEqual(
220			convert_datatypes(
221			self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]
222			).dtypes[i],
223			expected_results[i],
224			)
225
226			expected_results = [
227			"int8",
228			"Float32",
229			"string",
230			"category",
231			"object",
232			"category",
233			]
234			for i, _ in enumerate(expected_results):
235			self.assertEqual(
236			convert_datatypes(
237			self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]
238			).dtypes[i],
239			expected_results[i],
240			)
241
242			expected_results = ["int8", "Float32", "string", "string", "object", "string"]
243			for i, _ in enumerate(expected_results):
244			self.assertEqual(
245			convert_datatypes(
246			self.df_data_convert,
247			category=False,
248			cat_threshold=0.95,
249			cat_exclude=[2, 4],
250			).dtypes[i],
251			expected_results[i],
252			)
253
254
255			class Test_pool_duplicate_subsets(unittest.TestCase):
256			@classmethod
257			def setUpClass(cls):
258			cls.df_data_subsets = pd.DataFrame(
259			[
260			[1, 7, "d", "x", pd.NA, "v"],
261			[1, 8, "d", "e", pd.NA, "v"],
262			[2, 7, "g", "z", pd.NA, "v"],
263			[1, 7, "u", "f", pd.NA, "p"],
264			[1, 7, "u", "z", pd.NA, "p"],
265			[2, 7, "g", "z", pd.NA, "p"],
266			]
267			)
268
269			def test_pool_duplicate_subsets(self):
270			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
271			self.assertEqual(
272			pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape,
273			(6, 6),
274			)
275			self.assertEqual(
276			pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2)
277			)
278

akanz1 / klib

GitHub Access Token became invalid

Pull Request — main (#17)

tests.test_clean A

Complexity

Size/Duplication

Importance

10 Methods

Duplication Side-by-Side

Filter issues like