klib.tests.test_clean.Test_drop_missing.test_drop_missing() - Code Metrics - Inspection of "update formatting" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 8dc02e...5d400f )

by Andreas

created 2020-09-21 06:15 UTC

Test_drop_missing.test_drop_missing() B

↳ Parent: klib.tests.test_clean

Complexity

Conditions

Size

Total Lines	53
Code Lines	38

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	38
nop	1
dl	0
loc	53
rs	8.968
c	0
b	0
f	0

How to fix Long Method

import numpy as np
import pandas as pd
import unittest
from ..clean import (
    clean_column_names,
    data_cleaning,
    drop_missing,
    convert_datatypes,
    pool_duplicate_subsets,
)


class Test_clean_column_names(unittest.TestCase):
    """Checks the column labels produced by ``clean_column_names``."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build a frame whose labels contain symbols, CamelCase and duplicates."""
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asd"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-Ä-__some/(... \n ..))++$%/name/   .........": [2, 3, 7],
            }
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
            }
        )
        # Concatenating column-wise yields "dupli" and "also" twice each.
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        """Cleaned labels match the expected names, with and without hints."""
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_number_9",
            "asd",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
        ]
        # Clean once per configuration rather than once per compared label;
        # the result does not depend on which label is being checked.
        for call_kwargs in ({}, {"hints": False}):
            cleaned_columns = clean_column_names(
                self.df_clean_column_names, **call_kwargs
            ).columns
            for position, expected_name in enumerate(expected_results):
                # subTest keeps going after a mismatch so every wrong label
                # is reported in a single run.
                with self.subTest(position=position, **call_kwargs):
                    self.assertEqual(cleaned_columns[position], expected_name)


class Test_drop_missing(unittest.TestCase):
    """Checks the shape returned by ``drop_missing`` for several thresholds."""

    @classmethod
    def setUpClass(cls):
        """Build a 6x5 frame with two all-missing rows and an all-missing ``c1``."""
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        """Each keyword combination yields the expected ``(rows, cols)`` shape."""
        # Each entry pairs the keyword arguments passed to drop_missing with
        # the shape the result must have. Order matches the original test.
        cases = [
            ({}, (4, 4)),
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for call_kwargs, expected_shape in cases:
            # subTest labels a failure with the offending keyword arguments
            # and lets the remaining cases still run.
            with self.subTest(**call_kwargs):
                self.assertEqual(
                    drop_missing(self.df_data_drop, **call_kwargs).shape,
                    expected_shape,
                )


class Test_data_cleaning(unittest.TestCase):
    """Checks the shape and dtypes of the frame returned by ``data_cleaning``."""

    @classmethod
    def setUpClass(cls):
        """Build a 6x5 frame with two all-missing rows and an all-missing ``c1``."""
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        """Default cleaning yields a 4x4 frame; dtype conversion is applied on request."""
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 is dropped even though it is listed in col_exclude, because it
        # is single-valued.
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        # Run the cleaning once; the result is the same for every column check.
        converted_dtypes = data_cleaning(
            self.df_data_cleaning, convert_dtypes=True
        ).dtypes
        for position, expected_dtype in enumerate(expected_results):
            with self.subTest(position=position):
                # The dtypes Series is indexed by column name, so positional
                # access must go through .iloc rather than an integer key.
                self.assertEqual(converted_dtypes.iloc[position], expected_dtype)


class Test_convert_dtypes(unittest.TestCase):
    """Checks the per-column dtypes produced by ``convert_datatypes``."""

    @classmethod
    def setUpClass(cls):
        """Build a 6x6 frame with int, float, string and all-missing columns."""
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, expected_results, **convert_kwargs):
        """Convert the fixture once and compare every column dtype in order.

        Args:
            expected_results: Expected dtype name for each column label,
                starting at label 0.
            **convert_kwargs: Keyword arguments forwarded to
                ``convert_datatypes``.
        """
        converted_dtypes = convert_datatypes(
            self.df_data_convert, **convert_kwargs
        ).dtypes
        # The fixture has no explicit column names, so its labels are the
        # integers 0..5 and an integer key here is a label lookup.
        for column, expected_dtype in enumerate(expected_results):
            with self.subTest(column=column, **convert_kwargs):
                self.assertEqual(converted_dtypes[column], expected_dtype)

    def test_convert_dtypes(self):
        """Dtypes follow the category switch, threshold and exclusion list."""
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "category", "category"],
            cat_threshold=0.4,
        )

        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "string"]
        )

        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "category"],
            cat_threshold=0.5,
            cat_exclude=[4],
        )

        self._assert_dtypes(
            ["int8", "float32", "string", "category", "object", "category"],
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )

        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "string"],
            category=False,
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )


class Test_pool_duplicate_subsets(unittest.TestCase):
    """Checks the shape returned by ``pool_duplicate_subsets``."""

    @classmethod
    def setUpClass(cls):
        """Build a 6x6 frame that contains repeated value combinations."""
        rows = [
            [1, 7, "d", "x", pd.NA, "v"],
            [1, 8, "d", "e", pd.NA, "v"],
            [2, 7, "g", "z", pd.NA, "v"],
            [1, 7, "u", "f", pd.NA, "p"],
            [1, 7, "u", "z", pd.NA, "p"],
            [2, 7, "g", "z", pd.NA, "p"],
        ]
        cls.df_data_subsets = pd.DataFrame(rows)

    def test_pool_duplicate_subsets(self):
        """Result shape for the defaults and for two threshold overrides."""
        frame = self.df_data_subsets

        # Default thresholds.
        default_shape = pool_duplicate_subsets(frame).shape
        self.assertEqual(default_shape, (6, 3))

        # col_dupl_thresh=1 leaves the frame at its original 6x6 shape.
        strict_dupl_shape = pool_duplicate_subsets(frame, col_dupl_thresh=1).shape
        self.assertEqual(strict_dupl_shape, (6, 6))

        # subset_thresh=0 yields the narrowest result of the three calls.
        zero_subset_shape = pool_duplicate_subsets(frame, subset_thresh=0).shape
        self.assertEqual(zero_subset_shape, (6, 2))


1			import numpy as np
2			import pandas as pd
3			import unittest
4			from ..clean import (
5			clean_column_names,
6			data_cleaning,
7			drop_missing,
8			convert_datatypes,
9			pool_duplicate_subsets,
10			)
11
12
13			class Test_clean_column_names(unittest.TestCase):
14			@classmethod
15			def setUpClass(cls) -> None:
16			cls.df1 = pd.DataFrame(
17			{
18			"Asd 5$ & (3€)": [1, 2, 3],
19			"3+3": [2, 3, 4],
20			"AsdFer #9": [3, 4, 5],
21			'"asd"': [5, 6, 7],
22			"dupli": [5, 6, 8],
23			"also": [9, 2, 7],
24			"-Ä-__some/(... \n ..))++$%/name/ .........": [2, 3, 7],
25			}
26			)
27			cls.df2 = pd.DataFrame(
28			{
29			"dupli": [3, 2, 1],
30			"also": [4, 5, 7],
31			"verylongColumnNamesareHardtoRead": [9, 2, 7],
32			}
33			)
34			cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)
35
36			def test_clean_column_names(self):
37			expected_results = [
38			"asd_5_dollar_and_3_euro",
39			"3_plus_3",
40			"asd_fer_number_9",
41			"asd",
42			"dupli",
43			"also",
44			"ae_some_plus_plus_dollar_percent_name",
45			"dupli_7",
46			"also_8",
47			"verylong_column_namesare_hardto_read",
48			]
49			for i, _ in enumerate(expected_results):
50			self.assertEqual(
51			clean_column_names(self.df_clean_column_names).columns[i],
52			expected_results[i],
53			)
54			for i, _ in enumerate(expected_results):
55			self.assertEqual(
56			clean_column_names(self.df_clean_column_names, hints=False).columns[i],
57			expected_results[i],
58			)
59
60
61			class Test_drop_missing(unittest.TestCase):
62			@classmethod
63			def setUpClass(cls):
64			cls.df_data_drop = pd.DataFrame(
65			[
66			[np.nan, np.nan, np.nan, np.nan, np.nan],
67			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
68			[pd.NA, "b", "c", "d", "e"],
69			[pd.NA, 6, 7, 8, 9],
70			[pd.NA, 2, 3, 4, pd.NA],
71			[pd.NA, 6, 7, pd.NA, pd.NA],
72			],
73			columns=["c1", "c2", "c3", "c4", "c5"],
74			)
75
76			def test_drop_missing(self):
77			self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))
78
79			# Drop further columns based on threshold
80			self.assertEqual(
81			drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3)
82			)
83			self.assertEqual(
84			drop_missing(
85			self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]
86			).shape,
87			(4, 4),
88			)
89			self.assertEqual(
90			drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2)
91			)
92			self.assertEqual(
93			drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0)
94			)
95
96			# Drop further rows based on threshold
97			self.assertEqual(
98			drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4)
99			)
100			self.assertEqual(
101			drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4)
102			)
103			self.assertEqual(
104			drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4)
105			)
106			self.assertEqual(
107			drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4)
108			)
109			self.assertEqual(
110			drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4)
111			)
112			self.assertEqual(
113			drop_missing(
114			self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]
115			).shape,
116			(2, 5),
117			)
118			self.assertEqual(
119			drop_missing(
120			self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]
121			).shape,
122			(2, 4),
123			)
124			self.assertEqual(
125			drop_missing(
126			self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]
127			).shape,
128			(3, 5),
129			)
130
131
132			class Test_data_cleaning(unittest.TestCase):
133			@classmethod
134			def setUpClass(cls):
135			cls.df_data_cleaning = pd.DataFrame(
136			[
137			[np.nan, np.nan, np.nan, np.nan, np.nan],
138			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
139			[pd.NA, "b", 6, "d", "e"],
140			[pd.NA, "b", 7, 8, 9],
141			[pd.NA, "c", 3, 4, pd.NA],
142			[pd.NA, "d", 7, pd.NA, pd.NA],
143			],
144			columns=["c1", "c2", "c3", "c4", "c5"],
145			)
146
147			def test_data_cleaning(self):
148			self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
149			# c1 will be dropped despite in col_exclude because it is single valued
150			self.assertEqual(
151			data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
152			)
153
154			expected_results = ["string", "int8", "O", "O"]
155			for i, _ in enumerate(expected_results):
156			self.assertEqual(
157			data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i],
158			expected_results[i],
159			)
160
161
162			class Test_convert_dtypes(unittest.TestCase):
163			@classmethod
164			def setUpClass(cls):
165			cls.df_data_convert = pd.DataFrame(
166			[
167			[1, 7.0, "y", "x", pd.NA, "v"],
168			[3, 8.0, "d", "e", pd.NA, "v"],
169			[5, 7.0, "o", "z", pd.NA, "v"],
170			[1, 7.0, "u", "f", pd.NA, "p"],
171			[1, 7.0, "u", "f", pd.NA, "p"],
172			[2, 7.0, "g", "a", pd.NA, "p"],
173			]
174			)
175
176			def test_convert_dtypes(self):
177			expected_results = [
178			"int8",
179			"float32",
180			"string",
181			"string",
182			"category",
183			"category",
184			]
185			for i, _ in enumerate(expected_results):
186			self.assertEqual(
187			convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i],
188			expected_results[i],
189			)
190
191			expected_results = ["int8", "float32", "string", "string", "object", "string"]
192			for i, _ in enumerate(expected_results):
193			self.assertEqual(
194			convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i]
195			)
196
197			expected_results = ["int8", "float32", "string", "string", "object", "category"]
198			for i, _ in enumerate(expected_results):
199			self.assertEqual(
200			convert_datatypes(
201			self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]
202			).dtypes[i],
203			expected_results[i],
204			)
205
206			expected_results = [
207			"int8",
208			"float32",
209			"string",
210			"category",
211			"object",
212			"category",
213			]
214			for i, _ in enumerate(expected_results):
215			self.assertEqual(
216			convert_datatypes(
217			self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]
218			).dtypes[i],
219			expected_results[i],
220			)
221
222			expected_results = ["int8", "float32", "string", "string", "object", "string"]
223			for i, _ in enumerate(expected_results):
224			self.assertEqual(
225			convert_datatypes(
226			self.df_data_convert,
227			category=False,
228			cat_threshold=0.95,
229			cat_exclude=[2, 4],
230			).dtypes[i],
231			expected_results[i],
232			)
233
234
235			class Test_pool_duplicate_subsets(unittest.TestCase):
236			@classmethod
237			def setUpClass(cls):
238			cls.df_data_subsets = pd.DataFrame(
239			[
240			[1, 7, "d", "x", pd.NA, "v"],
241			[1, 8, "d", "e", pd.NA, "v"],
242			[2, 7, "g", "z", pd.NA, "v"],
243			[1, 7, "u", "f", pd.NA, "p"],
244			[1, 7, "u", "z", pd.NA, "p"],
245			[2, 7, "g", "z", pd.NA, "p"],
246			]
247			)
248
249			def test_pool_duplicate_subsets(self):
250			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
251			self.assertEqual(
252			pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape,
253			(6, 6),
254			)
255			self.assertEqual(
256			pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2)
257			)
258

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 8dc02e...5d400f )

Test_drop_missing.test_drop_missing() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like