klib.tests.test_clean.Test_clean_column_names.setUpClass() - Code Metrics - Inspection of "include col name cleaning in data_cleaning()" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( bb0913...522ac3 )

by Andreas

created 2020-08-01 17:09 UTC

Test_clean_column_names.setUpClass() A

↳ Parent: klib.tests.test_clean

Complexity

Conditions

Size

Total Lines	16
Code Lines	13

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	13
nop	1
dl	0
loc	16
rs	9.75
c	0
b	0
f	0

import numpy as np
import pandas as pd
import unittest
from ..clean import clean_column_names, data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets


class Test_clean_column_names(unittest.TestCase):
    """Checks the column labels produced by clean_column_names()."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build a frame whose labels contain symbols, mixed case and duplicates."""
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asd"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
            }
        )
        cls.df2 = pd.DataFrame(
            {"dupli": [3, 2, 1], "also": [4, 5, 7], "verylongColumnNamesareHardtoRead": [9, 2, 7]}
        )
        # Side-by-side concatenation yields the duplicate "dupli" / "also" labels.
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        """Cleaned labels must equal the expected names, by default and with hints=False."""
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_number_9",
            "asd",
            "dupli",
            "also",
            "dupli_6",
            "also_7",
            "verylong_column_namesare_hardto_read",
        ]
        # Call the cleaner once per configuration (instead of once per column) and
        # compare the complete label list, so a missing or surplus column fails the
        # test just like a wrongly named one.
        cleaned = clean_column_names(self.df_clean_column_names)
        self.assertEqual(list(cleaned.columns), expected_results)

        cleaned_without_hints = clean_column_names(self.df_clean_column_names, hints=False)
        self.assertEqual(list(cleaned_without_hints.columns), expected_results)


class Test_drop_missing(unittest.TestCase):
    """Shape checks for drop_missing() under various thresholds and exclusions."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x5 frame with two fully missing rows and a fully missing column c1."""
        header = ["c1", "c2", "c3", "c4", "c5"]
        records = [
            [np.nan] * 5,
            [pd.NA] * 5,
            [pd.NA, "b", "c", "d", "e"],
            [pd.NA, 6, 7, 8, 9],
            [pd.NA, 2, 3, 4, pd.NA],
            [pd.NA, 6, 7, pd.NA, pd.NA],
        ]
        cls.df_data_drop = pd.DataFrame(records, columns=header)

    def test_drop_missing(self):
        """Each (keyword arguments, expected shape) scenario is checked in order."""
        frame = self.df_data_drop
        scenarios = [
            # Defaults
            ({}, (4, 4)),
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for options, wanted_shape in scenarios:
            self.assertEqual(drop_missing(frame, **options).shape, wanted_shape)


class Test_data_cleaning(unittest.TestCase):
    """Shape and dtype checks for the data_cleaning() pipeline."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x5 frame with two fully missing rows and a fully missing column c1."""
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        """The cleaned frame must have the expected shape and column dtypes."""
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 is expected to be dropped even though it is listed in col_exclude,
        # because it is single valued.
        self.assertEqual(data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4))

        expected_results = ["string", "int8", "O", "O"]
        # Run the pipeline once and compare every dtype by position via a plain list.
        # Indexing ``dtypes`` with an integer would be resolved as a positional
        # fallback on a label-indexed Series, which pandas has deprecated.
        cleaned = data_cleaning(self.df_data_cleaning, convert_dtypes=True)
        self.assertEqual(list(cleaned.dtypes), expected_results)


class Test_convert_dtypes(unittest.TestCase):
    """Dtype checks for convert_datatypes() with different category settings."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x6 frame mixing ints, floats, strings and an all-NA column."""
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def test_convert_dtypes(self):
        """Each keyword-argument combination must yield the listed column dtypes."""
        # (keyword arguments for convert_datatypes, expected dtype per column)
        cases = [
            (
                {"cat_threshold": 0.4},
                ["int8", "float32", "string", "string", "category", "category"],
            ),
            (
                {},
                ["int8", "float32", "string", "string", "object", "string"],
            ),
            (
                {"cat_threshold": 0.5, "cat_exclude": [4]},
                ["int8", "float32", "string", "string", "object", "category"],
            ),
            (
                {"cat_threshold": 0.95, "cat_exclude": [2, 4]},
                ["int8", "float32", "string", "category", "object", "category"],
            ),
            (
                {"category": False, "cat_threshold": 0.95, "cat_exclude": [2, 4]},
                ["int8", "float32", "string", "string", "object", "string"],
            ),
        ]
        for kwargs, expected_results in cases:
            # subTest labels a failure with the offending arguments and lets the
            # remaining cases still run.
            with self.subTest(**kwargs):
                # Convert once per case and compare the complete dtype list, so a
                # missing or surplus column is detected as well.
                converted = convert_datatypes(self.df_data_convert, **kwargs)
                self.assertEqual(list(converted.dtypes), expected_results)


class Test_pool_duplicate_subsets(unittest.TestCase):
    """Shape checks for pool_duplicate_subsets() with different thresholds."""

    @classmethod
    def setUpClass(cls):
        """Create a 6x6 frame containing repeated value combinations and an all-NA column."""
        records = [
            [1, 7, "d", "x", pd.NA, "v"],
            [1, 8, "d", "e", pd.NA, "v"],
            [2, 7, "g", "z", pd.NA, "v"],
            [1, 7, "u", "f", pd.NA, "p"],
            [1, 7, "u", "z", pd.NA, "p"],
            [2, 7, "g", "z", pd.NA, "p"],
        ]
        cls.df_data_subsets = pd.DataFrame(records)

    def test_pool_duplicate_subsets(self):
        """Each (keyword arguments, expected shape) scenario is checked in order."""
        frame = self.df_data_subsets
        scenarios = [
            ({}, (6, 3)),
            ({"col_dupl_thresh": 1}, (6, 6)),
            ({"subset_thresh": 0}, (6, 2)),
        ]
        for options, wanted_shape in scenarios:
            self.assertEqual(pool_duplicate_subsets(frame, **options).shape, wanted_shape)


1			import numpy as np
2			import pandas as pd
3			import unittest
4			from ..clean import clean_column_names, data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets
5
6
7			class Test_clean_column_names(unittest.TestCase):
8			@classmethod
9			def setUpClass(cls) -> None:
10			cls.df1 = pd.DataFrame(
11			{
12			"Asd 5$ & (3€)": [1, 2, 3],
13			"3+3": [2, 3, 4],
14			"AsdFer #9": [3, 4, 5],
15			'"asd"': [5, 6, 7],
16			"dupli": [5, 6, 8],
17			"also": [9, 2, 7],
18			}
19			)
20			cls.df2 = pd.DataFrame(
21			{"dupli": [3, 2, 1], "also": [4, 5, 7], "verylongColumnNamesareHardtoRead": [9, 2, 7]}
22			)
23			cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)
24
25			def test_clean_column_names(self):
26			expected_results = [
27			"asd_5_dollar_and_3_euro",
28			"3_plus_3",
29			"asd_fer_number_9",
30			"asd",
31			"dupli",
32			"also",
33			"dupli_6",
34			"also_7",
35			"verylong_column_namesare_hardto_read",
36			]
37			for i, _ in enumerate(expected_results):
38			self.assertEqual(clean_column_names(self.df_clean_column_names).columns[i], expected_results[i])
39			for i, _ in enumerate(expected_results):
40			self.assertEqual(
41			clean_column_names(self.df_clean_column_names, hints=False).columns[i], expected_results[i]
42			)
43
44
45			class Test_drop_missing(unittest.TestCase):
46			@classmethod
47			def setUpClass(cls):
48			cls.df_data_drop = pd.DataFrame(
49			[
50			[np.nan, np.nan, np.nan, np.nan, np.nan],
51			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
52			[pd.NA, "b", "c", "d", "e"],
53			[pd.NA, 6, 7, 8, 9],
54			[pd.NA, 2, 3, 4, pd.NA],
55			[pd.NA, 6, 7, pd.NA, pd.NA],
56			],
57			columns=["c1", "c2", "c3", "c4", "c5"],
58			)
59
60			def test_drop_missing(self):
61			self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))
62
63			# Drop further columns based on threshold
64			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3))
65			self.assertEqual(
66			drop_missing(self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]).shape, (4, 4)
67			)
68			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2))
69			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0))
70
71			# Drop further rows based on threshold
72			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4))
73			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4))
74			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4))
75			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4))
76			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4))
77			self.assertEqual(
78			drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]).shape, (2, 5)
79			)
80			self.assertEqual(
81			drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]).shape, (2, 4)
82			)
83			self.assertEqual(
84			drop_missing(self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]).shape, (3, 5)
85			)
86
87
88			class Test_data_cleaning(unittest.TestCase):
89			@classmethod
90			def setUpClass(cls):
91			cls.df_data_cleaning = pd.DataFrame(
92			[
93			[np.nan, np.nan, np.nan, np.nan, np.nan],
94			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
95			[pd.NA, "b", 6, "d", "e"],
96			[pd.NA, "b", 7, 8, 9],
97			[pd.NA, "c", 3, 4, pd.NA],
98			[pd.NA, "d", 7, pd.NA, pd.NA],
99			],
100			columns=["c1", "c2", "c3", "c4", "c5"],
101			)
102
103			def test_data_cleaning(self):
104			self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
105			# c1 will be dropped despite in col_exclude because it is single valued
106			self.assertEqual(
107			data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
108			)
109
110			expected_results = ["string", "int8", "O", "O"]
111			for i, _ in enumerate(expected_results):
112			self.assertEqual(
113			data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i],
114			expected_results[i],
115			)
116
117
118			class Test_convert_dtypes(unittest.TestCase):
119			@classmethod
120			def setUpClass(cls):
121			cls.df_data_convert = pd.DataFrame(
122			[
123			[1, 7.0, "y", "x", pd.NA, "v"],
124			[3, 8.0, "d", "e", pd.NA, "v"],
125			[5, 7.0, "o", "z", pd.NA, "v"],
126			[1, 7.0, "u", "f", pd.NA, "p"],
127			[1, 7.0, "u", "f", pd.NA, "p"],
128			[2, 7.0, "g", "a", pd.NA, "p"],
129			]
130			)
131
132			def test_convert_dtypes(self):
133			expected_results = ["int8", "float32", "string", "string", "category", "category"]
134			for i, _ in enumerate(expected_results):
135			self.assertEqual(
136			convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i], expected_results[i]
137			)
138
139			expected_results = ["int8", "float32", "string", "string", "object", "string"]
140			for i, _ in enumerate(expected_results):
141			self.assertEqual(convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i])
142
143			expected_results = ["int8", "float32", "string", "string", "object", "category"]
144			for i, _ in enumerate(expected_results):
145			self.assertEqual(
146			convert_datatypes(self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]).dtypes[i],
147			expected_results[i],
148			)
149
150			expected_results = ["int8", "float32", "string", "category", "object", "category"]
151			for i, _ in enumerate(expected_results):
152			self.assertEqual(
153			convert_datatypes(self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]).dtypes[i],
154			expected_results[i],
155			)
156
157			expected_results = ["int8", "float32", "string", "string", "object", "string"]
158			for i, _ in enumerate(expected_results):
159			self.assertEqual(
160			convert_datatypes(
161			self.df_data_convert, category=False, cat_threshold=0.95, cat_exclude=[2, 4]
162			).dtypes[i],
163			expected_results[i],
164			)
165
166
167			class Test_pool_duplicate_subsets(unittest.TestCase):
168			@classmethod
169			def setUpClass(cls):
170			cls.df_data_subsets = pd.DataFrame(
171			[
172			[1, 7, "d", "x", pd.NA, "v"],
173			[1, 8, "d", "e", pd.NA, "v"],
174			[2, 7, "g", "z", pd.NA, "v"],
175			[1, 7, "u", "f", pd.NA, "p"],
176			[1, 7, "u", "z", pd.NA, "p"],
177			[2, 7, "g", "z", pd.NA, "p"],
178			]
179			)
180
181			def test_pool_duplicate_subsets(self):
182			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
183			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape, (6, 6))
184			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2))
185

akanz1 / klib

GitHub Access Token became invalid

Push — master ( bb0913...522ac3 )

Test_clean_column_names.setUpClass() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like