Passed: Push to master (c20d75...406f67) by Andreas, created 01:47

Test_drop_missing.test_drop_missing()   (rated B)

Complexity
    Conditions: 1

Size
    Total Lines: 53
    Code Lines: 38

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        1
eloc      38
nop       1
dl        0
loc       53
rs        8.968
c         0
b         0
f         0
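
The tool that produced the table above is not named in the report. As a rough local cross-check, assuming radon is an acceptable stand-in, cyclomatic complexity and raw line counts comparable to the cc, loc, and eloc values can be computed like this (the file path is illustrative):

# Hypothetical local cross-check of the reported metrics using radon
# (an assumption; the report does not name its metrics tool).
from radon.complexity import cc_visit
from radon.raw import analyze

with open("tests/test_clean.py") as f:  # illustrative path to the test module
    source = f.read()

# Raw counts: radon's loc/sloc are comparable to the "loc" / "eloc" figures.
print(analyze(source))

# Cyclomatic complexity per function and method; test_drop_missing has no
# branches, which is consistent with the reported cc of 1.
for block in cc_visit(source):
    print(block.name, block.complexity)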

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Conversely, when a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method, sketched below.
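
To illustrate Extract Method on a small, self-contained example (the function names below are made up for illustration and do not come from this repository): the commented step inside a method becomes a helper whose name is taken from the comment.

# Before: one method with a comment-separated step.
def summarize(values):
    # remove missing entries
    cleaned = [v for v in values if v is not None]
    return sum(cleaned) / len(cleaned) if cleaned else 0.0


# After: the commented step is extracted into a helper named after the comment.
def remove_missing_entries(values):
    return [v for v in values if v is not None]


def summarize_refactored(values):
    cleaned = remove_missing_entries(values)
    return sum(cleaned) / len(cleaned) if cleaned else 0.0

Applied to a test method, the same idea usually means moving comment-separated assertion groups into helpers or separate tests, as sketched after the listing below.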

import numpy as np
import pandas as pd
import unittest
from ..clean import (
    data_cleaning,
    drop_missing,
    convert_datatypes,
    pool_duplicate_subsets,
)


class Test_drop_missing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))

        # Drop further columns based on threshold
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3)
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]
            ).shape,
            (4, 4),
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0)
        )

        # Drop further rows based on threshold
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4)
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]
            ).shape,
            (2, 5),
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]
            ).shape,
            (2, 4),
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]
            ).shape,
            (3, 5),
        )


class Test_data_cleaning(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 will be dropped despite being in col_exclude because it is single valued
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i],
                expected_results[i],
            )


class Test_convert_dtypes(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def test_convert_dtypes(self):
        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "category",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i],
                expected_results[i],
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i]
            )

        expected_results = ["int8", "float32", "string", "string", "object", "category"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]
                ).dtypes[i],
                expected_results[i],
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "category",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]
                ).dtypes[i],
                expected_results[i],
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert,
                    category=False,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i],
                expected_results[i],
            )


class Test_pool_duplicate_subsets(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ]
        )

    def test_pool_duplicate_subsets(self):
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
        self.assertEqual(
            pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape,
            (6, 6),
        )
        self.assertEqual(
            pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2)
        )
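
Following up on the refactoring note above: since test_drop_missing is the method flagged as a Long Method, one option is to split its comment-separated sections into separately named tests. This is only a sketch; the class and method names are hypothetical, while the fixture and expected shapes are copied from the listing above.

# Hypothetical split of the flagged test_drop_missing into smaller tests.
# Fixture and expected shapes are taken from Test_drop_missing above.
class Test_drop_missing_split(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_defaults(self):
        # With defaults, the all-missing rows and the all-missing column are removed.
        self.assertEqual(drop_missing(self.df).shape, (4, 4))

    def test_column_threshold(self):
        # Columns are additionally dropped based on drop_threshold_cols.
        self.assertEqual(drop_missing(self.df, drop_threshold_cols=0.5).shape, (4, 3))
        self.assertEqual(drop_missing(self.df, drop_threshold_cols=0.49).shape, (4, 2))

    def test_row_threshold(self):
        # Rows are additionally dropped based on drop_threshold_rows.
        self.assertEqual(drop_missing(self.df, drop_threshold_rows=0.5).shape, (4, 4))
        self.assertEqual(drop_missing(self.df, drop_threshold_rows=0.49).shape, (3, 4))

Whether to reorganize the suite this way is a style decision; the assertions exercised stay the same, they are just spread over smaller, individually named methods.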