GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 012cfd...853c75 )
by Andreas
01:13
created

klib.tests.test_clean   A

Complexity

Total Complexity 14

Size/Duplication

Total Lines 165
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 123
dl 0
loc 165
rs 10
c 0
b 0
f 0
wmc 14

8 Methods

Rating   Name   Duplication   Size   Complexity  
A Test_drop_missing.setUpClass() 0 12 1
A Test_data_cleaning.test_data_cleaning() 0 9 2
B Test_convert_dtypes.test_convert_dtypes() 0 46 6
A Test_data_cleaning.setUpClass() 0 12 1
A Test_drop_missing.test_drop_missing() 0 25 1
A Test_convert_dtypes.setUpClass() 0 10 1
A Test_pool_duplicate_subsets.test_pool_duplicate_subsets() 0 6 1
A Test_pool_duplicate_subsets.setUpClass() 0 10 1
1
import numpy as np
2
import pandas as pd
3
import unittest
4
from ..clean import (
5
    data_cleaning,
6
    drop_missing,
7
    convert_datatypes,
8
    pool_duplicate_subsets,
9
)
10
11
12
class Test_drop_missing(unittest.TestCase):
    """Tests for ``drop_missing`` on a frame with fully and partially missing rows/columns."""

    @classmethod
    def setUpClass(cls):
        # Fixture: column c1 is entirely missing; rows 0 and 1 are entirely missing.
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        # Each case: keyword arguments for drop_missing -> expected result shape.
        cases = [
            # Default call drops the all-missing column and rows.
            ({}, (4, 4)),
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for kwargs, expected_shape in cases:
            self.assertEqual(drop_missing(self.df_data_drop, **kwargs).shape, expected_shape)
53
54
55
class Test_data_cleaning(unittest.TestCase):
    """Tests for ``data_cleaning``: shape after cleaning and resulting dtypes."""

    @classmethod
    def setUpClass(cls):
        # Fixture: column c1 is entirely missing; rows 0 and 1 are entirely missing.
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 will be dropped despite in col_exclude because it is single valued
        self.assertEqual(data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4))

        expected_results = ["string", "int8", "O", "O"]
        # Clean once and reuse the result: the previous version re-ran the full
        # data_cleaning pipeline on every loop iteration.
        dtypes = data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes
        for i, expected in enumerate(expected_results):
            self.assertEqual(dtypes[i], expected)
80
81
82
class Test_convert_dtypes(unittest.TestCase):
    """Tests for ``convert_datatypes``: resulting dtypes under various thresholds/exclusions."""

    @classmethod
    def setUpClass(cls):
        # Fixture: int, float, high-cardinality strings, an all-NA column, and a
        # low-cardinality string column (candidates for "category").
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, dtypes, expected_results):
        # Compare a precomputed dtypes Series against the expected names, column by column.
        for i, expected in enumerate(expected_results):
            self.assertEqual(dtypes[i], expected)

    def test_convert_dtypes(self):
        # Convert once per parameter set and reuse the result: the previous version
        # re-ran convert_datatypes on every loop iteration (6 calls per case).
        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes,
            ["int8", "float32", "string", "string", "category", "category"],
        )

        self._assert_dtypes(
            convert_datatypes(self.df_data_convert).dtypes,
            ["int8", "float32", "string", "string", "object", "string"],
        )

        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]).dtypes,
            ["int8", "float32", "string", "string", "object", "category"],
        )

        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]).dtypes,
            ["int8", "float32", "string", "category", "object", "category"],
        )

        self._assert_dtypes(
            convert_datatypes(
                self.df_data_convert, category=False, cat_threshold=0.95, cat_exclude=[2, 4],
            ).dtypes,
            ["int8", "float32", "string", "string", "object", "string"],
        )
143
144
145
class Test_pool_duplicate_subsets(unittest.TestCase):
    """Tests for ``pool_duplicate_subsets``: shape of the pooled frame."""

    @classmethod
    def setUpClass(cls):
        # Fixture with duplicated column subsets (e.g. rows sharing values in
        # several columns) so pooling has something to collapse.
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ]
        )

    def test_pool_duplicate_subsets(self):
        # Each case: keyword arguments -> expected shape of the pooled result.
        for kwargs, expected_shape in [
            ({}, (6, 3)),
            ({"col_dupl_thresh": 1}, (6, 6)),
            ({"subset_thresh": 0}, (6, 2)),
        ]:
            self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, **kwargs).shape, expected_shape)
165