klib.tests.test_clean.Test_data_cleaning.setUpClass() - Code Metrics - Inspection of "update docstrings and tests" - akanz1/klib - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 922334...81fced )

by Andreas

created 2020-07-23 17:16 UTC

Test_data_cleaning.setUpClass() A

↳ Parent: klib.tests.test_clean

Complexity

Conditions

Size

Total Lines	12
Code Lines	11

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	11
nop	1
dl	0
loc	12
rs	9.85
c	0
b	0
f	0

import numpy as np
import pandas as pd
import unittest
from ..clean import data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets


class Test_drop_missing(unittest.TestCase):
    """Shape checks for ``drop_missing`` on a small frame containing missing values."""

    @classmethod
    def setUpClass(cls):
        """Build the shared 6x5 fixture once for the whole class.

        Row 0 is entirely ``np.nan``, row 1 is entirely ``pd.NA`` and column
        ``c1`` holds only missing values; the remaining cells mix strings,
        integers and ``pd.NA``.
        """
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        """Check the resulting shape for every keyword-argument combination.

        Each case runs in its own ``subTest`` so that a single failing
        combination is reported with its arguments and does not hide the
        results of the remaining cases.
        """
        # (keyword arguments passed to drop_missing, expected shape)
        cases = [
            ({}, (4, 4)),
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]

        for kwargs, expected_shape in cases:
            with self.subTest(**kwargs):
                self.assertEqual(drop_missing(self.df_data_drop, **kwargs).shape, expected_shape)


class Test_data_cleaning(unittest.TestCase):
    """Shape check for ``data_cleaning`` called with its default arguments."""

    @classmethod
    def setUpClass(cls):
        """Create the class-level 6x5 frame used by the test.

        The first row is all ``np.nan``, the second all ``pd.NA`` and column
        ``c1`` contains only missing values.
        """
        column_names = ["c1", "c2", "c3", "c4", "c5"]
        rows = [
            [np.nan, np.nan, np.nan, np.nan, np.nan],
            [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
            [pd.NA, "b", "c", "d", "e"],
            [pd.NA, 6, 7, 8, 9],
            [pd.NA, 2, 3, 4, pd.NA],
            [pd.NA, 6, 7, pd.NA, pd.NA],
        ]
        cls.df_data_cleaning = pd.DataFrame(rows, columns=column_names)

    def test_data_cleaning(self):
        """The cleaned fixture is expected to have shape (4, 4)."""
        cleaned = data_cleaning(self.df_data_cleaning)
        self.assertEqual(cleaned.shape, (4, 4))


class Test_convert_dtypes(unittest.TestCase):
    """Dtype checks for ``convert_datatypes`` under different category settings."""

    @classmethod
    def setUpClass(cls):
        """Build the shared 6x6 fixture (default integer column labels 0-5).

        Columns hold, in order: integers, floats, two string columns, a
        column containing only ``pd.NA`` and a two-valued string column.
        """
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, expected_results, **kwargs):
        """Convert the fixture with ``kwargs`` and compare each column's dtype.

        The conversion runs once per scenario instead of once per column.
        NOTE(review): this assumes ``convert_datatypes`` returns the same
        result on repeated calls with the same arguments - confirm.

        Args:
            expected_results: Expected dtype names, one per column, in column order.
            **kwargs: Keyword arguments forwarded to ``convert_datatypes``.
        """
        dtypes = convert_datatypes(self.df_data_convert, **kwargs).dtypes
        for i, expected in enumerate(expected_results):
            # One sub-test per column so a mismatch names the column and arguments.
            with self.subTest(column=i, **kwargs):
                self.assertEqual(dtypes[i], expected)

    def test_convert_dtypes(self):
        """Check the resulting dtypes for several keyword-argument combinations."""
        self._assert_dtypes(
            ["Int8", "Float32", "string", "string", "category", "category"], cat_threshold=0.4
        )

        self._assert_dtypes(["Int8", "Float32", "string", "string", "object", "string"])

        self._assert_dtypes(
            ["Int8", "Float32", "string", "string", "object", "category"],
            cat_threshold=0.5,
            cat_exclude=[4],
        )

        self._assert_dtypes(
            ["Int8", "Float32", "string", "category", "object", "category"],
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )

        self._assert_dtypes(
            ["Int8", "Float32", "string", "string", "object", "string"],
            category=False,
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )


class Test_pool_duplicate_subsets(unittest.TestCase):
    """Shape checks for ``pool_duplicate_subsets`` on a small 6x6 frame."""

    @classmethod
    def setUpClass(cls):
        """Create the class-level fixture (default integer column labels)."""
        rows = [
            [1, 7, "d", "x", pd.NA, "v"],
            [1, 8, "d", "e", pd.NA, "v"],
            [2, 7, "g", "z", pd.NA, "v"],
            [1, 7, "u", "f", pd.NA, "p"],
            [1, 7, "u", "z", pd.NA, "p"],
            [2, 7, "g", "z", pd.NA, "p"],
        ]
        cls.df_data_subsets = pd.DataFrame(rows)

    def test_pool_duplicate_subsets(self):
        """Compare the output shape for default and overridden thresholds."""
        frame = self.df_data_subsets

        default_shape = pool_duplicate_subsets(frame).shape
        self.assertEqual(default_shape, (6, 3))

        dupl_thresh_shape = pool_duplicate_subsets(frame, col_dupl_thresh=1).shape
        self.assertEqual(dupl_thresh_shape, (6, 6))

        subset_thresh_shape = pool_duplicate_subsets(frame, subset_thresh=0).shape
        self.assertEqual(subset_thresh_shape, (6, 2))


1			import numpy as np
2			import pandas as pd
3			import unittest
4			from ..clean import data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets
5
6
7			class Test_drop_missing(unittest.TestCase):
8			@classmethod
9			def setUpClass(cls):
10			cls.df_data_drop = pd.DataFrame(
11			[
12			[np.nan, np.nan, np.nan, np.nan, np.nan],
13			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
14			[pd.NA, "b", "c", "d", "e"],
15			[pd.NA, 6, 7, 8, 9],
16			[pd.NA, 2, 3, 4, pd.NA],
17			[pd.NA, 6, 7, pd.NA, pd.NA],
18			],
19			columns=["c1", "c2", "c3", "c4", "c5"],
20			)
21
22			def test_drop_missing(self):
23			self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))
24
25			# Drop further columns based on threshold
26			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3))
27			self.assertEqual(
28			drop_missing(self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]).shape, (4, 4)
29			)
30			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2))
31			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0))
32
33			# Drop further rows based on threshold
34			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4))
35			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4))
36			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4))
37			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4))
38			self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4))
39			self.assertEqual(
40			drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]).shape, (2, 5)
41			)
42			self.assertEqual(
43			drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]).shape, (2, 4)
44			)
45			self.assertEqual(
46			drop_missing(self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]).shape, (3, 5)
47			)
48
49
50			class Test_data_cleaning(unittest.TestCase):
51			@classmethod
52			def setUpClass(cls):
53			cls.df_data_cleaning = pd.DataFrame(
54			[
55			[np.nan, np.nan, np.nan, np.nan, np.nan],
56			[pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
57			[pd.NA, "b", "c", "d", "e"],
58			[pd.NA, 6, 7, 8, 9],
59			[pd.NA, 2, 3, 4, pd.NA],
60			[pd.NA, 6, 7, pd.NA, pd.NA],
61			],
62			columns=["c1", "c2", "c3", "c4", "c5"],
63			)
64
65			def test_data_cleaning(self):
66			self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
67
68
69			class Test_convert_dtypes(unittest.TestCase):
70			@classmethod
71			def setUpClass(cls):
72			cls.df_data_convert = pd.DataFrame(
73			[
74			[1, 7.0, "y", "x", pd.NA, "v"],
75			[3, 8.0, "d", "e", pd.NA, "v"],
76			[5, 7.0, "o", "z", pd.NA, "v"],
77			[1, 7.0, "u", "f", pd.NA, "p"],
78			[1, 7.0, "u", "f", pd.NA, "p"],
79			[2, 7.0, "g", "a", pd.NA, "p"],
80			]
81			)
82
83			def test_convert_dtypes(self):
84			expected_results = ["Int8", "Float32", "string", "string", "category", "category"]
85			for i, _ in enumerate(expected_results):
86			self.assertEqual(
87			convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i], expected_results[i]
88			)
89
90			expected_results = ["Int8", "Float32", "string", "string", "object", "string"]
91			for i, _ in enumerate(expected_results):
92			self.assertEqual(convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i])
93
94			expected_results = ["Int8", "Float32", "string", "string", "object", "category"]
95			for i, _ in enumerate(expected_results):
96			self.assertEqual(
97			convert_datatypes(self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]).dtypes[i],
98			expected_results[i],
99			)
100
101			expected_results = ["Int8", "Float32", "string", "category", "object", "category"]
102			for i, _ in enumerate(expected_results):
103			self.assertEqual(
104			convert_datatypes(self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]).dtypes[i],
105			expected_results[i],
106			)
107
108			expected_results = ["Int8", "Float32", "string", "string", "object", "string"]
109			for i, _ in enumerate(expected_results):
110			self.assertEqual(
111			convert_datatypes(
112			self.df_data_convert, category=False, cat_threshold=0.95, cat_exclude=[2, 4]
113			).dtypes[i],
114			expected_results[i],
115			)
116
117
118			class Test_pool_duplicate_subsets(unittest.TestCase):
119			@classmethod
120			def setUpClass(cls):
121			cls.df_data_subsets = pd.DataFrame(
122			[
123			[1, 7, "d", "x", pd.NA, "v"],
124			[1, 8, "d", "e", pd.NA, "v"],
125			[2, 7, "g", "z", pd.NA, "v"],
126			[1, 7, "u", "f", pd.NA, "p"],
127			[1, 7, "u", "z", pd.NA, "p"],
128			[2, 7, "g", "z", pd.NA, "p"],
129			]
130			)
131
132			def test_pool_duplicate_subsets(self):
133			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
134			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape, (6, 6))
135			self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2))
136

akanz1 / klib

GitHub Access Token became invalid

Push — master ( 922334...81fced )

Test_data_cleaning.setUpClass() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like