GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 012cfd...853c75 )
by Andreas
01:13
created

klib.tests.test_clean   A

Complexity

Total Complexity 14

Size/Duplication

Total Lines 165
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 123
dl 0
loc 165
rs 10
c 0
b 0
f 0
wmc 14

8 Methods

Rating   Name   Duplication   Size   Complexity  
A Test_drop_missing.setUpClass() 0 12 1
A Test_data_cleaning.test_data_cleaning() 0 9 2
B Test_convert_dtypes.test_convert_dtypes() 0 46 6
A Test_data_cleaning.setUpClass() 0 12 1
A Test_drop_missing.test_drop_missing() 0 25 1
A Test_convert_dtypes.setUpClass() 0 10 1
A Test_pool_duplicate_subsets.test_pool_duplicate_subsets() 0 6 1
A Test_pool_duplicate_subsets.setUpClass() 0 10 1
1
import numpy as np
2
import pandas as pd
3
import unittest
4
from ..clean import (
5
    data_cleaning,
6
    drop_missing,
7
    convert_datatypes,
8
    pool_duplicate_subsets,
9
)
10
11
12
class Test_drop_missing(unittest.TestCase):
    """Tests for ``drop_missing`` on a frame with fully and partially missing rows/columns."""

    @classmethod
    def setUpClass(cls):
        # Fixture: column c1 is entirely missing; rows 0 and 1 are entirely missing.
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        # Each case: keyword arguments for drop_missing -> expected result shape.
        cases = [
            # Default call drops the all-missing column and rows.
            ({}, (4, 4)),
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for kwargs, expected_shape in cases:
            self.assertEqual(drop_missing(self.df_data_drop, **kwargs).shape, expected_shape)
53
54
55
class Test_data_cleaning(unittest.TestCase):
    """Tests for ``data_cleaning``: shape after cleaning and resulting dtypes."""

    @classmethod
    def setUpClass(cls):
        # Fixture: column c1 is entirely missing; rows 0 and 1 are entirely missing.
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 will be dropped despite in col_exclude because it is single valued
        self.assertEqual(data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4))

        expected_results = ["string", "int8", "O", "O"]
        # Clean once and reuse the result: the previous version re-ran the full
        # data_cleaning pipeline on every loop iteration.
        dtypes = data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes
        for i, expected in enumerate(expected_results):
            self.assertEqual(dtypes[i], expected)
80
81
82
class Test_convert_dtypes(unittest.TestCase):
    """Tests for ``convert_datatypes``: resulting dtypes under various thresholds/exclusions."""

    @classmethod
    def setUpClass(cls):
        # Fixture: int, float, high-cardinality strings, an all-NA column, and a
        # low-cardinality string column (candidates for "category").
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, dtypes, expected_results):
        # Compare a precomputed dtypes Series against the expected names, column by column.
        for i, expected in enumerate(expected_results):
            self.assertEqual(dtypes[i], expected)

    def test_convert_dtypes(self):
        # Convert once per parameter set and reuse the result: the previous version
        # re-ran convert_datatypes on every loop iteration (6 calls per case).
        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes,
            ["int8", "float32", "string", "string", "category", "category"],
        )

        self._assert_dtypes(
            convert_datatypes(self.df_data_convert).dtypes,
            ["int8", "float32", "string", "string", "object", "string"],
        )

        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]).dtypes,
            ["int8", "float32", "string", "string", "object", "category"],
        )

        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]).dtypes,
            ["int8", "float32", "string", "category", "object", "category"],
        )

        self._assert_dtypes(
            convert_datatypes(
                self.df_data_convert, category=False, cat_threshold=0.95, cat_exclude=[2, 4],
            ).dtypes,
            ["int8", "float32", "string", "string", "object", "string"],
        )
143
144
145
class Test_pool_duplicate_subsets(unittest.TestCase):
    """Tests for ``pool_duplicate_subsets``: shape of the pooled frame."""

    @classmethod
    def setUpClass(cls):
        # Fixture with duplicated column subsets (e.g. rows sharing values in
        # several columns) so pooling has something to collapse.
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ]
        )

    def test_pool_duplicate_subsets(self):
        # Each case: keyword arguments -> expected shape of the pooled result.
        for kwargs, expected_shape in [
            ({}, (6, 3)),
            ({"col_dupl_thresh": 1}, (6, 6)),
            ({"subset_thresh": 0}, (6, 2)),
        ]:
            self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, **kwargs).shape, expected_shape)
165