GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Test Failed
Pull Request — main (#17)
by Andreas
02:31
created

tests.test_clean   A

Complexity

Total Complexity 18

Size/Duplication

Total Lines 277
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 211
dl 0
loc 277
rs 10
c 0
b 0
f 0
wmc 18

10 Methods

Rating   Name   Duplication   Size   Complexity  
B Test_drop_missing.test_drop_missing() 0 53 1
A Test_clean_column_names.setUpClass() 0 23 1
A Test_convert_dtypes.setUpClass() 0 10 1
B Test_convert_dtypes.test_convert_dtypes() 0 70 6
A Test_drop_missing.setUpClass() 0 12 1
A Test_data_cleaning.test_data_cleaning() 0 12 2
A Test_clean_column_names.test_clean_column_names() 0 24 3
A Test_pool_duplicate_subsets.test_pool_duplicate_subsets() 0 8 1
A Test_pool_duplicate_subsets.setUpClass() 0 10 1
A Test_data_cleaning.setUpClass() 0 12 1
1
import unittest
2
3
import numpy as np
4
import pandas as pd
5
6
from klib.clean import (
7
    clean_column_names,
8
    convert_datatypes,
9
    data_cleaning,
10
    drop_missing,
11
    pool_duplicate_subsets,
12
)
13
14
15
class Test_clean_column_names(unittest.TestCase):
    """Tests for klib.clean.clean_column_names on messy, duplicated headers."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build a frame whose headers contain symbols, umlauts and duplicates."""
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asdäöüß"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-Ä-__________!?:;some/(... \n ..))(++$%/name/    -.....": [2, 3, 7],
            }
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
                "< #total@": [2, 6, 4],
                "count >= 10": [6, 3, 2],
            }
        )
        # concat on axis=1 keeps the duplicate headers ("dupli", "also")
        # side by side, so deduplication can be exercised as well
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        """Cleaned headers match the expected names with and without hints."""
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_hash_9",
            "asdaeoeuess",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
            "smaller_hash_total_at",
            "count_larger_equal_10",
        ]
        # Hoist the loop-invariant calls: the original re-ran
        # clean_column_names on the full frame once per expected column.
        # Comparing whole lists also yields a readable diff on failure.
        cleaned = list(clean_column_names(self.df_clean_column_names).columns)
        cleaned_no_hints = list(
            clean_column_names(self.df_clean_column_names, hints=False).columns
        )
        self.assertEqual(cleaned, expected_results)
        self.assertEqual(cleaned_no_hints, expected_results)
65
66
67
class Test_drop_missing(unittest.TestCase):
    """Tests for klib.clean.drop_missing thresholds and column exclusions."""

    @classmethod
    def setUpClass(cls):
        """Create a frame with an all-missing column (c1) and all-missing rows."""
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        """Resulting shape for the default call and for each threshold case."""
        # Default call: fully-missing rows and columns are removed.
        self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))

        # Each case is (keyword arguments, expected shape); covers column
        # thresholds, row thresholds, and col_exclude interactions.
        cases = [
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for kwargs, expected_shape in cases:
            with self.subTest(**kwargs):
                self.assertEqual(
                    drop_missing(self.df_data_drop, **kwargs).shape, expected_shape
                )
136
137
138
class Test_data_cleaning(unittest.TestCase):
    """Tests for klib.clean.data_cleaning (shape and dtype conversion)."""

    @classmethod
    def setUpClass(cls):
        """Create a frame with a single-valued all-NA column and mixed dtypes."""
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        """Resulting shape and converted dtypes of the cleaned frame."""
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 will be dropped despite being in col_exclude because it is
        # single valued
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        # Call data_cleaning once (the original recomputed it per column) and
        # index positionally via .iloc: integer [] lookup on a string-labeled
        # dtypes Series relies on a deprecated positional fallback in pandas.
        dtypes = data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes
        for i, expected in enumerate(expected_results):
            with self.subTest(position=i):
                self.assertEqual(dtypes.iloc[i], expected)
166
167
168
class Test_convert_dtypes(unittest.TestCase):
    """Tests for klib.clean.convert_datatypes with various category settings."""

    @classmethod
    def setUpClass(cls):
        """Create a frame of ints, floats, strings and an all-NA column."""
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, df, expected_results):
        """Compare *df*'s dtypes positionally against the expected names."""
        # .iloc avoids the deprecated integer-[] positional fallback and the
        # frame is converted once per scenario instead of once per column.
        for i, expected in enumerate(expected_results):
            with self.subTest(position=i):
                self.assertEqual(df.dtypes.iloc[i], expected)

    def test_convert_dtypes(self):
        """Converted dtypes for different cat_threshold/cat_exclude settings."""
        self._assert_dtypes(
            convert_datatypes(self.df_data_convert, cat_threshold=0.4),
            ["int8", "Float32", "string", "string", "category", "category"],
        )
        self._assert_dtypes(
            convert_datatypes(self.df_data_convert),
            ["int8", "Float32", "string", "string", "object", "string"],
        )
        self._assert_dtypes(
            convert_datatypes(
                self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]
            ),
            ["int8", "Float32", "string", "string", "object", "category"],
        )
        self._assert_dtypes(
            convert_datatypes(
                self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]
            ),
            ["int8", "Float32", "string", "category", "object", "category"],
        )
        self._assert_dtypes(
            convert_datatypes(
                self.df_data_convert,
                category=False,
                cat_threshold=0.95,
                cat_exclude=[2, 4],
            ),
            ["int8", "Float32", "string", "string", "object", "string"],
        )
253
254
255
class Test_pool_duplicate_subsets(unittest.TestCase):
    """Tests for klib.clean.pool_duplicate_subsets."""

    @classmethod
    def setUpClass(cls):
        """Create a frame with partially duplicated rows and an all-NA column."""
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ]
        )

    def test_pool_duplicate_subsets(self):
        """Resulting shape for default and threshold-adjusted pooling."""
        # Default settings pool the duplicated subset down to three columns.
        pooled_default = pool_duplicate_subsets(self.df_data_subsets)
        self.assertEqual(pooled_default.shape, (6, 3))

        # col_dupl_thresh=1 keeps all six original columns.
        pooled_strict = pool_duplicate_subsets(
            self.df_data_subsets, col_dupl_thresh=1
        )
        self.assertEqual(pooled_strict.shape, (6, 6))

        # subset_thresh=0 pools down to two columns.
        pooled_loose = pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0)
        self.assertEqual(pooled_loose.shape, (6, 2))
278