Test Failed
Push to main (38f5af...23f36e) by Andreas, created 06:49

Test_drop_missing.test_drop_missing(): A

Complexity
    Conditions: 1

Size
    Total Lines: 34
    Code Lines: 30

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
cc      1
eloc    30
nop     1
dl      0
loc     34
rs      9.16
c       0
b       0
f       0
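The raw metrics are reported without expansion; cc is the cyclomatic complexity (1, consistent with test_drop_missing() being a straight-line sequence of asserts with no branches), and loc 34 / eloc 30 are line counts for the method. As a rough local cross-check (a sketch only: the file name test_clean.py is an assumption, and this dashboard does not necessarily compute its numbers with radon), comparable figures can be obtained with the radon library:

from radon.complexity import cc_rank, cc_visit
from radon.raw import analyze

# Assumed file name for the module shown below.
with open("test_clean.py", encoding="utf-8") as f:
    source = f.read()

# Cyclomatic complexity per function/method, with a letter rank like the "A" above.
for block in cc_visit(source):
    print(block.name, block.complexity, cc_rank(block.complexity))

# Raw line counts (loc, lloc, sloc, comments, blank, ...).
print(analyze(source))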
from __future__ import annotations

import io
import sys
import unittest

import numpy as np
import pandas as pd
from klib.clean import clean_column_names
from klib.clean import convert_datatypes
from klib.clean import data_cleaning
from klib.clean import drop_missing
from klib.clean import pool_duplicate_subsets
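# The suite below exercises five public helpers from klib.clean:
# clean_column_names, convert_datatypes, data_cleaning, drop_missing,
# and pool_duplicate_subsets.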
class Test_clean_column_names(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asdäöüß"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-ä-__________!?:;some/(... \n ..))(++$%/name/    -.....": [2, 3, 7],
            },
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
                "< #total@": [2, 6, 4],
                "count >= 10": [6, 3, 2],
            },
        )
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_hash_9",
            "asdaeoeuess",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
            "smaller_hash_total_at",
            "count_larger_equal_10",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                clean_column_names(self.df_clean_column_names).columns[i]
                == expected_results[i]
            )
        for i, _ in enumerate(expected_results):
            assert (
                clean_column_names(self.df_clean_column_names, hints=False).columns[i]
                == expected_results[i]
            )

    def test_clean_column_names_prints(self):
        captured_output = io.StringIO()
        sys.stdout = captured_output
        clean_column_names(self.df_clean_column_names, hints=True)
        sys.stdout = sys.__stdout__
        assert captured_output.getvalue() == (
            "(\"Duplicate column names detected! Columns with index [7, 8] and names ['dupli', 'also'] have been renamed to ['dupli_7', 'also_8'].\", \"Long column names detected (>25 characters). Consider renaming the following columns ['ae_some_plus_plus_dollar_percent_name', 'verylong_column_namesare_hardto_read'].\")\n"
        )
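# As the expected results above show, clean_column_names() transliterates
# umlauts and sharp s ("ä" -> "ae", "ß" -> "ss"), spells out special
# characters ("$" -> "dollar", "+" -> "plus", "#" -> "hash",
# ">=" -> "larger_equal"), splits camelCase names into snake_case, renames
# duplicate columns by appending their positional index ("dupli" -> "dupli_7"),
# and with hints=True prints warnings about duplicates and names longer
# than 25 characters.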
class Test_drop_missing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        assert drop_missing(self.df_data_drop).shape == (4, 4)

        # Drop further columns based on threshold
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape == (4, 3)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_cols=0.5,
            col_exclude=["c1"],
        ).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape == (4, 2)
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0).shape == (0, 0)

        # Drop further rows based on threshold
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape == (3, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape == (3, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape == (2, 4)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.24,
            col_exclude=["c1"],
        ).shape == (2, 5)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.24,
            col_exclude=["c2"],
        ).shape == (2, 4)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.51,
            col_exclude=["c1"],
        ).shape == (3, 5)
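# The assertions above pin down drop_missing(): by default it drops rows and
# columns that are entirely missing (6x5 -> 4x4 here), drop_threshold_cols /
# drop_threshold_rows additionally drop columns or rows whose share of missing
# values exceeds the threshold, and col_exclude shields the named columns from
# being dropped (excluded columns still count toward the row-wise ratios).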
class Test_data_cleaning(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan, 1],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, 1],
                [pd.NA, "b", 6, "d", "e", 1],
                [pd.NA, "b", 7, 8, 9, 1],
                [pd.NA, "c", 3, 4, pd.NA, 1],
                [pd.NA, "d", 7, pd.NA, pd.NA, 1],
            ],
            columns=["c1", "c2", "c3", "c 4", "c5", "c6"],
        )

    def test_data_cleaning(self):
        assert data_cleaning(self.df_data_cleaning, show="all").shape == (5, 4)
        assert data_cleaning(self.df_data_cleaning, show=None).shape == (5, 4)

        assert data_cleaning(self.df_data_cleaning, col_exclude=["c6"]).shape == (5, 5)

        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        expected_results = ["string", "float32", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["O", "O", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["O", "O", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
                == expected_results[i]
            )
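# data_cleaning() chains the steps tested individually in this file: per the
# assertions above it drops the all-missing column c1 and the single-valued
# column c6 (unless col_exclude protects it), reduces the six rows to five,
# and optionally cleans column names ("c 4" keeps its space when
# clean_col_names=False) and converts dtypes (convert_dtypes=True yields
# "string"/"float32" where convert_dtypes=False leaves plain object columns).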
class Test_convert_dtypes(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ],
        )

    def test_convert_dtypes(self):
        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "category",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i]
                == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "object",
            "string",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(self.df_data_convert).dtypes[i] == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    cat_threshold=0.5,
                    cat_exclude=[4],
                ).dtypes[i]
                == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "category",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    category=False,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i]
                == expected_results[i]
            )
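# convert_datatypes() downcasts numerics (the integer column becomes "int8",
# the float column "float32"), converts text columns to the pandas "string"
# dtype, and turns low-cardinality columns into "category" when their ratio
# of unique values is below cat_threshold; cat_exclude skips columns by
# position and category=False disables categorical conversion entirely.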
class Test_pool_duplicate_subsets(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ],
            columns=["c1", "c2", "c3", "c4", "c5", "c6"],
        )

    def test_pool_duplicate_subsets(self):
        assert pool_duplicate_subsets(self.df_data_subsets).shape == (6, 3)
        assert pool_duplicate_subsets(
            self.df_data_subsets,
            col_dupl_thresh=1,
        ).shape == (6, 6)

        assert pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape == (
            6,
            2,
        )

        assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[
            0
        ].shape == (6, 3)
        assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[1] == [
            "c1",
            "c2",
            "c3",
            "c5",
        ]

        assert pool_duplicate_subsets(self.df_data_subsets, exclude=["c1"]).shape == (
            6,
            4,
        )

        assert pool_duplicate_subsets(
            self.df_data_subsets,
            exclude=["c1"],
            return_details=True,
        )[1] == ["c2", "c5", "c6"]
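# pool_duplicate_subsets() condenses a subset of columns containing duplicate
# rows into a single pooled column: by default c1, c2, c3, and c5 are pooled
# here, shrinking the frame from (6, 6) to (6, 3); return_details=True also
# returns the list of pooled columns, and exclude keeps named columns out of
# the pooling.

The file defines plain unittest.TestCase classes and no __main__ guard, so it is meant to be collected by a test runner. A minimal sketch for running it directly, assuming the module above is saved as test_clean.py on the import path:

import unittest

import test_clean  # assumed module name for the file above

# Collect every TestCase defined in the module and run it verbosely.
suite = unittest.defaultTestLoader.loadTestsFromModule(test_clean)
unittest.TextTestRunner(verbosity=2).run(suite)

Equivalently, python -m unittest test_clean -v (or pytest, which also collects unittest classes) discovers the same tests.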