Test_pool_duplicate_subsets.setUpClass() (rated A)

Complexity
    Conditions: 1

Size
    Total Lines: 12
    Code Lines: 11

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        1
eloc      11
nop       1
dl        0
loc       12
rs        9.85
c         0
b         0
f         0
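The per-function figures above come from the analysis platform, but comparable complexity and size numbers can be reproduced locally. Below is a minimal sketch using the radon package; this is an assumption about tooling, since the platform's rs, c, b, and f metrics are not standard radon output and may be defined differently:

    # Sketch: recompute per-function cyclomatic complexity and raw size
    # metrics with radon (pip install radon). The file path is hypothetical.
    from radon.complexity import cc_visit
    from radon.raw import analyze

    with open("tests/test_clean.py", encoding="utf-8") as fh:
        source = fh.read()

    # Per-function cyclomatic complexity (comparable to "cc" / "Conditions").
    for block in cc_visit(source):
        print(f"{block.name}: cc={block.complexity}")

    # Whole-file size metrics (loc is comparable to "Total Lines").
    raw = analyze(source)
    print(f"loc={raw.loc} lloc={raw.lloc} sloc={raw.sloc}")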
from __future__ import annotations

import io
import sys
import unittest

import numpy as np
import pandas as pd

from klib.clean import clean_column_names
from klib.clean import convert_datatypes
from klib.clean import data_cleaning
from klib.clean import drop_missing
from klib.clean import pool_duplicate_subsets


class Test_clean_column_names(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asdäöüß"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-ä-__________!?:;some/(... \n ..))(++$%/name/    -.....": [2, 3, 7],
            },
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
                "< #total@": [2, 6, 4],
                "count >= 10": [6, 3, 2],
            },
        )
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self) -> None:
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_hash_9",
            "asdaeoeuess",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
            "smaller_hash_total_at",
            "count_larger_equal_10",
        ]
        for i, _ in enumerate(expected_results):
            assert clean_column_names(self.df_clean_column_names).columns[i] == expected_results[i]
        for i, _ in enumerate(expected_results):
            assert (
                clean_column_names(self.df_clean_column_names, hints=False).columns[i]
                == expected_results[i]
            )

    def test_clean_column_names_prints(self) -> None:
        captured_output = io.StringIO()
        sys.stdout = captured_output
        clean_column_names(self.df_clean_column_names, hints=True)
        sys.stdout = sys.__stdout__
        assert captured_output.getvalue() == (
            "(\"Duplicate column names detected! Columns with index [7, 8] and names ['dupli', 'also'] have been renamed to ['dupli_7', 'also_8'].\", \"Long column names detected (>25 characters). Consider renaming the following columns ['ae_some_plus_plus_dollar_percent_name', 'verylong_column_namesare_hardto_read'].\")\n"
        )


class Test_drop_missing(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self) -> None:
        assert drop_missing(self.df_data_drop).shape == (4, 4)

        # Drop further columns based on threshold
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape == (4, 3)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_cols=0.5,
            col_exclude=["c1"],
        ).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape == (4, 2)
        assert drop_missing(self.df_data_drop, drop_threshold_cols=0).shape == (0, 0)

        # Drop further rows based on threshold
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape == (4, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape == (3, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape == (3, 4)
        assert drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape == (2, 4)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.24,
            col_exclude=["c1"],
        ).shape == (2, 5)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.24,
            col_exclude=["c2"],
        ).shape == (2, 4)
        assert drop_missing(
            self.df_data_drop,
            drop_threshold_rows=0.51,
            col_exclude=["c1"],
        ).shape == (3, 5)


class Test_data_cleaning(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan, 1],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, 1],
                [pd.NA, "b", 6, "d", "e", 1],
                [pd.NA, "b", 7, 8, 9, 1],
                [pd.NA, "c", 3, 4, pd.NA, 1],
                [pd.NA, "d", 7, pd.NA, pd.NA, 1],
            ],
            columns=["c1", "c2", "c3", "c 4", "c5", "c6"],
        )

    def test_data_cleaning(self) -> None:
        assert data_cleaning(self.df_data_cleaning, show="all").shape == (5, 4)
        assert data_cleaning(self.df_data_cleaning, show=None).shape == (5, 4)

        assert data_cleaning(self.df_data_cleaning, col_exclude=["c6"]).shape == (5, 5)

        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        assert data_cleaning(
            self.df_data_cleaning,
            show="changes",
            clean_col_names=False,
            drop_duplicates=False,
        ).columns.tolist() == ["c2", "c3", "c 4", "c5"]

        expected_results = ["string", "float32", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["O", "O", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["O", "O", "O", "O"]
        for i, _ in enumerate(expected_results):
            assert (
                data_cleaning(self.df_data_cleaning, convert_dtypes=False).dtypes[i]
                == expected_results[i]
            )


class Test_convert_dtypes(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ],
        )

    def test_convert_dtypes(self) -> None:
        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "category",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i]
                == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "object",
            "string",
        ]
        for i, _ in enumerate(expected_results):
            assert convert_datatypes(self.df_data_convert).dtypes[i] == expected_results[i]

        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    cat_threshold=0.5,
                    cat_exclude=[4],
                ).dtypes[i]
                == expected_results[i]
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "category",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i]
                == expected_results[i]
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            assert (
                convert_datatypes(
                    self.df_data_convert,
                    category=False,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i]
                == expected_results[i]
            )


class Test_pool_duplicate_subsets(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ],
            columns=["c1", "c2", "c3", "c4", "c5", "c6"],
        )

    def test_pool_duplicate_subsets(self) -> None:
        assert pool_duplicate_subsets(self.df_data_subsets).shape == (6, 3)
        assert pool_duplicate_subsets(
            self.df_data_subsets,
            col_dupl_thresh=1,
        ).shape == (6, 6)

        assert pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape == (
            6,
            2,
        )

        assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[0].shape == (6, 3)
        assert pool_duplicate_subsets(self.df_data_subsets, return_details=True)[1] == [
            "c1",
            "c2",
            "c3",
            "c5",
        ]

        assert pool_duplicate_subsets(self.df_data_subsets, exclude=["c1"]).shape == (
            6,
            4,
        )

        assert pool_duplicate_subsets(
            self.df_data_subsets,
            exclude=["c1"],
            return_details=True,
        )[1] == ["c2", "c5", "c6"]
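The module defines no __main__ guard, so it is intended to be executed by an external test runner. A minimal sketch of running it programmatically, assuming the file is importable as tests.test_clean (the dotted path is an assumption about where this file lives in the checkout):

    # Sketch: programmatic equivalent of `python -m unittest tests.test_clean -v`.
    # Assumes klib, numpy, and pandas are installed in the active environment.
    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName("tests.test_clean")
    unittest.TextTestRunner(verbosity=2).run(suite)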