GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 8dc02e...5d400f )
by Andreas
01:18
created

Test_drop_missing.test_drop_missing()   B

Complexity

Conditions 1

Size

Total Lines 53
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 38
nop 1
dl 0
loc 53
rs 8.968
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
import numpy as np
2
import pandas as pd
3
import unittest
4
from ..clean import (
5
    clean_column_names,
6
    data_cleaning,
7
    drop_missing,
8
    convert_datatypes,
9
    pool_duplicate_subsets,
10
)
11
12
13
class Test_clean_column_names(unittest.TestCase):
    """Tests for clean_column_names: sanitized, deduplicated column labels."""

    @classmethod
    def setUpClass(cls) -> None:
        # df1 and df2 intentionally share the "dupli" and "also" labels, so
        # concatenating them yields duplicate columns that must be deduped.
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asd"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
                "-Ä-__some/(... \n ..))++$%/name/   .........": [2, 3, 7],
            }
        )
        cls.df2 = pd.DataFrame(
            {
                "dupli": [3, 2, 1],
                "also": [4, 5, 7],
                "verylongColumnNamesareHardtoRead": [9, 2, 7],
            }
        )
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_number_9",
            "asd",
            "dupli",
            "also",
            "ae_some_plus_plus_dollar_percent_name",
            "dupli_7",
            "also_8",
            "verylong_column_namesare_hardto_read",
        ]
        # Call clean_column_names once per configuration instead of once per
        # column: the result is loop-invariant, so the previous version ran
        # the same transformation 10 times per loop for no benefit.
        for kwargs in ({}, {"hints": False}):
            cleaned = clean_column_names(self.df_clean_column_names, **kwargs)
            for i, expected in enumerate(expected_results):
                with self.subTest(column=i, **kwargs):
                    self.assertEqual(cleaned.columns[i], expected)
59
60
61
class Test_drop_missing(unittest.TestCase):
    """Tests for drop_missing: dropping rows/columns by missing-value thresholds."""

    @classmethod
    def setUpClass(cls):
        # Column c1 is entirely missing and the first two rows are entirely
        # missing, so the defaults already remove one column and two rows.
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        # Default behaviour: drop fully-missing rows and columns.
        self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))

        # (kwargs, expected shape) table replaces twelve near-identical
        # assertion stanzas (this method was flagged as a Long Method);
        # subTest gives each case its own failure report.
        cases = [
            # Drop further columns based on threshold
            ({"drop_threshold_cols": 0.5}, (4, 3)),
            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
            ({"drop_threshold_cols": 0.49}, (4, 2)),
            ({"drop_threshold_cols": 0}, (0, 0)),
            # Drop further rows based on threshold
            ({"drop_threshold_rows": 0.67}, (4, 4)),
            ({"drop_threshold_rows": 0.5}, (4, 4)),
            ({"drop_threshold_rows": 0.49}, (3, 4)),
            ({"drop_threshold_rows": 0.25}, (3, 4)),
            ({"drop_threshold_rows": 0.24}, (2, 4)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
        ]
        for kwargs, expected_shape in cases:
            with self.subTest(**kwargs):
                self.assertEqual(
                    drop_missing(self.df_data_drop, **kwargs).shape, expected_shape
                )
130
131
132
class Test_data_cleaning(unittest.TestCase):
    """Tests for data_cleaning: the combined drop/convert cleaning pipeline."""

    @classmethod
    def setUpClass(cls):
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 will be dropped despite in col_exclude because it is single valued
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        # Run the pipeline once and compare dtypes column by column; the
        # previous version re-ran the full cleaning for every column checked.
        dtypes = data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes
        for i, expected in enumerate(expected_results):
            with self.subTest(column=i):
                self.assertEqual(dtypes[i], expected)
160
161
162
class Test_convert_dtypes(unittest.TestCase):
    """Tests for convert_datatypes under various category-conversion settings."""

    @classmethod
    def setUpClass(cls):
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def _assert_dtypes(self, expected_results, **kwargs):
        # Convert once per configuration and check all columns; the previous
        # version re-ran convert_datatypes for every single column.
        dtypes = convert_datatypes(self.df_data_convert, **kwargs).dtypes
        for i, expected in enumerate(expected_results):
            with self.subTest(column=i, **kwargs):
                self.assertEqual(dtypes[i], expected)

    def test_convert_dtypes(self):
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "category", "category"],
            cat_threshold=0.4,
        )
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "string"],
        )
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "category"],
            cat_threshold=0.5,
            cat_exclude=[4],
        )
        self._assert_dtypes(
            ["int8", "float32", "string", "category", "object", "category"],
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )
        self._assert_dtypes(
            ["int8", "float32", "string", "string", "object", "string"],
            category=False,
            cat_threshold=0.95,
            cat_exclude=[2, 4],
        )
233
234
235
class Test_pool_duplicate_subsets(unittest.TestCase):
    """Tests for pool_duplicate_subsets on a frame with duplicated column subsets."""

    @classmethod
    def setUpClass(cls):
        rows = [
            [1, 7, "d", "x", pd.NA, "v"],
            [1, 8, "d", "e", pd.NA, "v"],
            [2, 7, "g", "z", pd.NA, "v"],
            [1, 7, "u", "f", pd.NA, "p"],
            [1, 7, "u", "z", pd.NA, "p"],
            [2, 7, "g", "z", pd.NA, "p"],
        ]
        cls.df_data_subsets = pd.DataFrame(rows)

    def test_pool_duplicate_subsets(self):
        # Default parameters: six rows remain, pooled down to three columns.
        pooled_default = pool_duplicate_subsets(self.df_data_subsets)
        self.assertEqual(pooled_default.shape, (6, 3))

        # With col_dupl_thresh=1 the output keeps all six columns.
        pooled_strict = pool_duplicate_subsets(
            self.df_data_subsets, col_dupl_thresh=1
        )
        self.assertEqual(pooled_strict.shape, (6, 6))

        # With subset_thresh=0 the output is pooled down to two columns.
        pooled_loose = pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0)
        self.assertEqual(pooled_loose.shape, (6, 2))
258