Passed: Push to master (c20d75...406f67) by Andreas, created 01:47

Test_drop_missing.test_drop_missing()   (rated B)

Complexity
    Conditions: 1

Size
    Total Lines: 53
    Code Lines: 38

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
cc        1
eloc      38
nop       1
dl        0
loc       53
rs        8.968
c         0
b         0
f         0
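
The tool that produced the table above is not named in the report. As a rough local cross-check, assuming radon is an acceptable stand-in, cyclomatic complexity and raw line counts comparable to the cc, loc, and eloc values can be computed like this (the file path is illustrative):

# Hypothetical local cross-check of the reported metrics using radon
# (an assumption; the report does not name its metrics tool).
from radon.complexity import cc_visit
from radon.raw import analyze

with open("tests/test_clean.py") as f:  # illustrative path to the test module
    source = f.read()

# Raw counts: radon's loc/sloc are comparable to the "loc" / "eloc" figures.
print(analyze(source))

# Cyclomatic complexity per function and method; test_drop_missing has no
# branches, which is consistent with the reported cc of 1.
for block in cc_visit(source):
    print(block.name, block.complexity)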

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Conversely, when a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method, sketched below.
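
To illustrate Extract Method on a small, self-contained example (the function names below are made up for illustration and do not come from this repository): the commented step inside a method becomes a helper whose name is taken from the comment.

# Before: one method with a comment-separated step.
def summarize(values):
    # remove missing entries
    cleaned = [v for v in values if v is not None]
    return sum(cleaned) / len(cleaned) if cleaned else 0.0


# After: the commented step is extracted into a helper named after the comment.
def remove_missing_entries(values):
    return [v for v in values if v is not None]


def summarize_refactored(values):
    cleaned = remove_missing_entries(values)
    return sum(cleaned) / len(cleaned) if cleaned else 0.0

Applied to a test method, the same idea usually means moving comment-separated assertion groups into helpers or separate tests, as sketched after the listing below.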

import numpy as np
import pandas as pd
import unittest
from ..clean import (
    data_cleaning,
    drop_missing,
    convert_datatypes,
    pool_duplicate_subsets,
)


class Test_drop_missing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))

        # Drop further columns based on threshold
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3)
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]
            ).shape,
            (4, 4),
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0)
        )

        # Drop further rows based on threshold
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4)
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]
            ).shape,
            (2, 5),
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]
            ).shape,
            (2, 4),
        )
        self.assertEqual(
            drop_missing(
                self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]
            ).shape,
            (3, 5),
        )


class Test_data_cleaning(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 will be dropped despite being in col_exclude because it is single valued
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i],
                expected_results[i],
            )


class Test_convert_dtypes(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def test_convert_dtypes(self):
        expected_results = [
            "int8",
            "float32",
            "string",
            "string",
            "category",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i],
                expected_results[i],
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i]
            )

        expected_results = ["int8", "float32", "string", "string", "object", "category"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]
                ).dtypes[i],
                expected_results[i],
            )

        expected_results = [
            "int8",
            "float32",
            "string",
            "category",
            "object",
            "category",
        ]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]
                ).dtypes[i],
                expected_results[i],
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert,
                    category=False,
                    cat_threshold=0.95,
                    cat_exclude=[2, 4],
                ).dtypes[i],
                expected_results[i],
            )


class Test_pool_duplicate_subsets(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ]
        )

    def test_pool_duplicate_subsets(self):
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
        self.assertEqual(
            pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape,
            (6, 6),
        )
        self.assertEqual(
            pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2)
        )
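
Following up on the refactoring note above: since test_drop_missing is the method flagged as a Long Method, one option is to split its comment-separated sections into separately named tests. This is only a sketch; the class and method names are hypothetical, while the fixture and expected shapes are copied from the listing above.

# Hypothetical split of the flagged test_drop_missing into smaller tests.
# Fixture and expected shapes are taken from Test_drop_missing above.
class Test_drop_missing_split(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_defaults(self):
        # With defaults, the all-missing rows and the all-missing column are removed.
        self.assertEqual(drop_missing(self.df).shape, (4, 4))

    def test_column_threshold(self):
        # Columns are additionally dropped based on drop_threshold_cols.
        self.assertEqual(drop_missing(self.df, drop_threshold_cols=0.5).shape, (4, 3))
        self.assertEqual(drop_missing(self.df, drop_threshold_cols=0.49).shape, (4, 2))

    def test_row_threshold(self):
        # Rows are additionally dropped based on drop_threshold_rows.
        self.assertEqual(drop_missing(self.df, drop_threshold_rows=0.5).shape, (4, 4))
        self.assertEqual(drop_missing(self.df, drop_threshold_rows=0.49).shape, (3, 4))

Whether to reorganize the suite this way is a style decision; the assertions exercised stay the same, they are just spread over smaller, individually named methods.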