Passed: push to master (bb0913...522ac3) by Andreas, created 01:07

Test_clean_column_names.setUpClass()   A

Complexity

    Conditions      1

Size

    Total Lines     16
    Code Lines      13

Duplication

    Lines           0
    Ratio           0 %

Importance

    Changes         0

Metric    Value
cc        1
eloc      13
nop       1
dl        0
loc       16
rs        9.75
c         0
b         0
f         0
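The report does not name the analyzer that produced these figures, and most of the abbreviations (nop, dl, rs, c, b, f) are tool-specific; cc matches the "Conditions" count, loc the "Total Lines", and eloc the "Code Lines" shown above. As a rough, hypothetical cross-check, the radon package can compute comparable cyclomatic-complexity and raw line counts for the file; the path below is a placeholder for wherever this test module lives in the repository:

from pathlib import Path

from radon.complexity import cc_visit  # cyclomatic complexity per function/method
from radon.raw import analyze          # raw whole-file counts: loc, lloc, blank, ...

source = Path("test_clean.py").read_text()  # placeholder path to the module listed below

raw = analyze(source)
print("total lines:", raw.loc, "code lines:", raw.lloc, "blank lines:", raw.blank)

for block in cc_visit(source):
    # e.g. setUpClass reports a complexity of 1 when the method contains no branching
    print(block.name, block.complexity)

A cyclomatic complexity of 1 simply means the method has no conditional branches, which is expected for a setUpClass that only builds fixture DataFrames. Note that radon's raw counts are per file, while the report's Size figures refer to the single method.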
import numpy as np
import pandas as pd
import unittest
from ..clean import clean_column_names, data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets


class Test_clean_column_names(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        cls.df1 = pd.DataFrame(
            {
                "Asd 5$ & (3€)": [1, 2, 3],
                "3+3": [2, 3, 4],
                "AsdFer #9": [3, 4, 5],
                '"asd"': [5, 6, 7],
                "dupli": [5, 6, 8],
                "also": [9, 2, 7],
            }
        )
        cls.df2 = pd.DataFrame(
            {"dupli": [3, 2, 1], "also": [4, 5, 7], "verylongColumnNamesareHardtoRead": [9, 2, 7]}
        )
        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

    def test_clean_column_names(self):
        expected_results = [
            "asd_5_dollar_and_3_euro",
            "3_plus_3",
            "asd_fer_number_9",
            "asd",
            "dupli",
            "also",
            "dupli_6",
            "also_7",
            "verylong_column_namesare_hardto_read",
        ]
        for i, _ in enumerate(expected_results):
            self.assertEqual(clean_column_names(self.df_clean_column_names).columns[i], expected_results[i])
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                clean_column_names(self.df_clean_column_names, hints=False).columns[i], expected_results[i]
            )

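# Note on the expected results above (inferred from the fixtures, not from the
# implementation): clean_column_names appears to spell out special characters
# ($ -> "dollar", & -> "and", € -> "euro", # -> "number", + -> "plus"), convert
# CamelCase names to snake_case, and disambiguate duplicate column names by
# appending the column's position ("dupli" -> "dupli_6", "also" -> "also_7").
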
class Test_drop_missing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_drop = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", "c", "d", "e"],
                [pd.NA, 6, 7, 8, 9],
                [pd.NA, 2, 3, 4, pd.NA],
                [pd.NA, 6, 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_drop_missing(self):
        self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))

        # Drop further columns based on threshold
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3))
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]).shape, (4, 4)
        )
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2))
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0))

        # Drop further rows based on threshold
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4))
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4))
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4))
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4))
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4))
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]).shape, (2, 5)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]).shape, (2, 4)
        )
        self.assertEqual(
            drop_missing(self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]).shape, (3, 5)
        )

class Test_data_cleaning(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_cleaning = pd.DataFrame(
            [
                [np.nan, np.nan, np.nan, np.nan, np.nan],
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                [pd.NA, "b", 6, "d", "e"],
                [pd.NA, "b", 7, 8, 9],
                [pd.NA, "c", 3, 4, pd.NA],
                [pd.NA, "d", 7, pd.NA, pd.NA],
            ],
            columns=["c1", "c2", "c3", "c4", "c5"],
        )

    def test_data_cleaning(self):
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
        # c1 is dropped even though it is listed in col_exclude, because it is single-valued
        self.assertEqual(
            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
        )

        expected_results = ["string", "int8", "O", "O"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                data_cleaning(self.df_data_cleaning, convert_dtypes=True).dtypes[i],
                expected_results[i],
            )

class Test_convert_dtypes(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_convert = pd.DataFrame(
            [
                [1, 7.0, "y", "x", pd.NA, "v"],
                [3, 8.0, "d", "e", pd.NA, "v"],
                [5, 7.0, "o", "z", pd.NA, "v"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [1, 7.0, "u", "f", pd.NA, "p"],
                [2, 7.0, "g", "a", pd.NA, "p"],
            ]
        )

    def test_convert_dtypes(self):
        expected_results = ["int8", "float32", "string", "string", "category", "category"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i], expected_results[i]
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i])

        expected_results = ["int8", "float32", "string", "string", "object", "category"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]).dtypes[i],
                expected_results[i],
            )

        expected_results = ["int8", "float32", "string", "category", "object", "category"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]).dtypes[i],
                expected_results[i],
            )

        expected_results = ["int8", "float32", "string", "string", "object", "string"]
        for i, _ in enumerate(expected_results):
            self.assertEqual(
                convert_datatypes(
                    self.df_data_convert, category=False, cat_threshold=0.95, cat_exclude=[2, 4]
                ).dtypes[i],
                expected_results[i],
            )

class Test_pool_duplicate_subsets(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.df_data_subsets = pd.DataFrame(
            [
                [1, 7, "d", "x", pd.NA, "v"],
                [1, 8, "d", "e", pd.NA, "v"],
                [2, 7, "g", "z", pd.NA, "v"],
                [1, 7, "u", "f", pd.NA, "p"],
                [1, 7, "u", "z", pd.NA, "p"],
                [2, 7, "g", "z", pd.NA, "p"],
            ]
        )

    def test_pool_duplicate_subsets(self):
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape, (6, 6))
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2))
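
Because of the relative import "from ..clean import ...", this file cannot be executed directly as a script; it has to run in package context, for example via python -m pytest or python -m unittest discover from the repository root. A minimal programmatic equivalent is sketched below; the dotted module path is a placeholder for wherever this file sits in the package layout:

import unittest

# Placeholder dotted path; adjust to the actual package, e.g. "mypackage.tests.test_clean".
suite = unittest.defaultTestLoader.loadTestsFromName("mypackage.tests.test_clean")
unittest.TextTestRunner(verbosity=2).run(suite)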
185