GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Passed
Push — master ( 922334...81fced )
by Andreas
01:16
created

Test_data_cleaning.setUpClass()   A

Complexity

Conditions 1

Size

Total Lines 12
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 11
nop 1
dl 0
loc 12
rs 9.85
c 0
b 0
f 0
1
import numpy as np
2
import pandas as pd
3
import unittest
4
from ..clean import data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets
5
6
7
class Test_drop_missing(unittest.TestCase):
8
    @classmethod
9
    def setUpClass(cls):
10
        cls.df_data_drop = pd.DataFrame(
11
            [
12
                [np.nan, np.nan, np.nan, np.nan, np.nan],
13
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
14
                [pd.NA, "b", "c", "d", "e"],
15
                [pd.NA, 6, 7, 8, 9],
16
                [pd.NA, 2, 3, 4, pd.NA],
17
                [pd.NA, 6, 7, pd.NA, pd.NA],
18
            ],
19
            columns=["c1", "c2", "c3", "c4", "c5"],
20
        )
21
22
    def test_drop_missing(self):
23
        self.assertEqual(drop_missing(self.df_data_drop).shape, (4, 4))
24
25
        # Drop further columns based on threshold
26
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.5).shape, (4, 3))
27
        self.assertEqual(
28
            drop_missing(self.df_data_drop, drop_threshold_cols=0.5, col_exclude=["c1"]).shape, (4, 4)
29
        )
30
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0.49).shape, (4, 2))
31
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_cols=0).shape, (0, 0))
32
33
        # Drop further rows based on threshold
34
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.67).shape, (4, 4))
35
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.5).shape, (4, 4))
36
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.49).shape, (3, 4))
37
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.25).shape, (3, 4))
38
        self.assertEqual(drop_missing(self.df_data_drop, drop_threshold_rows=0.24).shape, (2, 4))
39
        self.assertEqual(
40
            drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c1"]).shape, (2, 5)
41
        )
42
        self.assertEqual(
43
            drop_missing(self.df_data_drop, drop_threshold_rows=0.24, col_exclude=["c2"]).shape, (2, 4)
44
        )
45
        self.assertEqual(
46
            drop_missing(self.df_data_drop, drop_threshold_rows=0.51, col_exclude=["c1"]).shape, (3, 5)
47
        )
48
49
50
class Test_data_cleaning(unittest.TestCase):
51
    @classmethod
52
    def setUpClass(cls):
53
        cls.df_data_cleaning = pd.DataFrame(
54
            [
55
                [np.nan, np.nan, np.nan, np.nan, np.nan],
56
                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
57
                [pd.NA, "b", "c", "d", "e"],
58
                [pd.NA, 6, 7, 8, 9],
59
                [pd.NA, 2, 3, 4, pd.NA],
60
                [pd.NA, 6, 7, pd.NA, pd.NA],
61
            ],
62
            columns=["c1", "c2", "c3", "c4", "c5"],
63
        )
64
65
    def test_data_cleaning(self):
66
        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
67
68
69
class Test_convert_dtypes(unittest.TestCase):
70
    @classmethod
71
    def setUpClass(cls):
72
        cls.df_data_convert = pd.DataFrame(
73
            [
74
                [1, 7.0, "y", "x", pd.NA, "v"],
75
                [3, 8.0, "d", "e", pd.NA, "v"],
76
                [5, 7.0, "o", "z", pd.NA, "v"],
77
                [1, 7.0, "u", "f", pd.NA, "p"],
78
                [1, 7.0, "u", "f", pd.NA, "p"],
79
                [2, 7.0, "g", "a", pd.NA, "p"],
80
            ]
81
        )
82
83
    def test_convert_dtypes(self):
84
        expected_results = ["Int8", "Float32", "string", "string", "category", "category"]
85
        for i, _ in enumerate(expected_results):
86
            self.assertEqual(
87
                convert_datatypes(self.df_data_convert, cat_threshold=0.4).dtypes[i], expected_results[i]
88
            )
89
90
        expected_results = ["Int8", "Float32", "string", "string", "object", "string"]
91
        for i, _ in enumerate(expected_results):
92
            self.assertEqual(convert_datatypes(self.df_data_convert).dtypes[i], expected_results[i])
93
94
        expected_results = ["Int8", "Float32", "string", "string", "object", "category"]
95
        for i, _ in enumerate(expected_results):
96
            self.assertEqual(
97
                convert_datatypes(self.df_data_convert, cat_threshold=0.5, cat_exclude=[4]).dtypes[i],
98
                expected_results[i],
99
            )
100
101
        expected_results = ["Int8", "Float32", "string", "category", "object", "category"]
102
        for i, _ in enumerate(expected_results):
103
            self.assertEqual(
104
                convert_datatypes(self.df_data_convert, cat_threshold=0.95, cat_exclude=[2, 4]).dtypes[i],
105
                expected_results[i],
106
            )
107
108
        expected_results = ["Int8", "Float32", "string", "string", "object", "string"]
109
        for i, _ in enumerate(expected_results):
110
            self.assertEqual(
111
                convert_datatypes(
112
                    self.df_data_convert, category=False, cat_threshold=0.95, cat_exclude=[2, 4]
113
                ).dtypes[i],
114
                expected_results[i],
115
            )
116
117
118
class Test_pool_duplicate_subsets(unittest.TestCase):
119
    @classmethod
120
    def setUpClass(cls):
121
        cls.df_data_subsets = pd.DataFrame(
122
            [
123
                [1, 7, "d", "x", pd.NA, "v"],
124
                [1, 8, "d", "e", pd.NA, "v"],
125
                [2, 7, "g", "z", pd.NA, "v"],
126
                [1, 7, "u", "f", pd.NA, "p"],
127
                [1, 7, "u", "z", pd.NA, "p"],
128
                [2, 7, "g", "z", pd.NA, "p"],
129
            ]
130
        )
131
132
    def test_pool_duplicate_subsets(self):
133
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets).shape, (6, 3))
134
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, col_dupl_thresh=1).shape, (6, 6))
135
        self.assertEqual(pool_duplicate_subsets(self.df_data_subsets, subset_thresh=0).shape, (6, 2))
136