| 1 |  |  | import numpy as np | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | import pandas as pd | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import unittest | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | from ..clean import clean_column_names, data_cleaning, drop_missing, convert_datatypes, pool_duplicate_subsets | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class Test_clean_column_names(unittest.TestCase):
                    """Check ``clean_column_names`` on labels with symbols, mixed case and duplicates."""

                    @classmethod
                    def setUpClass(cls) -> None:
                        """Build the shared fixture by placing two frames side by side."""
                        cls.df1 = pd.DataFrame(
                            {
                                "Asd 5$ & (3€)": [1, 2, 3],
                                "3+3": [2, 3, 4],
                                "AsdFer #9": [3, 4, 5],
                                '"asd"': [5, 6, 7],
                                "dupli": [5, 6, 8],
                                "also": [9, 2, 7],
                            }
                        )
                        cls.df2 = pd.DataFrame(
                            {"dupli": [3, 2, 1], "also": [4, 5, 7], "verylongColumnNamesareHardtoRead": [9, 2, 7]}
                        )
                        # Concatenating along axis=1 keeps "dupli" and "also" from both
                        # frames, so the combined frame carries duplicate column labels.
                        cls.df_clean_column_names = pd.concat([cls.df1, cls.df2], axis=1)

                    def test_clean_column_names(self):
                        """The cleaned labels equal the expected names, with default and disabled ``hints``."""
                        expected_results = [
                            "asd_5_dollar_and_3_euro",
                            "3_plus_3",
                            "asd_fer_number_9",
                            "asd",
                            "dupli",
                            "also",
                            "dupli_6",
                            "also_7",
                            "verylong_column_namesare_hardto_read",
                        ]

                        # Run the function once per variant rather than once per column, and
                        # compare the complete label list so that a missing or surplus
                        # column fails the test too.
                        default_run = clean_column_names(self.df_clean_column_names)
                        self.assertEqual(list(default_run.columns), expected_results)

                        no_hints_run = clean_column_names(self.df_clean_column_names, hints=False)
                        self.assertEqual(list(no_hints_run.columns), expected_results)
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class Test_drop_missing(unittest.TestCase):
                    """Shape checks for ``drop_missing`` under various column/row thresholds."""

                    @classmethod
                    def setUpClass(cls):
                        """Create a 6x5 frame mixing fully empty rows, an empty column and partial gaps."""
                        rows = [
                            [np.nan, np.nan, np.nan, np.nan, np.nan],
                            [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                            [pd.NA, "b", "c", "d", "e"],
                            [pd.NA, 6, 7, 8, 9],
                            [pd.NA, 2, 3, 4, pd.NA],
                            [pd.NA, 6, 7, pd.NA, pd.NA],
                        ]
                        cls.df_data_drop = pd.DataFrame(rows, columns=["c1", "c2", "c3", "c4", "c5"])

                    def test_drop_missing(self):
                        """Each keyword combination yields the listed ``(rows, columns)`` shape."""
                        scenarios = [
                            # Default arguments
                            ({}, (4, 4)),
                            # Drop further columns based on threshold
                            ({"drop_threshold_cols": 0.5}, (4, 3)),
                            ({"drop_threshold_cols": 0.5, "col_exclude": ["c1"]}, (4, 4)),
                            ({"drop_threshold_cols": 0.49}, (4, 2)),
                            ({"drop_threshold_cols": 0}, (0, 0)),
                            # Drop further rows based on threshold
                            ({"drop_threshold_rows": 0.67}, (4, 4)),
                            ({"drop_threshold_rows": 0.5}, (4, 4)),
                            ({"drop_threshold_rows": 0.49}, (3, 4)),
                            ({"drop_threshold_rows": 0.25}, (3, 4)),
                            ({"drop_threshold_rows": 0.24}, (2, 4)),
                            ({"drop_threshold_rows": 0.24, "col_exclude": ["c1"]}, (2, 5)),
                            ({"drop_threshold_rows": 0.24, "col_exclude": ["c2"]}, (2, 4)),
                            ({"drop_threshold_rows": 0.51, "col_exclude": ["c1"]}, (3, 5)),
                        ]
                        for options, expected_shape in scenarios:
                            result = drop_missing(self.df_data_drop, **options)
                            self.assertEqual(result.shape, expected_shape)
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                class Test_data_cleaning(unittest.TestCase):
                    """Shape and dtype checks for ``data_cleaning``."""

                    @classmethod
                    def setUpClass(cls):
                        """Create a 6x5 frame with two fully empty rows and an entirely missing ``c1`` column."""
                        cls.df_data_cleaning = pd.DataFrame(
                            [
                                [np.nan, np.nan, np.nan, np.nan, np.nan],
                                [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
                                [pd.NA, "b", 6, "d", "e"],
                                [pd.NA, "b", 7, 8, 9],
                                [pd.NA, "c", 3, 4, pd.NA],
                                [pd.NA, "d", 7, pd.NA, pd.NA],
                            ],
                            columns=["c1", "c2", "c3", "c4", "c5"],
                        )

                    def test_data_cleaning(self):
                        """The cleaned frame has shape (4, 4) and the expected dtypes after conversion."""
                        self.assertEqual(data_cleaning(self.df_data_cleaning).shape, (4, 4))
                        # c1 will be dropped despite being in col_exclude because it is single valued
                        self.assertEqual(
                            data_cleaning(self.df_data_cleaning, col_exclude=["c1"]).shape, (4, 4)
                        )

                        expected_results = ["string", "int8", "O", "O"]
                        # Clean once and reuse the result; the call does not depend on the loop.
                        converted = data_cleaning(self.df_data_cleaning, convert_dtypes=True)
                        for position, expected_dtype in enumerate(expected_results):
                            # ``dtypes`` is indexed by column name, so positional access
                            # must go through ``.iloc``; a bare integer key relies on a
                            # deprecated fallback of ``Series.__getitem__``.
                            self.assertEqual(converted.dtypes.iloc[position], expected_dtype)
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class Test_convert_dtypes(unittest.TestCase):
                    """Check the dtypes ``convert_datatypes`` assigns to a small mixed frame."""

                    @classmethod
                    def setUpClass(cls):
                        # Six rows by six unlabeled columns: integers, floats, two text
                        # columns with five distinct values each, a column holding only
                        # pd.NA, and a text column with just two distinct values.
                        cls.df_data_convert = pd.DataFrame(
                            [
                                [1, 7.0, "y", "x", pd.NA, "v"],
                                [3, 8.0, "d", "e", pd.NA, "v"],
                                [5, 7.0, "o", "z", pd.NA, "v"],
                                [1, 7.0, "u", "f", pd.NA, "p"],
                                [1, 7.0, "u", "f", pd.NA, "p"],
                                [2, 7.0, "g", "a", pd.NA, "p"],
                            ]
                        )

                    def _assert_converted_dtypes(self, expected_dtypes, **kwargs):
                        """Convert the fixture once with ``kwargs`` and compare dtypes by position.

                        Args:
                            expected_dtypes: dtype names expected for the leading columns,
                                in column order.
                            **kwargs: keyword arguments forwarded to ``convert_datatypes``.
                        """
                        # One conversion per scenario; the dtypes Series is reused for
                        # every column instead of converting the frame again each time.
                        converted_dtypes = convert_datatypes(self.df_data_convert, **kwargs).dtypes
                        for position, expected in enumerate(expected_dtypes):
                            # subTest reports every mismatching column, not only the first.
                            with self.subTest(kwargs=kwargs, column=position):
                                self.assertEqual(converted_dtypes.iloc[position], expected)

                    def test_convert_dtypes(self):
                        """Run each keyword combination against its expected dtypes."""
                        # With cat_threshold=0.4 the last two columns are expected as category.
                        self._assert_converted_dtypes(
                            ["int8", "float32", "string", "string", "category", "category"],
                            cat_threshold=0.4,
                        )

                        # Default arguments: no column is expected to become a category.
                        self._assert_converted_dtypes(
                            ["int8", "float32", "string", "string", "object", "string"],
                        )

                        # Column 4 is excluded from category conversion and stays object.
                        self._assert_converted_dtypes(
                            ["int8", "float32", "string", "string", "object", "category"],
                            cat_threshold=0.5,
                            cat_exclude=[4],
                        )

                        # Columns 2 and 4 are excluded; columns 3 and 5 are expected as category.
                        self._assert_converted_dtypes(
                            ["int8", "float32", "string", "category", "object", "category"],
                            cat_threshold=0.95,
                            cat_exclude=[2, 4],
                        )

                        # category=False: same expectation as the default call, whatever
                        # threshold and exclusions are passed.
                        self._assert_converted_dtypes(
                            ["int8", "float32", "string", "string", "object", "string"],
                            category=False,
                            cat_threshold=0.95,
                            cat_exclude=[2, 4],
                        )
            
                                                                                                            
                            
            
                                    
            
            
                | 165 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class Test_pool_duplicate_subsets(unittest.TestCase):
                    """Shape checks for ``pool_duplicate_subsets`` on a six-row fixture."""

                    @classmethod
                    def setUpClass(cls):
                        # Six rows by six unlabeled columns; column 4 holds only pd.NA.
                        rows = [
                            [1, 7, "d", "x", pd.NA, "v"],
                            [1, 8, "d", "e", pd.NA, "v"],
                            [2, 7, "g", "z", pd.NA, "v"],
                            [1, 7, "u", "f", pd.NA, "p"],
                            [1, 7, "u", "z", pd.NA, "p"],
                            [2, 7, "g", "z", pd.NA, "p"],
                        ]
                        cls.df_data_subsets = pd.DataFrame(rows)

                    def test_pool_duplicate_subsets(self):
                        """Compare the pooled frame's shape for three keyword combinations."""
                        # Each case pairs the keyword arguments with the expected shape.
                        cases = (
                            ({}, (6, 3)),
                            ({"col_dupl_thresh": 1}, (6, 6)),
                            ({"subset_thresh": 0}, (6, 2)),
                        )
                        for options, expected_shape in cases:
                            pooled = pool_duplicate_subsets(self.df_data_subsets, **options)
                            self.assertEqual(pooled.shape, expected_shape)
            
                                                        
            
                                    
            
            
                | 185 |  |  |  |