| 1 |  |  | """Defines backend-dependent (e.g. using pandas as the backend library) functions that will serve as engine commands. | 
            
                                                        
            
                                    
            
            
                | 2 |  |  | These commands should be "built" using a suitable function/decorator. | 
            
                                                        
            
                                    
            
            
                | 3 |  |  | These commands should be able to be defined at runtime, as part of client code (with respect to this library). | 
            
                                                        
            
                                    
            
            
                | 4 |  |  | """ | 
            
                                                        
            
                                    
            
            
                | 5 |  |  | from functools import reduce | 
            
                                                        
            
                                    
            
            
                | 6 |  |  | import numpy as np | 
            
                                                        
            
                                    
            
            
                | 7 |  |  | import pandas as pd | 
            
                                                        
            
                                    
            
            
                | 8 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 9 |  |  | from so_magic.data.encoding import NominalAttributeEncoder | 
            
                                                        
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                        
            
                                    
            
            
# Public names exported by this module: the two command collections assigned at module level.
__all__ = ['data_manager_commands', 'arbitrary_commands']
            
                                                        
            
                                    
            
            
                | 13 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 14 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 15 |  |  | # CMD 1 | 
            
                                                        
            
                                    
            
            
                | 16 |  |  | def observations_command(file_path): | 
            
                                                        
            
                                    
            
            
                | 17 |  |  |     return pd.read_json(file_path, lines=True) | 
            
                                                        
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 20 |  |  | class OneHotEncoder(NominalAttributeEncoder): | 
            
                                                        
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 22 |  |  |     def encode(self, *args, **kwargs): | 
            
                                                        
            
                                    
            
            
                | 23 |  |  |         datapoints = args[0] | 
            
                                                        
            
                                    
            
            
                | 24 |  |  |         attribute = args[1] | 
            
                                                        
            
                                    
            
            
                | 25 |  |  |         prefix_separator = '_' | 
            
                                                        
            
                                    
            
            
                | 26 |  |  |         dataframe = pd.get_dummies(datapoints.observations[attribute], prefix=attribute, prefix_sep='_', | 
            
                                                        
            
                                    
            
            
                | 27 |  |  |                                    drop_first=False) | 
            
                                                        
            
                                    
            
            
                | 28 |  |  |         self.values_set = [x.replace(f'{attribute}{prefix_separator}', '') for x in dataframe.columns] | 
            
                                                        
            
                                    
            
            
                | 29 |  |  |         self.columns = list(dataframe.columns) | 
            
                                                        
            
                                    
            
            
                | 30 |  |  |         return dataframe | 
            
                                                        
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 32 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 33 |  |  | # CMD 2 | 
            
                                                        
            
                                    
            
            
                | 34 |  |  | def one_hot_encoding_command(_data_manager, _datapoints, _attribute): | 
            
                                                        
            
                                    
            
            
                | 35 |  |  |     dataframe = OneHotEncoder().encode(_datapoints, _attribute) | 
            
                                                        
            
                                    
            
            
                | 36 |  |  |     # TODO add a add_columns method to the mutator interface | 
            
                                                        
            
                                    
            
            
                | 37 |  |  |     # replace below with datapoints.mutator.add_columns(...) (similar to the encode_nominal_subsets_command above) | 
            
                                                        
            
                                    
            
            
                | 38 |  |  |     _data_manager.datapoints.observations = pd.concat([_data_manager.datapoints.observations, dataframe], axis=1) | 
            
                                                        
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 40 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 41 |  |  | class OneHotListEncoder(NominalAttributeEncoder): | 
            
                                                        
            
                                    
            
            
                | 42 |  |  |     binary_transformer = {True: 1.0, False: 0.0} | 
            
                                                        
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 44 |  |  |     def encode(self, *args, **kwargs): | 
            
                                                        
            
                                    
            
            
                | 45 |  |  |         datapoints = args[0] | 
            
                                                        
            
                                    
            
            
                | 46 |  |  |         attribute = args[1] | 
            
                                                        
            
                                    
            
            
                | 47 |  |  |         self.values_set = reduce(lambda i, j: set(i).union(set(j)), | 
            
                                                        
            
                                    
            
            
                | 48 |  |  |                                  [_ for _ in datapoints.observations[attribute] if isinstance(_, list)]) | 
            
                                                        
            
                                    
            
            
                | 49 |  |  |         self.columns = list(self.values_set) | 
            
                                                        
            
                                    
            
            
                | 50 |  |  |         return pd.DataFrame([self._yield_vector(datarow, attribute) for index, datarow in datapoints.iterrows()], | 
            
                                                        
            
                                    
            
            
                | 51 |  |  |                             columns=self.columns) | 
            
                                                        
            
                                    
            
            
                | 52 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 53 |  |  |     def _yield_vector(self, datarow, attribute): | 
            
                                                        
            
                                    
            
            
                | 54 |  |  |         decision = {True: self._encode, False: self._encode_none} | 
            
                                                        
            
                                    
            
            
                | 55 |  |  |         return decision[isinstance(datarow[attribute], list)](datarow, attribute) | 
            
                                                        
            
                                    
            
            
                | 56 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 57 |  |  |     def _encode(self, datarow, attribute): | 
            
                                                        
            
                                    
            
            
                | 58 |  |  |         return [OneHotListEncoder.binary_transformer[column in datarow[attribute]] for column in self.columns] | 
            
                                                        
            
                                    
            
            
                | 59 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 60 |  |  |     def _encode_none(self, _datarow, _attribute): | 
            
                                                        
            
                                    
            
            
                | 61 |  |  |         return [0.0] * len(self.values_set) | 
            
                                                        
            
                                    
            
            
                | 62 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 63 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 64 |  |  | # CMD 3 | 
            
                                                        
            
                                    
            
            
                | 65 |  |  | def one_hot_encoding_list_command(_data_manager, _datapoints, _attribute): | 
            
                                                        
            
                                    
            
            
                | 66 |  |  |     _data_manager.datapoints.observations[_attribute].fillna(value=np.nan, inplace=True) | 
            
                                                        
            
                                    
            
            
                | 67 |  |  |     dataframe = OneHotListEncoder().encode(_datapoints, _attribute) | 
            
                                                        
            
                                    
            
            
                | 68 |  |  |     _data_manager.datapoints.observations = pd.concat([_data_manager.datapoints.observations, dataframe], axis=1) | 
            
                                                        
            
                                    
            
            
                | 69 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 70 |  |  |  | 
            
                                                        
            
                                    
            
            
# Commands defined above whose first parameter is the data manager.
data_manager_commands = (one_hot_encoding_list_command, one_hot_encoding_command)
# Commands defined above that do not take a data manager (here: only a file path).
arbitrary_commands = (observations_command,)
            
                                                        
            
                                    
            
            
                | 73 |  |  |  |