| 1 |  |  | #! /usr/bin/env python | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | # | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | # Copyright (C) 2016 Rich Lewis <[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | # License: 3-clause BSD | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 | 1 |  | import os | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 | 1 |  | import zipfile | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 | 1 |  | import logging | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 | 1 |  | LOGGER = logging.getLogger(__name__) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 | 1 |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 | 1 |  | import numpy as np | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 | 1 |  | import skchem | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 | 1 |  | from .base import Converter | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 | 1 |  | from ... import standardizers | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
# Replacement SMILES strings, keyed by the value used as the row label of the
# dataset (`patch_data` iterates these pairs as `cas, smiles` and writes each
# one into the `structure` column).  Per the `patch_data` docstring these are
# rewrites that specify diazo groups in an rdkit friendly way.
# NOTE(review): the last two keys map to the identical SMILES string -
# presumably two identifiers for the same structure; confirm against the data.
PATCHES = {
    '820-75-7': r'NNC(=O)CNC(=O)C=[N+]=[N-]',
    '2435-76-9': r'[N-]=[N+]=C1C=NC(=O)NC1=O',
    '817-99-2': r'NC(=O)CNC(=O)\C=[N+]=[N-]',
    '116539-70-9': r'CCCCN(CC(O)C1=C\C(=[N+]=[N-])\C(=O)C=C1)N=O',
    '115-02-6': r'NC(COC(=O)\C=[N+]=[N-])C(=O)O',
    '122341-55-3': r'NC(COC(=O)\C=[N+]=[N-])C(=O)O'
}
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 | 1 |  | class MullerAmesConverter(Converter): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 29 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 30 | 1 |  |     def __init__(self, directory, output_directory, output_filename='muller_ames.h5'): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 32 |  |  |         """ | 
            
                                                                        
                            
            
                                    
            
            
                | 33 |  |  |         Args: | 
            
                                                                        
                            
            
                                    
            
            
                | 34 |  |  |             directory (str): | 
            
                                                                        
                            
            
                                    
            
            
                | 35 |  |  |                 Directory in which input files reside. | 
            
                                                                        
                            
            
                                    
            
            
                | 36 |  |  |             output_directory (str): | 
            
                                                                        
                            
            
                                    
            
            
                | 37 |  |  |                 Directory in which to save the converted dataset. | 
            
                                                                        
                            
            
                                    
            
            
                | 38 |  |  |             output_filename (str): | 
            
                                                                        
                            
            
                                    
            
            
                | 39 |  |  |                 Name of the saved dataset. Defaults to `muller_ames.h5`. | 
            
                                                                        
                            
            
                                    
            
            
                | 40 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 41 |  |  |         Returns: | 
            
                                                                        
                            
            
                                    
            
            
                | 42 |  |  |             tuple of str: | 
            
                                                                        
                            
            
                                    
            
            
                | 43 |  |  |                 Single-element tuple containing the path to the converted dataset. | 
            
                                                                        
                            
            
                                    
            
            
                | 44 |  |  |         """ | 
            
                                                                        
                            
            
                                    
            
            
                | 45 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 46 |  |  |         zip_path = os.path.join(directory, 'ci900161g_si_001.zip') | 
            
                                                                        
                            
            
                                    
            
            
                | 47 |  |  |         output_path = os.path.join(output_directory, output_filename) | 
            
                                                                        
                            
            
                                    
            
            
                | 48 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 49 |  |  |         with zipfile.ZipFile(zip_path) as f: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 50 |  |  |             f.extractall() | 
            
                                                                        
                            
            
                                    
            
            
                | 51 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 52 |  |  |         # create dataframe | 
            
                                                                        
                            
            
                                    
            
            
                | 53 |  |  |         data = pd.read_csv(os.path.join(directory, 'smiles_cas_N6512.smi'), | 
            
                                                                        
                            
            
                                    
            
            
                | 54 |  |  |                            delimiter='\t', index_col=1, | 
            
                                                                        
                            
            
                                    
            
            
                | 55 |  |  |                            converters={1: lambda s: s.strip()}, | 
            
                                                                        
                            
            
                                    
            
            
                | 56 |  |  |                            header=None, names=['structure', 'id', 'is_mutagen']) | 
            
                                                                        
                            
            
                                    
            
            
                | 57 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 58 |  |  |         data = self.patch_data(data, PATCHES) | 
            
                                                                        
                            
            
                                    
            
            
                | 59 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 60 |  |  |         data['structure'] = data.structure.apply(skchem.Mol.from_smiles) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 61 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 62 |  |  |         data = self.standardize(data) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 63 |  |  |         data = self.optimize(data) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 64 |  |  |         keep = self.filter(data) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 65 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |         ms, ys = keep.structure, keep.is_mutagen | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 68 |  |  |         indices = data.reset_index().index.difference(keep.reset_index().index) | 
            
                                                                        
                            
            
                                    
            
            
                | 69 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 70 |  |  |         train = self.parse_splits(os.path.join('splits_train_N6512.csv')) | 
            
                                                                        
                            
            
                                    
            
            
                | 71 |  |  |         train = self.drop_indices(train, indices) | 
            
                                                                        
                            
            
                                    
            
            
                | 72 |  |  |         splits = self.create_split_dict(train, 'train') | 
            
                                                                        
                            
            
                                    
            
            
                | 73 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 74 |  |  |         test = self.parse_splits(os.path.join(directory, 'splits_test_N6512.csv')) | 
            
                                                                        
                            
            
                                    
            
            
                | 75 |  |  |         test = self.drop_indices(test, indices) | 
            
                                                                        
                            
            
                                    
            
            
                | 76 |  |  |         splits.update(self.create_split_dict(test, 'test')) | 
            
                                                                        
                            
            
                                    
            
            
                | 77 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 78 |  |  |         self.run(ms, ys, output_path, splits=splits) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 | 1 |  |     def patch_data(self, data, patches): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |         """ Patch smiles in a DataFrame with rewritten ones that specify diazo | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |         groups in rdkit friendly way. """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |         LOGGER.info('Patching data...') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |         for cas, smiles in patches.items(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |             data.loc[cas, 'structure'] = smiles | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |         return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 | 1 |  |     def parse_splits(self, f_path): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |         LOGGER.info('Parsing splits...') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |         with open(f_path) as f: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |             splits = [split for split in f.read().strip().splitlines()] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |         splits = [[n for n in split.strip().split(',')] for split in splits] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |         splits = [sorted(int(n) for n in split) for split in splits] # sorted ints | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |         return [np.array(split) - 1 for split in splits] # zero based indexing | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 | 1 |  |     def drop_indices(self, splits, indices): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         LOGGER.info('Dropping failed compounds from split indices...') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |         for i, split in enumerate(splits): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |             split = split - sum(split > ix for ix in indices) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |             splits[i] = np.delete(split, indices) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |         return splits | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 | 1 |  |     def create_split_dict(self, splits, name): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         return {'{}_{}'.format(name, i + 1): split \ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |                         for i, split in enumerate(splits)} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 | 1 |  | if __name__ == '__main__': | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |     logging.basicConfig(level=logging.INFO) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |     LOGGER.info('Converting Muller Ames Dataset...') | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 114 |  |  |     MullerAmesConverter.convert() | 
            
                                                        
            
                                    
            
            
                | 115 |  |  |  | 
            
                        
The coding style of this project requires that you add a docstring to this code element. Below is an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.