| 1 |  |  | #! /usr/bin/env python | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | # | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | # Copyright (C) 2016 Rich Lewis <[email protected]> | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | # License: 3-clause BSD | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 | 1 |  | import os | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 | 1 |  | import zipfile | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 | 1 |  | import logging | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 | 1 |  | LOGGER = logging.getLogger(__name__) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 | 1 |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 | 1 |  | import numpy as np | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 | 1 |  | from ... import io | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 | 1 |  | from .base import Converter, contiguous_order | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 | 1 |  | from ...cross_validation import SimThresholdSplit | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 | 1 |  | TXT_COLUMNS = [l.lower() for l in """CAS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | Formula | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  | Mol_Weight | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  | Chemical_Name | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | WS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  | WS_temp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  | WS_type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  | WS_reference | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  | LogP | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  | LogP_temp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  | LogP_type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  | LogP_reference | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  | VP | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  | VP_temp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  | VP_type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  | VP_reference | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  | DC_pKa | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  | DC_temp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  | DC_type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  | DC_reference | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  | henry_law Constant | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  | HL_temp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  | HL_type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  | HL_reference | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  | OH | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  | OH_temp | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  | OH_type | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  | OH_reference | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  | BP_pressure | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  | MP | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  | BP | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  | FP""".split('\n')] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 | 1 |  | class PhysPropConverter(Converter): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 | 1 |  |     def __init__(self, directory, output_directory, output_filename='physprop.h5'): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |         output_path = os.path.join(output_directory, output_filename) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |         sdf, txt = self.extract(directory) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |         mols, data = self.process_sdf(sdf), self.process_txt(txt) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |         LOGGER.debug('Compounds with data extracted: %s', len(data)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |         data = mols.to_frame().join(data) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |         data = self.drop_inconsistencies(data) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  |         y = self.process_targets(data) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  |         LOGGER.debug('Compounds with experimental: %s', len(y)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |         data = data.ix[y.index] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |         data.columns.name = 'targets' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |         ms, y = data.structure, data.drop('structure', axis=1) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |         cv = SimThresholdSplit(min_threshold=0.6, block_width=4000, n_jobs=-1).fit(ms) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         train, valid, test = cv.split((70, 15, 15)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |         (ms, y, train, valid, test) = contiguous_order((ms, y, train, valid, test), (train, valid, test)) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |         splits = (('train', train), ('valid', valid), ('test', test)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |         self.run(ms, y, output_path=output_path, splits=splits) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 | 1 |  |     def extract(self, directory): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |         LOGGER.info('Extracting from %s', directory) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |         with zipfile.ZipFile(os.path.join(directory, 'phys_sdf.zip')) as f: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |             sdf = f.extract('PhysProp.sdf') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |         with zipfile.ZipFile(os.path.join(directory, 'phys_txt.zip')) as f: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |             txt = f.extract('PhysProp.txt') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         return sdf, txt | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 | 1 |  |     def process_sdf(self, path): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |         LOGGER.info('Processing sdf at %s', path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |         mols = io.read_sdf(path, read_props=False).structure | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |         mols.index = mols.apply(lambda m: m.GetProp('CAS')) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |         mols.index.name = 'cas' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |         LOGGER.debug('Structures extracted: %s', len(mols)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |         return mols | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 | 1 |  |     def process_txt(self, path): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |         LOGGER.info('Processing txt at %s', path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |         data = pd.read_table(path, header=None, engine='python').iloc[:, :32] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         data.columns = TXT_COLUMNS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |         data_types = data.columns[[s.endswith('_type') for s in data.columns]] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |         data[data_types] = data[data_types].fillna('NAN') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |         data = data.set_index('cas') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |         return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 | 1 |  |     def drop_inconsistencies(self, data): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |         LOGGER.info('Dropping inconsistent data...') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         formula = data.structure.apply(lambda m: m.to_formula()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |         LOGGER.info('Inconsistent compounds: %s', (formula != data.formula).sum()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |         data = data[formula == data.formula] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |         return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 | 1 |  |     def process_targets(self, data): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |         LOGGER.info('Dropping estimated data...') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |         data = pd.concat([self.process_logS(data), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |                           self.process_logP(data), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |                           self.process_mp(data), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |                           self.process_bp(data)], axis=1) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |         LOGGER.info('Dropped compounds: %s', data.isnull().all(axis=1).sum()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |         data = data[data.notnull().any(axis=1)] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |         LOGGER.debug('Compounds with experimental activities: %s', len(data)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |         return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 | 1 |  |     def process_logS(self, data): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |         cleaned = pd.DataFrame(index=data.index) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |         S = 0.001 * data.ws / data.mol_weight | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |         logS = np.log10(S) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |         return logS[data.ws_type == 'EXP'] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 | 1 |  |     def process_logP(self, data): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |         logP = data.logp[data.logp_type == 'EXP'] | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |         return logP[logP > -10] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 | 1 |  |     def process_mp(self, data): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |         return data.mp.apply(self.fix_temp) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 | 1 |  |     def process_bp(self, data): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |         return data.bp.apply(self.fix_temp) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 139 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 140 | 1 |  |     @staticmethod | 
            
                                                                        
                            
            
                                    
            
            
                | 141 | 1 |  |     def fix_temp(s, mean_range=5): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 142 |  |  |         try: | 
            
                                                                        
                            
            
                                    
            
            
                | 143 |  |  |             return float(s) | 
            
                                                                        
                            
            
                                    
            
            
                | 144 |  |  |         except ValueError: | 
            
                                                                        
                            
            
                                    
            
            
                | 145 |  |  |             if '<' in s or '>' in s: | 
            
                                                                        
                            
            
                                    
            
            
                | 146 |  |  |                 return np.nan | 
            
                                                                        
                            
            
                                    
            
            
                | 147 |  |  |             s = s.strip(' dec') | 
            
                                                                        
                            
            
                                    
            
            
                | 148 |  |  |             s = s.strip(' sub') | 
            
                                                                        
                            
            
                                    
            
            
                | 149 |  |  |             if '-' in s and mean_range: | 
            
                                                                        
                            
            
                                    
            
            
                | 150 |  |  |                 rng = [float(n) for n in s.split('-')] | 
            
                                                                        
                            
            
                                    
            
            
                | 151 |  |  |                 if len(rng) > 2: | 
            
                                                                        
                            
            
                                    
            
            
                | 152 |  |  |                     return np.nan | 
            
                                                                        
                            
            
                                    
            
            
                | 153 |  |  |                 if np.abs(rng[1] - rng[0]) < mean_range: | 
            
                                                                        
                            
            
                                    
            
            
                | 154 |  |  |                     return (rng[0] + rng[1])/2 | 
            
                                                                        
                            
            
                                    
            
            
                | 155 |  |  |             try: | 
            
                                                                        
                            
            
                                    
            
            
                | 156 |  |  |                 return float(s) | 
            
                                                                        
                            
            
                                    
            
            
                | 157 |  |  |             except ValueError: | 
            
                                                                        
                            
            
                                    
            
            
                | 158 |  |  |                 return np.nan | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
# Script entry point: configure INFO-level logging first so the messages
# below are emitted, then run the converter.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    LOGGER.info('Converting PhysProp Dataset...')
    PhysPropConverter.convert()
            
                                                        
            
                                    
            
            
                | 166 |  |  |  | 
            
                        
The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.