1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
""" |
7
|
|
|
## skchem.descriptors.atom |
8
|
|
|
|
9
|
|
|
Module specifying atom based descriptor generators. |
10
|
|
|
""" |
11
|
|
|
|
12
|
1 |
|
import logging |
13
|
1 |
|
import subprocess |
14
|
1 |
|
import re |
15
|
1 |
|
from abc import ABCMeta |
16
|
|
|
|
17
|
1 |
|
import pandas as pd |
|
|
|
|
18
|
1 |
|
import numpy as np |
|
|
|
|
19
|
|
|
|
20
|
1 |
|
from ..utils import line_count, nanarray |
21
|
1 |
|
from ..base import ( |
22
|
|
|
CLIWrapper, Transformer, AtomTransformer, BatchTransformer, Featurizer |
23
|
|
|
) |
24
|
|
|
|
25
|
1 |
|
# Module-level logger. It is keyed on the dotted module name rather than the
# file path, so it takes part in the package's logger hierarchy.
LOGGER = logging.getLogger(__name__)
26
|
|
|
|
27
|
|
|
# TODO: fix averagemicrospeciescharge |
|
|
|
|
28
|
|
|
# TODO: fix logd logp logs |
|
|
|
|
29
|
|
|
# TODO: oen (orbital electronegativity) - sigma + pi |
|
|
|
|
30
|
|
|
# TODO: water accessible surface area |
|
|
|
|
31
|
|
|
|
32
|
|
|
# TODO: these don't produce csv |
|
|
|
|
33
|
|
|
# ['doublebondstereoisomers', 'conformers', 'stereoisomers', |
34
|
|
|
# 'moleculardynamics', 'stereoanalysis', 'lowestenergyconformer', 'msdistr2', |
35
|
|
|
# 'conformations', 'dominanttautomerdistribution', 'hnmr', 'moldyn', 'cnmr', |
36
|
|
|
# 'frameworks', 'microspeciesdistribution', 'nmr', 'leconformer', 'msdistr', |
37
|
|
|
# 'tetrahedralstereoisomers'] |
38
|
|
|
# |
39
|
|
|
|
40
|
1 |
|
CHEMAXON_HINT = """ Install ChemAxon from https://www.chemaxon.com. |
41
|
|
|
It requires a license, which can be freely obtained for academics. """ |
42
|
|
|
|
43
|
|
|
|
44
|
1 |
|
class ChemAxonBaseFeaturizer(CLIWrapper, Featurizer): |
|
|
|
|
45
|
|
|
|
46
|
|
|
""" Base class for ChemAxonFeaturizers (using cxcalc). |
47
|
|
|
|
48
|
|
|
Concrete subclasses inheriting from this should override |
49
|
|
|
`_all_feats`, `_optimal_features`. |
50
|
|
|
""" |
51
|
|
|
|
52
|
1 |
|
__metaclass__ = ABCMeta |
53
|
|
|
|
54
|
1 |
|
install_hint = CHEMAXON_HINT |
55
|
|
|
|
56
|
1 |
|
_feat_columns = {'averagepol': ['a_avg'], 'name': ['preferred_iupac_name'], |
57
|
|
|
'aromaticbondcount': ['aromatic_bond_count'], |
58
|
|
|
'maximalprojectionradius': ['maximal_projection_radius'], |
59
|
|
|
'tpolarizability': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], |
60
|
|
|
'distance': ['distance'], |
61
|
|
|
'acceptor': ['acceptor_count', 'acceptor_site_count'], |
62
|
|
|
'fusedringcount': ['fused_ring_count'], |
63
|
|
|
'charge': ['total_charge'], |
64
|
|
|
'donor': ['donor_count', 'donor_site_count'], |
65
|
|
|
'ringcount': ['ring_count'], |
66
|
|
|
'chainbond': ['chain_bond'], |
67
|
|
|
'mmff94energy': ['mmff94_energy'], |
68
|
|
|
'huckel': ['aromatic_e+/nu-_order', |
69
|
|
|
'localization_energy_l_+/l-', 'pi_energy', |
70
|
|
|
'electron_density', 'charge_density'], |
71
|
|
|
'chainatom': ['chain_atom'], |
72
|
|
|
'shortestpath': ['shortest_path'], |
73
|
|
|
'resonantcount': ['resonant_count'], |
74
|
|
|
'tpol': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], |
75
|
|
|
'moststabletautomer': ['most_stable_tautomer'], |
76
|
|
|
'generictautomer': ['generictautomer'], |
77
|
|
|
'hmoelectrophilicityorder': ['hmo_aromatic_e+_order'], |
78
|
|
|
'ringsystemcountofsize': ['ring_system_count_of_size'], |
79
|
|
|
'largestatomringsize': ['largest_ring_size_of_atom'], |
80
|
|
|
'tetrahedralstereoisomercount': [ |
81
|
|
|
'tetrahedral_stereoisomer_count'], |
82
|
|
|
'enumerations': ['enumerations'], |
83
|
|
|
'ringatom': ['ring_atom'], 'connected': ['connected'], |
84
|
|
|
'hmolocalizationenergy': [ |
85
|
|
|
'hmo_localization_energy_l+/l-'], |
86
|
|
|
'averagemolecularpolarizability': ['a_avg'], |
87
|
|
|
'donorsitecount': ['donor_site_count'], |
88
|
|
|
'donorcount': ['donor_count'], |
89
|
|
|
'asymmetricatom': ['asymmetric_atom'], |
90
|
|
|
'pienergy': ['pi_energy'], |
91
|
|
|
'bondcount': ['bond_count'], |
92
|
|
|
'chiralcenters': ['chiral_centers'], |
93
|
|
|
'hmohuckel': ['hmo_aromatic_e+/nu-_order', |
94
|
|
|
'hmo_localization_energy_l+/l-', |
95
|
|
|
'hmo_pi_energy', |
96
|
|
|
'hmo_electron_density', |
97
|
|
|
'hmo_charge_density'], |
98
|
|
|
'huckeleigenvector': ['eigenvector'], |
99
|
|
|
'ringcountofsize': ['ring_count_of_size'], |
100
|
|
|
'heteroaliphaticringcount': [ |
101
|
|
|
'heteroaliphatic_ring_count'], |
102
|
|
|
'markushenumerations': ['enumerations'], |
103
|
|
|
'minimalprojectionradius': ['minimal_projection_radius'], |
104
|
|
|
'dipole': ['dipole'], |
105
|
|
|
'balabanindex': ['balaban_index'], |
106
|
|
|
'aromaticnucleophilicityorder': ['aromatic_nu-_order'], |
107
|
|
|
'tautomercount': ['tautomer_count'], |
108
|
|
|
'cyclomaticnumber': ['cyclomatic_number'], |
109
|
|
|
'psa': ['polar_surface_area'], 'isoelectricpoint': ['pi'], |
110
|
|
|
'hmopienergy': ['hmo_pi_energy'], |
111
|
|
|
'ayypol': ['a_yy'], 'fragmentcount': ['fragment_count'], |
112
|
|
|
'acceptormultiplicity': ['acceptor_multiplicity'], |
113
|
|
|
'topologyanalysistable': ['atom_count', |
114
|
|
|
'aliphatic_atom_count', |
115
|
|
|
'aromatic_atom_count', |
116
|
|
|
'bond_count', |
117
|
|
|
'aliphatic_bond_count', |
118
|
|
|
'aromatic_bond_count', |
119
|
|
|
'rotatable_bond_count', |
120
|
|
|
'ring_count', |
121
|
|
|
'aliphatic_ring_count', |
122
|
|
|
'aromatic_ring_count', |
123
|
|
|
'hetero_ring_count', |
124
|
|
|
'heteroaliphatic_ring_count', |
125
|
|
|
'heteroaromatic_ring_count', |
126
|
|
|
'ring_atom_count', |
127
|
|
|
'ring_bond_count', |
128
|
|
|
'chain_atom_count', |
129
|
|
|
'chain_bond_count', |
130
|
|
|
'smallest_ring_size', |
131
|
|
|
'largest_ring_size'], |
132
|
|
|
'ioncharge': ['charge'], |
133
|
|
|
'asymmetricatoms': ['asymmetric_atoms'], |
134
|
|
|
'wateraccessiblesurfacearea': ['asa', 'asa+', 'asa-', |
135
|
|
|
'asa_h', 'asa_p'], |
136
|
|
|
'avgpol': ['a_avg'], |
137
|
|
|
'carboaliphaticringcount': ['carboaliphatic_ring_count'], |
138
|
|
|
'aliphaticringcount': ['aliphatic_ring_count'], |
139
|
|
|
'donormultiplicity': ['donor_multiplicity'], |
140
|
|
|
'minimalprojectionarea': ['minimal_projection_area'], |
141
|
|
|
'nucleophiliclocalizationenergy': [ |
142
|
|
|
'localization_energy_l-'], 'dihedral': ['dihedral'], |
143
|
|
|
'heteroringcount': ['hetero_ring_count'], |
144
|
|
|
'azzpol': ['a_zz'], |
145
|
|
|
'molecularsurfacearea': ['van_der_waals_surface_area_3d'], |
146
|
|
|
'hmonucleophiliclocalizationenergy': [ |
147
|
|
|
'hmo_localization_energy_l-'], |
148
|
|
|
'chargedistribution': ['charge_distribution'], |
149
|
|
|
'pol': ['a_mol', 'a_atom'], |
150
|
|
|
'hmoelectrondensity': ['hmo_electron_density'], |
151
|
|
|
'carboaromaticringcount': ['carboaromatic_ring_count'], |
152
|
|
|
'acceptorsitecount': ['acceptor_site_count'], |
153
|
|
|
'markushenumerationcount': ['markush_library_size'], |
154
|
|
|
'localizationenergy': ['localization_energy_l+/l-'], |
155
|
|
|
'hararyindex': ['harary_index'], |
156
|
|
|
'asa': ['asa', 'asa+', 'asa-', 'asa_h', 'asa_p'], |
157
|
|
|
'acc': ['acc'], 'majortautomer': ['major_tautomer'], |
158
|
|
|
'majormicrospecies': ['major-ms'], |
159
|
|
|
'aliphaticatomcount': ['aliphatic_atom_count'], |
160
|
|
|
'angle': ['angle'], 'huckeleigenvalue': ['eigenvalue'], |
161
|
|
|
'axxpol': ['a_xx'], |
162
|
|
|
'chiralcenter': ['chiral_center'], |
163
|
|
|
'aliphaticbondcount': ['aliphatic_bond_count'], |
164
|
|
|
'smallestatomringsize': ['smallest_ring_size_of_atom'], |
165
|
|
|
'dreidingenergy': ['dreiding_energy'], |
166
|
|
|
'maximalprojectionsize': [ |
167
|
|
|
'length_perpendicular_to_the_max_area'], |
168
|
|
|
'largestringsystemsize': ['largest_ring_system_size'], |
169
|
|
|
'accsitecount': ['acceptor_site_count'], |
170
|
|
|
'refractivity': ['refractivity'], |
171
|
|
|
'bondtype': ['bond_type'], |
172
|
|
|
'chargedensity': ['charge_density'], |
173
|
|
|
'resonants': ['resonants'], |
174
|
|
|
'aromaticatomcount': ['aromatic_atom_count'], |
175
|
|
|
'distancedegree': ['distance_degree'], |
176
|
|
|
'hasvalidconformer': ['has_valid_conformer'], |
177
|
|
|
'electrondensity': ['electron_density'], |
178
|
|
|
'asymmetricatomcount': ['asymmetric_atom_count'], |
179
|
|
|
'fsp3': ['fsp3'], 'don': ['don'], |
180
|
|
|
'fusedaliphaticringcount': ['fused_aliphatic_ring_count'], |
181
|
|
|
'pkat': ['pkat'], |
182
|
|
|
'fusedaromaticringcount': ['fused_aromatic_ring_count'], |
183
|
|
|
'majorms2': ['majorms2'], |
184
|
|
|
'maximalprojectionarea': ['maximal_projection_area'], |
185
|
|
|
'hbonddonoracceptor': ['acceptor_count', 'donor_count', |
186
|
|
|
'acceptor_site_count', |
187
|
|
|
'donor_site_count'], |
188
|
|
|
'acceptorcount': ['acceptor_count'], |
189
|
|
|
'molecularpolarizability': ['a_mol'], |
190
|
|
|
'huckeltable': ['aromatic_e+/nu-_order', |
191
|
|
|
'localization_energy_l+/l-', 'pi_energy', |
192
|
|
|
'electron_density', 'charge_density'], |
193
|
|
|
'rotatablebondcount': ['rotatable_bond_count'], |
194
|
|
|
'minimalprojectionsize': [ |
195
|
|
|
'length_perpendicular_to_the_min_area'], |
196
|
|
|
'polarizability': ['a_mol', 'a_atom'], |
197
|
|
|
'acceptortable': ['acceptor_count', |
198
|
|
|
'acceptor_site_count'], |
199
|
|
|
'aliphaticringcountofsize': [ |
200
|
|
|
'aliphatic_ring_count_of_size'], 'hlb': ['hlb'], |
201
|
|
|
'eccentricity': ['eccentricity'], |
202
|
|
|
'hmochargedensity': ['hmo_charge_density'], |
203
|
|
|
'hmohuckeleigenvalue': ['hmo_eigenvalue'], |
204
|
|
|
'totalchargedensity': ['total_charge_density'], |
205
|
|
|
'hmonucleophilicityorder': ['hmo_aromatic_nu-_order'], |
206
|
|
|
'aromaticringcountofsize': [ |
207
|
|
|
'aromatic_ring_count_of_size'], |
208
|
|
|
'electrophilicityorder': ['aromatic_e+_order'], |
209
|
|
|
'connectedgraph': ['connected_graph'], |
210
|
|
|
'plattindex': ['platt_index'], 'logp': ['logp'], |
211
|
|
|
'topanal': ['atom_count', 'aliphatic_atom_count', |
212
|
|
|
'aromatic_atom_count', 'bond_count', |
213
|
|
|
'aliphatic_bond_count', 'aromatic_bond_count', |
214
|
|
|
'rotatable_bond_count', 'ring_count', |
215
|
|
|
'aliphatic_ring_count', 'aromatic_ring_count', |
216
|
|
|
'hetero_ring_count', |
217
|
|
|
'heteroaliphatic_ring_count', |
218
|
|
|
'heteroaromatic_ring_count', |
219
|
|
|
'ring_atom_count', |
220
|
|
|
'ring_bond_count', 'chain_atom_count', |
221
|
|
|
'chain_bond_count', 'smallest_ring_size', |
222
|
|
|
'largest_ring_size'], |
223
|
|
|
'logdcalculator': ['ph=0', 'ph=1', 'ph=2', 'ph=3', 'ph=4', |
224
|
|
|
'ph=5', 'ph=6', 'ph=7', 'ph=8', 'ph=9', |
225
|
|
|
'ph=10', 'ph=11', 'ph=12', 'ph=13', |
226
|
|
|
'ph=14', 'unnamed:_16'], |
227
|
|
|
'logs': ['ph=0.0', 'ph=1.0', 'ph=2.0', 'ph=3.0', 'ph=4.0', |
228
|
|
|
'ph=5.0', 'ph=6.0', 'ph=7.0', 'ph=8.0', |
229
|
|
|
'ph=9.0', 'ph=10.0', 'ph=11.0', 'ph=12.0', |
230
|
|
|
'ph=13.0', 'ph=14.0', 'unnamed:_16'], |
231
|
|
|
'atompol': ['a_atom'], 'canonicalresonant': ['structure'], |
232
|
|
|
'ringbond': ['ring_bond'], |
233
|
|
|
'ringatomcount': ['ring_atom_count'], |
234
|
|
|
'donortable': ['donor_count', 'donor_site_count'], |
235
|
|
|
'randicindex': ['randic_index'], |
236
|
|
|
'rotatablebond': ['rotatable_bond'], |
237
|
|
|
'hyperwienerindex': ['hyper_wiener_index'], |
238
|
|
|
'hmohuckeleigenvector': ['hmo_eigenvector'], |
239
|
|
|
'carboringcount': ['carbo_ring_count'], |
240
|
|
|
'logpcalculator': ['logp', 'unnamed:_2'], |
241
|
|
|
'ringsystemcount': ['ring_system_count'], |
242
|
|
|
'largestringsize': ['largest_ring_size'], |
243
|
|
|
'stereodoublebondcount': ['stereo_double_bond_count'], |
244
|
|
|
'pi': ['pi'], |
245
|
|
|
'stericeffectindex': ['steric_effect_index'], |
246
|
|
|
'volume': ['van_der_waals_volume'], |
247
|
|
|
'averagemicrospeciescharge': ['charge'], |
248
|
|
|
'pka': ['apka1', 'apka2', 'bpka1', 'bpka2', 'atoms'], |
249
|
|
|
'hmohuckeltable': ['hmo_aromatic_e+/nu-_order', |
250
|
|
|
'hmo_localization_energy_l+/l-', |
251
|
|
|
'hmo_pi_energy', |
252
|
|
|
'hmo_electron_density', |
253
|
|
|
'hmo_charge_density'], |
254
|
|
|
'ringcountofatom': ['ring_count_of_atom'], |
255
|
|
|
'aromaticelectrophilicityorder': ['aromatic_e+_order'], |
256
|
|
|
'hindrance': ['steric_hindrance'], |
257
|
|
|
'chainatomcount': ['chain_atom_count'], |
258
|
|
|
'pkacalculator': ['apka1', 'apka2', 'bpka1', 'bpka2', |
259
|
|
|
'atoms'], |
260
|
|
|
'heteroaromaticringcount': ['heteroaromatic_ring_count'], |
261
|
|
|
'sterichindrance': ['steric_hindrance'], |
262
|
|
|
'hbda': ['acceptor_count', 'donor_count', |
263
|
|
|
'acceptor_site_count', 'donor_site_count'], |
264
|
|
|
'molpol': ['a_mol'], |
265
|
|
|
'atomicpolarizability': ['a_atom'], |
266
|
|
|
'msdon': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', |
267
|
|
|
'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00', |
268
|
|
|
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', |
269
|
|
|
'ph=12.00', 'ph=13.00', 'ph=14.00'], |
270
|
|
|
'enumerationcount': ['markush_library_size'], |
271
|
|
|
'vdwsa': ['van_der_waals_surface_area_3d'], |
272
|
|
|
'orbitalelectronegativity': [ |
273
|
|
|
'sigma_orbital_electronegativity', |
274
|
|
|
'pi_orbital_electronegativity'], |
275
|
|
|
'hmoelectrophiliclocalizationenergy': [ |
276
|
|
|
'hmo_localization_energy_l+'], |
277
|
|
|
'smallestringsize': ['smallest_ring_size'], |
278
|
|
|
'szegedindex': ['szeged_index'], |
279
|
|
|
'nucleophilicityorder': ['aromatic_nu-_order'], |
280
|
|
|
'canonicaltautomer': ['canonical_tautomer'], |
281
|
|
|
'stereoisomercount': ['stereoisomer_count'], |
282
|
|
|
'msa': ['van_der_waals_surface_area_3d'], |
283
|
|
|
'donsitecount': ['donor_site_count'], |
284
|
|
|
'randommarkushenumerations': [ |
285
|
|
|
'randommarkushenumerations'], |
286
|
|
|
'wienerindex': ['wiener_index'], |
287
|
|
|
'huckelorbitals': ['orbitals'], |
288
|
|
|
'doublebondstereoisomercount': [ |
289
|
|
|
'double_bond_stereoisomer_count'], |
290
|
|
|
'tautomers': ['tautomers'], |
291
|
|
|
'polarsurfacearea': ['polar_surface_area'], |
292
|
|
|
'chiralcentercount': ['chiral_center_count'], |
293
|
|
|
'electrophiliclocalizationenergy': [ |
294
|
|
|
'localization_energy_l+'], |
295
|
|
|
'aliphaticatom': ['aliphatic_atom'], |
296
|
|
|
'ringbondcount': ['ring_bond_count'], |
297
|
|
|
'wienerpolarity': ['wiener_polarity'], |
298
|
|
|
'msacc': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', |
299
|
|
|
'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00', |
300
|
|
|
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', |
301
|
|
|
'ph=12.00', 'ph=13.00', 'ph=14.00'], |
302
|
|
|
'formalcharge': ['formal_charge'], |
303
|
|
|
'smallestringsystemsize': ['smallest_ring_system_size'], |
304
|
|
|
'majorms': ['major-ms'], |
305
|
|
|
'tholepolarizability': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], |
306
|
|
|
'aromaticatom': ['aromatic_atom'], |
307
|
|
|
'oen': ['sigma_orbital_electronegativity', |
308
|
|
|
'pi_orbital_electronegativity'], |
309
|
|
|
'chainbondcount': ['chain_bond_count'], |
310
|
|
|
'logd': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', |
311
|
|
|
'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00', |
312
|
|
|
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', |
313
|
|
|
'ph=12.00', 'ph=13.00', 'ph=14.00'], |
314
|
|
|
'hmohuckelorbitals': ['hmo_orbitals'], |
315
|
|
|
'aromaticringcount': ['aromatic_ring_count'], |
316
|
|
|
'pichargedensity': ['pi_charge_density']} |
317
|
|
|
|
318
|
1 |
|
_optimal_feats = [] # override this |
319
|
|
|
|
320
|
1 |
|
def __init__(self, features='optimal', verbose=True):
    """ Initialise the featurizer and select the descriptors to compute.

    Args:
        features (str or list or tuple):
            Handed to the `features` property setter, which validates it.
        verbose (bool):
            Forwarded unchanged to the parent initializer.
    """
    # Create the backing attribute before the parent initializer runs.
    self._features = None
    super(ChemAxonBaseFeaturizer, self).__init__(verbose=verbose)
    # Assign through the property so the requested set is validated.
    self.features = features
324
|
|
|
|
325
|
1 |
|
@property
def features(self):
    """ list: names of the cxcalc descriptors that will be calculated. """
    return self._features

@features.setter
def features(self, features):
    """ Select the descriptors to calculate.

    Args:
        features (str or list or tuple):
            `'optimal'` or `'all'` selects the class's `_optimal_feats`.
            Any other string is treated as a single descriptor name.  A
            list or tuple is taken as a collection of descriptor names.

    Raises:
        NotImplementedError:
            If a requested name is not a key of `_feat_columns`, or if
            `features` is neither a string, a list nor a tuple.
    """
    if features in ('optimal', 'all'):
        # NOTE(review): 'all' resolves to the same set as 'optimal' -
        # confirm whether a separate full set was intended.
        # Store a copy: the class-level list is shared by every instance,
        # so handing it out directly would let `obj.features.append(...)`
        # silently change the default for all featurizers.
        self._features = list(self._optimal_feats)
    elif isinstance(features, str):
        # Re-enter the setter so a lone name is validated like a list.
        self.features = [features]
    elif isinstance(features, (list, tuple)):
        valid = np.array([feat in self._feat_columns for feat in features])
        if not valid.all():
            msg = 'Descriptor "{}" not available.'.format(
                np.array(features)[~valid])
            raise NotImplementedError(msg)
        self._features = list(features)
    else:
        raise NotImplementedError('Feature set {} not available.'.format(
            features))
347
|
|
|
|
348
|
1 |
|
def _feature_index(self):
    """ Build the index of output columns for the selected descriptors.

    Returns:
        pandas.Index:
            Named 'features'.  It holds, in the order of `self.features`,
            every column name that `_feat_columns` lists for each
            selected descriptor.
    """
    column_names = [column
                    for descriptor in self.features
                    for column in self._feat_columns[descriptor]]
    return pd.Index(column_names, name='features')
352
|
|
|
|
353
|
1 |
|
@staticmethod
def validate_install():
    """ Check whether the `cxcalc` executable can be run.

    Returns:
        bool:
            True when `cxcalc` exits with status 0.  False when it exits
            with any other status, or when the executable is not found.
    """
    try:
        # Both output streams are discarded; only the exit status matters.
        exit_status = subprocess.call(['cxcalc'],
                                      stderr=subprocess.DEVNULL,
                                      stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        return False
    return exit_status == 0
361
|
|
|
|
362
|
1 |
|
def monitor_progress(self, filename):
    """ Report progress from the number of lines written to `filename`.

    Args:
        filename:
            Passed straight to `line_count`.

    Returns:
        int:
            The line count minus one (presumably discounting a header
            row - confirm), or 0 while the count is still falsy.
    """
    n_lines = line_count(filename)
    if not n_lines:
        return 0
    return n_lines - 1
365
|
|
|
|
366
|
1 |
|
def _cli_args(self, infile, outfile):
    """ Assemble the `cxcalc` command line.

    Args:
        infile:
            Placed directly after the executable name.
        outfile:
            Placed after the `-o` flag.

    Returns:
        list:
            `['cxcalc', infile, '-o', outfile]` followed by every entry of
            `self.features`.
    """
    base_command = ['cxcalc', infile, '-o', outfile]
    return base_command + self.features
368
|
|
|
|
369
|
1 |
|
def _parse_outfile(self, outfile):
    """ Load the table at `outfile` and discard its 'id' column.

    Args:
        outfile:
            Handed to `pandas.read_table` (python parsing engine).

    Returns:
        pandas.DataFrame:
            The parsed table without the 'id' column.
    """
    table = pd.read_table(outfile, engine='python')
    return table.drop('id', axis=1)
372
|
|
|
|
373
|
1 |
|
def _parse_errors(self, errs):
    """ Log the captured stderr at debug level.

    Args:
        errs:
            The stderr content to log.

    Returns:
        list:
            Always empty.
    """
    LOGGER.debug('stderr: %s', errs)
    # instances are not skipped ever, so there is nothing to report
    skipped = []
    return skipped
376
|
|
|
|
377
|
|
|
|
378
|
1 |
|
class ChemAxonFeaturizer(ChemAxonBaseFeaturizer, BatchTransformer,
                         Transformer):

    """ Featurizer producing a table of cxcalc descriptors.

    Its `name` is 'cx_mol' and its `columns` come from the base class's
    `_feature_index`.
    """

    # Default descriptor set.  The order is significant: it fixes the order
    # of the arguments passed to cxcalc and of the output columns.
    _optimal_feats = [
        'acceptorcount', 'accsitecount', 'aliphaticatomcount',
        'aliphaticbondcount', 'aliphaticringcount', 'aromaticatomcount',
        'aromaticbondcount', 'aromaticringcount', 'asymmetricatomcount',
        'averagemolecularpolarizability', 'axxpol', 'ayypol', 'azzpol',
        'balabanindex', 'bondcount', 'carboaliphaticringcount',
        'carboaromaticringcount', 'carboringcount', 'chainatomcount',
        'chainbondcount', 'chiralcentercount', 'connectedgraph',
        'cyclomaticnumber', 'dipole', 'donorcount', 'donorsitecount',
        'doublebondstereoisomercount', 'dreidingenergy', 'formalcharge',
        'fragmentcount', 'fsp3', 'fusedaliphaticringcount',
        'fusedaromaticringcount', 'fusedringcount', 'hararyindex',
        'heteroaliphaticringcount', 'heteroaromaticringcount',
        'heteroringcount', 'hlb', 'hmopienergy', 'hyperwienerindex',
        'largestringsize', 'largestringsystemsize',
        'markushenumerationcount', 'maximalprojectionarea',
        'maximalprojectionradius', 'maximalprojectionsize',
        'minimalprojectionarea', 'minimalprojectionradius',
        'minimalprojectionsize', 'mmff94energy', 'molpol', 'pienergy',
        'plattindex', 'psa', 'randicindex', 'refractivity',
        'resonantcount', 'ringatomcount', 'ringbondcount', 'ringcount',
        'ringsystemcount', 'rotatablebondcount', 'smallestringsize',
        'smallestringsystemsize', 'stereodoublebondcount',
        'stereoisomercount', 'szegedindex',
        'tetrahedralstereoisomercount', 'vdwsa', 'volume',
        'wateraccessiblesurfacearea', 'wienerindex', 'wienerpolarity',
    ]

    @property
    def name(self):
        """ str: identifier of this featurizer, 'cx_mol'. """
        return 'cx_mol'

    @property
    def columns(self):
        """ pandas.Index: column names for the selected descriptors. """
        return self._feature_index()

    def _parse_outfile(self, outfile):
        """ Parse the output table and coerce every cell to a float.

        Args:
            outfile:
                Forwarded to the parent class's `_parse_outfile`.

        Returns:
            pandas.DataFrame:
                The parent's table with each cell converted by `float`;
                string cells containing 'FAILED' become NaN.
        """
        table = super(ChemAxonFeaturizer, self)._parse_outfile(outfile)

        def fix_failed(inp):
            # A string cell mentioning 'FAILED' is mapped to NaN.
            failed = isinstance(inp, str) and 'FAILED' in inp
            return np.nan if failed else float(inp)

        return table.applymap(fix_failed)
433
|
|
|
|
434
|
|
|
|
435
|
1 |
|
class ChemAxonAtomFeaturizer(ChemAxonBaseFeaturizer, AtomTransformer,
                             BatchTransformer):

    """ Featurizer producing per-atom cxcalc descriptors.

    Its `name` is 'cx_atom'.  Each output cell is a ';'-separated list of
    values that is padded with NaN up to `self.max_atoms`.

    NOTE(review): the result is built with `pandas.Panel`, which recent
    pandas releases no longer provide - confirm the supported pandas range.
    """

    # Default descriptor set; the order fixes the order of the output.
    _optimal_feats = [
        'acceptormultiplicity', 'aliphaticatom', 'aromaticatom',
        'aromaticelectrophilicityorder', 'asymmetricatom',
        'atomicpolarizability', 'chainatom', 'chargedensity',
        'chiralcenter', 'distancedegree', 'donormultiplicity',
        'eccentricity', 'electrondensity',
        'electrophiliclocalizationenergy', 'hindrance',
        'hmochargedensity', 'hmoelectrondensity',
        'hmoelectrophilicityorder', 'hmoelectrophiliclocalizationenergy',
        'hmonucleophilicityorder', 'hmonucleophiliclocalizationenergy',
        'ioncharge', 'largestatomringsize', 'nucleophilicityorder',
        'nucleophiliclocalizationenergy', 'oen', 'pichargedensity',
        'ringatom', 'ringcountofatom', 'stericeffectindex',
        'totalchargedensity',
    ]

    # Not read anywhere in this class; presumably the descriptors whose
    # output includes hydrogens - TODO confirm.
    _h_inc_feats = [
        'acc', 'atomicpolarizability', 'charge', 'distancedegree', 'don',
        'eccentricity', 'hindrance', 'largestatomringsize', 'oen',
        'ringcountofatom', 'smallestatomringsize', 'stericeffectindex',
    ]

    @property
    def name(self):
        """ str: identifier of this featurizer, 'cx_atom'. """
        return 'cx_atom'

    @property
    def minor_axis(self):
        """ pandas.Index: column names for the selected descriptors. """
        return self._feature_index()

    def _transform_atom(self, atom):
        """ Unsupported: always raises NotImplementedError. """
        raise NotImplementedError('Cannot calculate per atom with ChemAxon')

    def _parse_outfile(self, outfile):
        """ Parse the output table into padded per-atom arrays.

        Args:
            outfile:
                Forwarded to the parent class's `_parse_outfile`.

        Returns:
            pandas.Panel:
                Built from the nested list of padded arrays, with axes 1
                and 2 swapped.
        """
        table = super(ChemAxonAtomFeaturizer, self)._parse_outfile(outfile)

        def parse_string(s):
            # cxcalc booleans are mapped to 1 / 0.
            if s == 'true':
                return 1
            if s == 'false':
                return 0
            # Anything `float` rejects - including the empty string -
            # becomes NaN.
            try:
                return float(s)
            except ValueError:
                return np.nan

        def to_padded(s):
            # Start from an all-NaN vector and overwrite the leading slots.
            padded = np.repeat(np.nan, self.max_atoms)
            values = np.array([parse_string(i) for i in str(s).split(';')])
            padded[:len(values)] = values
            return padded

        cells = table.applymap(to_padded)
        panel = pd.Panel(cells.values.tolist())
        return panel.swapaxes(1, 2)
493
|
|
|
|
494
|
|
|
|
495
|
1 |
|
class ChemAxonNMRPredictor(ChemAxonBaseFeaturizer, BatchTransformer,
                           AtomTransformer):

    """ Predictor of NMR shifts using cxcalc's `cnmr` / `hnmr` calculations.

    Its `name` is 'cx_nmr'.  Exactly one of the two calculations is active
    at a time; it is chosen through the `features` property.

    NOTE(review): the result is built with `pandas.Panel`, which recent
    pandas releases no longer provide - confirm the supported pandas range.
    """

    _feat_columns = {'cnmr': ['cnmr'], 'hnmr': ['hnmr']}
    _optimal_feats = ['cnmr']

    # Lines that open and close each peak assignment table in the output.
    _PEAKS_START = b'##PEAKASSIGNMENTS=(XYMA)\r\n'
    _PEAKS_END = b'##END=\r\n'

    @property
    def name(self):
        """ str: identifier of this predictor, 'cx_nmr'. """
        return 'cx_nmr'

    def _transform_atom(self, atom):
        """ Unsupported: always raises NotImplementedError. """
        raise NotImplementedError('ChemAxon cannot predict for atoms.')

    def monitor_progress(self, filename):
        """ Count the peak assignment tables written to `filename`.

        Args:
            filename:
                Path of the file to scan; it is opened in binary mode.

        Returns:
            int:
                Number of lines equal to the peak-table opening marker.
        """
        # The context manager closes the handle even if iteration fails.
        with open(filename, 'rb') as handle:
            return sum(1 for line in handle if line == self._PEAKS_START)

    @property
    def minor_axis(self):
        """ pandas.Index: the active calculation, in an index named 'shift'. """
        return pd.Index(self.features, name='shift')

    @property
    def features(self):
        """ list: single-element list holding 'cnmr' or 'hnmr'. """
        return self._features

    @features.setter
    def features(self, val):
        """ Choose which calculation to run.

        Args:
            val (str):
                'c' or 'cnmr' selects `cnmr`; 'h' or 'hnmr' selects `hnmr`;
                'optimal' selects the class's `_optimal_feats`.

        Raises:
            NotImplementedError: for any other value.
        """
        if val == 'optimal':
            # The inherited initializer defaults to features='optimal', so
            # this value must be accepted for default construction to work.
            self._features = list(self._optimal_feats)
        elif val in ('c', 'cnmr'):
            self._features = ['cnmr']
        elif val in ('h', 'hnmr'):
            self._features = ['hnmr']
        else:
            raise NotImplementedError('Feature {} not implemented'.format(val))

    def _parse_outfile(self, outfile):
        """ Read the predicted shifts from the peak assignment tables.

        Each row of a table must match
        `(<shift>,<digits>,<capital letter>,<<idx>[,<idx>...]>)` followed
        by CRLF.  The shift is stored at every listed index along the
        second axis of the result.

        Args:
            outfile:
                Path of the file to parse; it is opened in binary mode.

        Returns:
            pandas.Panel:
                Wraps an array of shape `(n_tables, self.max_atoms, 1)`
                created by `nanarray`.

        Raises:
            ValueError: if a row inside a table does not match the pattern.
        """
        n_mols = self.monitor_progress(outfile)
        res = nanarray((n_mols, self.max_atoms, 1))
        # Raw literal: same pattern as before, without relying on invalid
        # escape sequences being passed through a normal bytes literal.
        regex = re.compile(br'\((-?\d+.\d+),\d+,[A-Z],<([0-9\,]+)>\)\r\n')

        mol_idx = 0

        with open(outfile, 'rb') as handle:
            # loop through the file - inner loop will also advance the pointer
            for line in handle:
                if line != self._PEAKS_START:
                    continue
                for row in handle:
                    if row == self._PEAKS_END:
                        break
                    LOGGER.debug('Row to parse: %s', row)
                    match = regex.match(row)
                    if match is None:
                        raise ValueError(
                            'Could not parse peak assignment row: '
                            '{!r}'.format(row))
                    shift, idxs = match.groups()
                    shift = float(shift)
                    for atom_idx in idxs.split(b','):
                        res[mol_idx, int(atom_idx)] = shift
                # One table has been consumed; move on to the next slot.
                mol_idx += 1
        return pd.Panel(res)

    def transform(self, inp):
        """ Run the parent class's `transform` and return its `.T`. """
        return super(ChemAxonNMRPredictor, self).transform(inp).T
556
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder.