|
1
|
|
|
#! /usr/bin/env python |
|
2
|
|
|
# |
|
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
|
4
|
|
|
# License: 3-clause BSD |
|
5
|
|
|
|
|
6
|
1 |
|
""" |
|
7
|
|
|
## skchem.descriptors.atom |
|
8
|
|
|
|
|
9
|
|
|
Module specifying atom based descriptor generators. |
|
10
|
|
|
""" |
|
11
|
|
|
|
|
12
|
1 |
|
import logging |
|
13
|
1 |
|
import subprocess |
|
14
|
1 |
|
import re |
|
15
|
1 |
|
from abc import ABCMeta |
|
16
|
|
|
|
|
17
|
1 |
|
import pandas as pd |
|
|
|
|
|
|
18
|
1 |
|
import numpy as np |
|
|
|
|
|
|
19
|
|
|
|
|
20
|
1 |
|
from ..utils import line_count, nanarray |
|
21
|
1 |
|
from ..base import ( |
|
22
|
|
|
CLIWrapper, Transformer, AtomTransformer, BatchTransformer, Featurizer |
|
23
|
|
|
) |
|
24
|
|
|
|
|
25
|
1 |
|
# Name the logger after the module's import path rather than its file path, so
# that it takes part in the package's logger hierarchy and can be configured
# through its parent loggers.
LOGGER = logging.getLogger(__name__)
|
26
|
|
|
|
|
27
|
|
|
# TODO: fix averagemicrospeciescharge |
|
|
|
|
|
|
28
|
|
|
# TODO: fix logd logp logs |
|
|
|
|
|
|
29
|
|
|
# TODO: oen (orbital electronegativity) - sigma + pi |
|
|
|
|
|
|
30
|
|
|
# TODO: water accessible surface area |
|
|
|
|
|
|
31
|
|
|
|
|
32
|
|
|
# TODO: these don't produce csv |
|
|
|
|
|
|
33
|
|
|
# ['doublebondstereoisomers', 'conformers', 'stereoisomers', |
|
34
|
|
|
# 'moleculardynamics', 'stereoanalysis', 'lowestenergyconformer', 'msdistr2', |
|
35
|
|
|
# 'conformations', 'dominanttautomerdistribution', 'hnmr', 'moldyn', 'cnmr', |
|
36
|
|
|
# 'frameworks', 'microspeciesdistribution', 'nmr', 'leconformer', 'msdistr', |
|
37
|
|
|
# 'tetrahedralstereoisomers'] |
|
38
|
|
|
# |
|
39
|
|
|
|
|
40
|
1 |
|
# Installation hint for the ChemAxon command line tools. It is exposed as
# `install_hint` on ChemAxonBaseFeaturizer; presumably CLIWrapper reports it
# when the install check fails -- TODO confirm against CLIWrapper.
CHEMAXON_HINT = """ Install ChemAxon from https://www.chemaxon.com.
It requires a license, which can be freely obtained for academics. """
|
42
|
|
|
|
|
43
|
|
|
|
|
44
|
1 |
|
class ChemAxonBaseFeaturizer(CLIWrapper, Featurizer): |
|
|
|
|
|
|
45
|
|
|
|
|
46
|
|
|
""" Base class for ChemAxonFeaturizers (using cxcalc). |
|
47
|
|
|
|
|
48
|
|
|
Concrete subclasses inheriting from this should override |
|
49
|
|
|
`_optimal_feats` (and `_feat_columns`, if the available features differ). |
|
50
|
|
|
""" |
|
51
|
|
|
|
|
52
|
1 |
|
__metaclass__ = ABCMeta |
|
53
|
|
|
|
|
54
|
1 |
|
install_hint = CHEMAXON_HINT |
|
55
|
|
|
|
|
56
|
1 |
|
_feat_columns = {'averagepol': ['a_avg'], 'name': ['preferred_iupac_name'], |
|
57
|
|
|
'aromaticbondcount': ['aromatic_bond_count'], |
|
58
|
|
|
'maximalprojectionradius': ['maximal_projection_radius'], |
|
59
|
|
|
'tpolarizability': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], |
|
60
|
|
|
'distance': ['distance'], |
|
61
|
|
|
'acceptor': ['acceptor_count', 'acceptor_site_count'], |
|
62
|
|
|
'fusedringcount': ['fused_ring_count'], |
|
63
|
|
|
'charge': ['total_charge'], |
|
64
|
|
|
'donor': ['donor_count', 'donor_site_count'], |
|
65
|
|
|
'ringcount': ['ring_count'], |
|
66
|
|
|
'chainbond': ['chain_bond'], |
|
67
|
|
|
'mmff94energy': ['mmff94_energy'], |
|
68
|
|
|
'huckel': ['aromatic_e+/nu-_order', |
|
69
|
|
|
'localization_energy_l_+/l-', 'pi_energy', |
|
70
|
|
|
'electron_density', 'charge_density'], |
|
71
|
|
|
'chainatom': ['chain_atom'], |
|
72
|
|
|
'shortestpath': ['shortest_path'], |
|
73
|
|
|
'resonantcount': ['resonant_count'], |
|
74
|
|
|
'tpol': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], |
|
75
|
|
|
'moststabletautomer': ['most_stable_tautomer'], |
|
76
|
|
|
'generictautomer': ['generictautomer'], |
|
77
|
|
|
'hmoelectrophilicityorder': ['hmo_aromatic_e+_order'], |
|
78
|
|
|
'ringsystemcountofsize': ['ring_system_count_of_size'], |
|
79
|
|
|
'largestatomringsize': ['largest_ring_size_of_atom'], |
|
80
|
|
|
'tetrahedralstereoisomercount': [ |
|
81
|
|
|
'tetrahedral_stereoisomer_count'], |
|
82
|
|
|
'enumerations': ['enumerations'], |
|
83
|
|
|
'ringatom': ['ring_atom'], 'connected': ['connected'], |
|
84
|
|
|
'hmolocalizationenergy': [ |
|
85
|
|
|
'hmo_localization_energy_l+/l-'], |
|
86
|
|
|
'averagemolecularpolarizability': ['a_avg'], |
|
87
|
|
|
'donorsitecount': ['donor_site_count'], |
|
88
|
|
|
'donorcount': ['donor_count'], |
|
89
|
|
|
'asymmetricatom': ['asymmetric_atom'], |
|
90
|
|
|
'pienergy': ['pi_energy'], |
|
91
|
|
|
'bondcount': ['bond_count'], |
|
92
|
|
|
'chiralcenters': ['chiral_centers'], |
|
93
|
|
|
'hmohuckel': ['hmo_aromatic_e+/nu-_order', |
|
94
|
|
|
'hmo_localization_energy_l+/l-', |
|
95
|
|
|
'hmo_pi_energy', |
|
96
|
|
|
'hmo_electron_density', |
|
97
|
|
|
'hmo_charge_density'], |
|
98
|
|
|
'huckeleigenvector': ['eigenvector'], |
|
99
|
|
|
'ringcountofsize': ['ring_count_of_size'], |
|
100
|
|
|
'heteroaliphaticringcount': [ |
|
101
|
|
|
'heteroaliphatic_ring_count'], |
|
102
|
|
|
'markushenumerations': ['enumerations'], |
|
103
|
|
|
'minimalprojectionradius': ['minimal_projection_radius'], |
|
104
|
|
|
'dipole': ['dipole'], |
|
105
|
|
|
'balabanindex': ['balaban_index'], |
|
106
|
|
|
'aromaticnucleophilicityorder': ['aromatic_nu-_order'], |
|
107
|
|
|
'tautomercount': ['tautomer_count'], |
|
108
|
|
|
'cyclomaticnumber': ['cyclomatic_number'], |
|
109
|
|
|
'psa': ['polar_surface_area'], 'isoelectricpoint': ['pi'], |
|
110
|
|
|
'hmopienergy': ['hmo_pi_energy'], |
|
111
|
|
|
'ayypol': ['a_yy'], 'fragmentcount': ['fragment_count'], |
|
112
|
|
|
'acceptormultiplicity': ['acceptor_multiplicity'], |
|
113
|
|
|
'topologyanalysistable': ['atom_count', |
|
114
|
|
|
'aliphatic_atom_count', |
|
115
|
|
|
'aromatic_atom_count', |
|
116
|
|
|
'bond_count', |
|
117
|
|
|
'aliphatic_bond_count', |
|
118
|
|
|
'aromatic_bond_count', |
|
119
|
|
|
'rotatable_bond_count', |
|
120
|
|
|
'ring_count', |
|
121
|
|
|
'aliphatic_ring_count', |
|
122
|
|
|
'aromatic_ring_count', |
|
123
|
|
|
'hetero_ring_count', |
|
124
|
|
|
'heteroaliphatic_ring_count', |
|
125
|
|
|
'heteroaromatic_ring_count', |
|
126
|
|
|
'ring_atom_count', |
|
127
|
|
|
'ring_bond_count', |
|
128
|
|
|
'chain_atom_count', |
|
129
|
|
|
'chain_bond_count', |
|
130
|
|
|
'smallest_ring_size', |
|
131
|
|
|
'largest_ring_size'], |
|
132
|
|
|
'ioncharge': ['charge'], |
|
133
|
|
|
'asymmetricatoms': ['asymmetric_atoms'], |
|
134
|
|
|
'wateraccessiblesurfacearea': ['asa', 'asa+', 'asa-', |
|
135
|
|
|
'asa_h', 'asa_p'], |
|
136
|
|
|
'avgpol': ['a_avg'], |
|
137
|
|
|
'carboaliphaticringcount': ['carboaliphatic_ring_count'], |
|
138
|
|
|
'aliphaticringcount': ['aliphatic_ring_count'], |
|
139
|
|
|
'donormultiplicity': ['donor_multiplicity'], |
|
140
|
|
|
'minimalprojectionarea': ['minimal_projection_area'], |
|
141
|
|
|
'nucleophiliclocalizationenergy': [ |
|
142
|
|
|
'localization_energy_l-'], 'dihedral': ['dihedral'], |
|
143
|
|
|
'heteroringcount': ['hetero_ring_count'], |
|
144
|
|
|
'azzpol': ['a_zz'], |
|
145
|
|
|
'molecularsurfacearea': ['van_der_waals_surface_area_3d'], |
|
146
|
|
|
'hmonucleophiliclocalizationenergy': [ |
|
147
|
|
|
'hmo_localization_energy_l-'], |
|
148
|
|
|
'chargedistribution': ['charge_distribution'], |
|
149
|
|
|
'pol': ['a_mol', 'a_atom'], |
|
150
|
|
|
'hmoelectrondensity': ['hmo_electron_density'], |
|
151
|
|
|
'carboaromaticringcount': ['carboaromatic_ring_count'], |
|
152
|
|
|
'acceptorsitecount': ['acceptor_site_count'], |
|
153
|
|
|
'markushenumerationcount': ['markush_library_size'], |
|
154
|
|
|
'localizationenergy': ['localization_energy_l+/l-'], |
|
155
|
|
|
'hararyindex': ['harary_index'], |
|
156
|
|
|
'asa': ['asa', 'asa+', 'asa-', 'asa_h', 'asa_p'], |
|
157
|
|
|
'acc': ['acc'], 'majortautomer': ['major_tautomer'], |
|
158
|
|
|
'majormicrospecies': ['major-ms'], |
|
159
|
|
|
'aliphaticatomcount': ['aliphatic_atom_count'], |
|
160
|
|
|
'angle': ['angle'], 'huckeleigenvalue': ['eigenvalue'], |
|
161
|
|
|
'axxpol': ['a_xx'], |
|
162
|
|
|
'chiralcenter': ['chiral_center'], |
|
163
|
|
|
'aliphaticbondcount': ['aliphatic_bond_count'], |
|
164
|
|
|
'smallestatomringsize': ['smallest_ring_size_of_atom'], |
|
165
|
|
|
'dreidingenergy': ['dreiding_energy'], |
|
166
|
|
|
'maximalprojectionsize': [ |
|
167
|
|
|
'length_perpendicular_to_the_max_area'], |
|
168
|
|
|
'largestringsystemsize': ['largest_ring_system_size'], |
|
169
|
|
|
'accsitecount': ['acceptor_site_count'], |
|
170
|
|
|
'refractivity': ['refractivity'], |
|
171
|
|
|
'bondtype': ['bond_type'], |
|
172
|
|
|
'chargedensity': ['charge_density'], |
|
173
|
|
|
'resonants': ['resonants'], |
|
174
|
|
|
'aromaticatomcount': ['aromatic_atom_count'], |
|
175
|
|
|
'distancedegree': ['distance_degree'], |
|
176
|
|
|
'hasvalidconformer': ['has_valid_conformer'], |
|
177
|
|
|
'electrondensity': ['electron_density'], |
|
178
|
|
|
'asymmetricatomcount': ['asymmetric_atom_count'], |
|
179
|
|
|
'fsp3': ['fsp3'], 'don': ['don'], |
|
180
|
|
|
'fusedaliphaticringcount': ['fused_aliphatic_ring_count'], |
|
181
|
|
|
'pkat': ['pkat'], |
|
182
|
|
|
'fusedaromaticringcount': ['fused_aromatic_ring_count'], |
|
183
|
|
|
'majorms2': ['majorms2'], |
|
184
|
|
|
'maximalprojectionarea': ['maximal_projection_area'], |
|
185
|
|
|
'hbonddonoracceptor': ['acceptor_count', 'donor_count', |
|
186
|
|
|
'acceptor_site_count', |
|
187
|
|
|
'donor_site_count'], |
|
188
|
|
|
'acceptorcount': ['acceptor_count'], |
|
189
|
|
|
'molecularpolarizability': ['a_mol'], |
|
190
|
|
|
'huckeltable': ['aromatic_e+/nu-_order', |
|
191
|
|
|
'localization_energy_l+/l-', 'pi_energy', |
|
192
|
|
|
'electron_density', 'charge_density'], |
|
193
|
|
|
'rotatablebondcount': ['rotatable_bond_count'], |
|
194
|
|
|
'minimalprojectionsize': [ |
|
195
|
|
|
'length_perpendicular_to_the_min_area'], |
|
196
|
|
|
'polarizability': ['a_mol', 'a_atom'], |
|
197
|
|
|
'acceptortable': ['acceptor_count', |
|
198
|
|
|
'acceptor_site_count'], |
|
199
|
|
|
'aliphaticringcountofsize': [ |
|
200
|
|
|
'aliphatic_ring_count_of_size'], 'hlb': ['hlb'], |
|
201
|
|
|
'eccentricity': ['eccentricity'], |
|
202
|
|
|
'hmochargedensity': ['hmo_charge_density'], |
|
203
|
|
|
'hmohuckeleigenvalue': ['hmo_eigenvalue'], |
|
204
|
|
|
'totalchargedensity': ['total_charge_density'], |
|
205
|
|
|
'hmonucleophilicityorder': ['hmo_aromatic_nu-_order'], |
|
206
|
|
|
'aromaticringcountofsize': [ |
|
207
|
|
|
'aromatic_ring_count_of_size'], |
|
208
|
|
|
'electrophilicityorder': ['aromatic_e+_order'], |
|
209
|
|
|
'connectedgraph': ['connected_graph'], |
|
210
|
|
|
'plattindex': ['platt_index'], 'logp': ['logp'], |
|
211
|
|
|
'topanal': ['atom_count', 'aliphatic_atom_count', |
|
212
|
|
|
'aromatic_atom_count', 'bond_count', |
|
213
|
|
|
'aliphatic_bond_count', 'aromatic_bond_count', |
|
214
|
|
|
'rotatable_bond_count', 'ring_count', |
|
215
|
|
|
'aliphatic_ring_count', 'aromatic_ring_count', |
|
216
|
|
|
'hetero_ring_count', |
|
217
|
|
|
'heteroaliphatic_ring_count', |
|
218
|
|
|
'heteroaromatic_ring_count', |
|
219
|
|
|
'ring_atom_count', |
|
220
|
|
|
'ring_bond_count', 'chain_atom_count', |
|
221
|
|
|
'chain_bond_count', 'smallest_ring_size', |
|
222
|
|
|
'largest_ring_size'], |
|
223
|
|
|
'logdcalculator': ['ph=0', 'ph=1', 'ph=2', 'ph=3', 'ph=4', |
|
224
|
|
|
'ph=5', 'ph=6', 'ph=7', 'ph=8', 'ph=9', |
|
225
|
|
|
'ph=10', 'ph=11', 'ph=12', 'ph=13', |
|
226
|
|
|
'ph=14', 'unnamed:_16'], |
|
227
|
|
|
'logs': ['ph=0.0', 'ph=1.0', 'ph=2.0', 'ph=3.0', 'ph=4.0', |
|
228
|
|
|
'ph=5.0', 'ph=6.0', 'ph=7.0', 'ph=8.0', |
|
229
|
|
|
'ph=9.0', 'ph=10.0', 'ph=11.0', 'ph=12.0', |
|
230
|
|
|
'ph=13.0', 'ph=14.0', 'unnamed:_16'], |
|
231
|
|
|
'atompol': ['a_atom'], 'canonicalresonant': ['structure'], |
|
232
|
|
|
'ringbond': ['ring_bond'], |
|
233
|
|
|
'ringatomcount': ['ring_atom_count'], |
|
234
|
|
|
'donortable': ['donor_count', 'donor_site_count'], |
|
235
|
|
|
'randicindex': ['randic_index'], |
|
236
|
|
|
'rotatablebond': ['rotatable_bond'], |
|
237
|
|
|
'hyperwienerindex': ['hyper_wiener_index'], |
|
238
|
|
|
'hmohuckeleigenvector': ['hmo_eigenvector'], |
|
239
|
|
|
'carboringcount': ['carbo_ring_count'], |
|
240
|
|
|
'logpcalculator': ['logp', 'unnamed:_2'], |
|
241
|
|
|
'ringsystemcount': ['ring_system_count'], |
|
242
|
|
|
'largestringsize': ['largest_ring_size'], |
|
243
|
|
|
'stereodoublebondcount': ['stereo_double_bond_count'], |
|
244
|
|
|
'pi': ['pi'], |
|
245
|
|
|
'stericeffectindex': ['steric_effect_index'], |
|
246
|
|
|
'volume': ['van_der_waals_volume'], |
|
247
|
|
|
'averagemicrospeciescharge': ['charge'], |
|
248
|
|
|
'pka': ['apka1', 'apka2', 'bpka1', 'bpka2', 'atoms'], |
|
249
|
|
|
'hmohuckeltable': ['hmo_aromatic_e+/nu-_order', |
|
250
|
|
|
'hmo_localization_energy_l+/l-', |
|
251
|
|
|
'hmo_pi_energy', |
|
252
|
|
|
'hmo_electron_density', |
|
253
|
|
|
'hmo_charge_density'], |
|
254
|
|
|
'ringcountofatom': ['ring_count_of_atom'], |
|
255
|
|
|
'aromaticelectrophilicityorder': ['aromatic_e+_order'], |
|
256
|
|
|
'hindrance': ['steric_hindrance'], |
|
257
|
|
|
'chainatomcount': ['chain_atom_count'], |
|
258
|
|
|
'pkacalculator': ['apka1', 'apka2', 'bpka1', 'bpka2', |
|
259
|
|
|
'atoms'], |
|
260
|
|
|
'heteroaromaticringcount': ['heteroaromatic_ring_count'], |
|
261
|
|
|
'sterichindrance': ['steric_hindrance'], |
|
262
|
|
|
'hbda': ['acceptor_count', 'donor_count', |
|
263
|
|
|
'acceptor_site_count', 'donor_site_count'], |
|
264
|
|
|
'molpol': ['a_mol'], |
|
265
|
|
|
'atomicpolarizability': ['a_atom'], |
|
266
|
|
|
'msdon': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', |
|
267
|
|
|
'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00', |
|
268
|
|
|
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', |
|
269
|
|
|
'ph=12.00', 'ph=13.00', 'ph=14.00'], |
|
270
|
|
|
'enumerationcount': ['markush_library_size'], |
|
271
|
|
|
'vdwsa': ['van_der_waals_surface_area_3d'], |
|
272
|
|
|
'orbitalelectronegativity': [ |
|
273
|
|
|
'sigma_orbital_electronegativity', |
|
274
|
|
|
'pi_orbital_electronegativity'], |
|
275
|
|
|
'hmoelectrophiliclocalizationenergy': [ |
|
276
|
|
|
'hmo_localization_energy_l+'], |
|
277
|
|
|
'smallestringsize': ['smallest_ring_size'], |
|
278
|
|
|
'szegedindex': ['szeged_index'], |
|
279
|
|
|
'nucleophilicityorder': ['aromatic_nu-_order'], |
|
280
|
|
|
'canonicaltautomer': ['canonical_tautomer'], |
|
281
|
|
|
'stereoisomercount': ['stereoisomer_count'], |
|
282
|
|
|
'msa': ['van_der_waals_surface_area_3d'], |
|
283
|
|
|
'donsitecount': ['donor_site_count'], |
|
284
|
|
|
'randommarkushenumerations': [ |
|
285
|
|
|
'randommarkushenumerations'], |
|
286
|
|
|
'wienerindex': ['wiener_index'], |
|
287
|
|
|
'huckelorbitals': ['orbitals'], |
|
288
|
|
|
'doublebondstereoisomercount': [ |
|
289
|
|
|
'double_bond_stereoisomer_count'], |
|
290
|
|
|
'tautomers': ['tautomers'], |
|
291
|
|
|
'polarsurfacearea': ['polar_surface_area'], |
|
292
|
|
|
'chiralcentercount': ['chiral_center_count'], |
|
293
|
|
|
'electrophiliclocalizationenergy': [ |
|
294
|
|
|
'localization_energy_l+'], |
|
295
|
|
|
'aliphaticatom': ['aliphatic_atom'], |
|
296
|
|
|
'ringbondcount': ['ring_bond_count'], |
|
297
|
|
|
'wienerpolarity': ['wiener_polarity'], |
|
298
|
|
|
'msacc': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', |
|
299
|
|
|
'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00', |
|
300
|
|
|
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', |
|
301
|
|
|
'ph=12.00', 'ph=13.00', 'ph=14.00'], |
|
302
|
|
|
'formalcharge': ['formal_charge'], |
|
303
|
|
|
'smallestringsystemsize': ['smallest_ring_system_size'], |
|
304
|
|
|
'majorms': ['major-ms'], |
|
305
|
|
|
'tholepolarizability': ['a_avg', 'a_xx', 'a_yy', 'a_zz'], |
|
306
|
|
|
'aromaticatom': ['aromatic_atom'], |
|
307
|
|
|
'oen': ['sigma_orbital_electronegativity', |
|
308
|
|
|
'pi_orbital_electronegativity'], |
|
309
|
|
|
'chainbondcount': ['chain_bond_count'], |
|
310
|
|
|
'logd': ['ph=0.00', 'ph=1.00', 'ph=2.00', 'ph=3.00', |
|
311
|
|
|
'ph=4.00', 'ph=5.00', 'ph=6.00', 'ph=7.00', |
|
312
|
|
|
'ph=8.00', 'ph=9.00', 'ph=10.00', 'ph=11.00', |
|
313
|
|
|
'ph=12.00', 'ph=13.00', 'ph=14.00'], |
|
314
|
|
|
'hmohuckelorbitals': ['hmo_orbitals'], |
|
315
|
|
|
'aromaticringcount': ['aromatic_ring_count'], |
|
316
|
|
|
'pichargedensity': ['pi_charge_density']} |
|
317
|
|
|
|
|
318
|
1 |
|
_optimal_feats = [] # override this |
|
319
|
|
|
|
|
320
|
1 |
|
def __init__(self, features='optimal', verbose=True):
    """ Initialize the featurizer.

    Args:
        features (str or list<str>):
            The feature set to calculate. The value is handed to the
            `features` property setter.
        verbose (bool):
            Forwarded to the parent initializer.
    """
    # create the backing attribute before the parent initializer runs
    self._features = None
    super(ChemAxonBaseFeaturizer, self).__init__(verbose=verbose)
    # assigned last, through the property setter
    self.features = features
|
324
|
|
|
|
|
325
|
1 |
|
@property
def features(self):
    """ list<str>: The names of the calculations passed to `cxcalc`. """
    return self._features


@features.setter
def features(self, features):
    """ Select the calculations to run.

    Args:
        features (str or list<str> or tuple<str>):
            'optimal' or 'all' (both currently select `_optimal_feats`), a
            single calculation name, or a sequence of calculation names.
            Every name has to be a key of `_feat_columns`.

    Raises:
        NotImplementedError:
            If a name is not a key of `_feat_columns`, or if `features` is
            neither a string, a list nor a tuple.
    """
    if features in ('optimal', 'all'):
        # store a copy: keeping a reference to the class-level list would let
        # a change to one instance's features leak into the class and from
        # there into every other instance
        self._features = list(self._optimal_feats)
    elif isinstance(features, str):
        # re-enter the setter so that the single name is validated as well
        self.features = [features]
    elif isinstance(features, (list, tuple)):
        valid = np.array([feat in self._feat_columns for feat in features])
        if not all(valid):
            msg = 'Descriptor "{}" not available.'.format(
                np.array(features)[~valid])
            raise NotImplementedError(msg)
        self._features = list(features)
    else:
        raise NotImplementedError('Feature set {} not available.'.format(
            features))
|
347
|
|
|
|
|
348
|
1 |
|
def _feature_index(self): |
|
349
|
|
|
return pd.Index(sum((self._feat_columns[feat] |
|
350
|
|
|
for feat in self.features), []), |
|
351
|
|
|
name='features') |
|
352
|
|
|
|
|
353
|
1 |
|
@staticmethod
def validate_install():
    """ Check whether `cxcalc` can be run.

    Returns:
        bool:
            True if running `cxcalc` without arguments exits with status 0,
            False if it exits with another status or cannot be started.
    """
    try:
        status = subprocess.call(['cxcalc'],
                                 stderr=subprocess.DEVNULL,
                                 stdout=subprocess.DEVNULL)
    except OSError:
        # FileNotFoundError (not on the PATH) is a subclass of OSError, and
        # so are the other ways a launch can fail, e.g. PermissionError for
        # a file that is present but not executable
        return False
    return status == 0
|
361
|
|
|
|
|
362
|
1 |
|
def monitor_progress(self, filename):
    """ Report the progress of a running calculation.

    Args:
        filename (str):
            The output file being written.

    Returns:
        int:
            The line count of the file minus one, or 0 while the line count
            is still falsy. Presumably the discounted line is the header
            row -- verify against the cxcalc output format.
    """
    n_lines = line_count(filename)
    if not n_lines:
        return 0
    return n_lines - 1
|
365
|
|
|
|
|
366
|
1 |
|
def _cli_args(self, infile, outfile): |
|
367
|
|
|
return ['cxcalc', infile, '-o', outfile] + self.features |
|
368
|
|
|
|
|
369
|
1 |
|
def _parse_outfile(self, outfile): |
|
370
|
|
|
res = pd.read_table(outfile, engine='python').drop('id', axis=1) |
|
371
|
|
|
return res |
|
372
|
|
|
|
|
373
|
1 |
|
def _parse_errors(self, errs):
    """ Log the captured stderr output.

    Args:
        errs:
            The stderr output to log at debug level.

    Returns:
        list:
            Always empty.
    """
    LOGGER.debug('stderr: %s', errs)
    # instances are not skipped ever, so there is nothing to report
    skipped = []
    return skipped
|
376
|
|
|
|
|
377
|
|
|
|
|
378
|
1 |
|
class ChemAxonFeaturizer(ChemAxonBaseFeaturizer, BatchTransformer, |
|
|
|
|
|
|
379
|
|
|
Transformer): |
|
380
|
|
|
|
|
381
|
1 |
|
_optimal_feats = ['acceptorcount', 'accsitecount', 'aliphaticatomcount', |
|
382
|
|
|
'aliphaticbondcount', 'aliphaticringcount', |
|
383
|
|
|
'aromaticatomcount', 'aromaticbondcount', |
|
384
|
|
|
'aromaticringcount', 'asymmetricatomcount', |
|
385
|
|
|
'averagemolecularpolarizability', 'axxpol', 'ayypol', |
|
386
|
|
|
'azzpol', 'balabanindex', 'bondcount', |
|
387
|
|
|
'carboaliphaticringcount', 'carboaromaticringcount', |
|
388
|
|
|
'carboringcount', 'chainatomcount', |
|
389
|
|
|
'chainbondcount', 'chiralcentercount', 'connectedgraph', |
|
390
|
|
|
'cyclomaticnumber', 'dipole', |
|
391
|
|
|
'donorcount', 'donorsitecount', |
|
392
|
|
|
'doublebondstereoisomercount', 'dreidingenergy', |
|
393
|
|
|
'formalcharge', |
|
394
|
|
|
'fragmentcount', 'fsp3', 'fusedaliphaticringcount', |
|
395
|
|
|
'fusedaromaticringcount', 'fusedringcount', |
|
396
|
|
|
'hararyindex', 'heteroaliphaticringcount', |
|
397
|
|
|
'heteroaromaticringcount', 'heteroringcount', 'hlb', |
|
398
|
|
|
'hmopienergy', 'hyperwienerindex', 'largestringsize', |
|
399
|
|
|
'largestringsystemsize', |
|
400
|
|
|
'markushenumerationcount', 'maximalprojectionarea', |
|
401
|
|
|
'maximalprojectionradius', |
|
402
|
|
|
'maximalprojectionsize', 'minimalprojectionarea', |
|
403
|
|
|
'minimalprojectionradius', |
|
404
|
|
|
'minimalprojectionsize', 'mmff94energy', 'molpol', |
|
405
|
|
|
'pienergy', 'plattindex', 'psa', 'randicindex', |
|
406
|
|
|
'refractivity', 'resonantcount', 'ringatomcount', |
|
407
|
|
|
'ringbondcount', 'ringcount', 'ringsystemcount', |
|
408
|
|
|
'rotatablebondcount', 'smallestringsize', |
|
409
|
|
|
'smallestringsystemsize', |
|
410
|
|
|
'stereodoublebondcount', 'stereoisomercount', |
|
411
|
|
|
'szegedindex', 'tetrahedralstereoisomercount', |
|
412
|
|
|
'vdwsa', 'volume', 'wateraccessiblesurfacearea', |
|
413
|
|
|
'wienerindex', 'wienerpolarity'] |
|
414
|
|
|
|
|
415
|
1 |
|
@property
def name(self):
    """ str: The fixed identifier of this featurizer, 'cx_mol'. """
    return 'cx_mol'
|
418
|
|
|
|
|
419
|
1 |
|
@property
def columns(self):
    """ The result of `_feature_index()` for the selected features. """
    index = self._feature_index()
    return index
|
422
|
|
|
|
|
423
|
1 |
|
def _parse_outfile(self, outfile):
    """ Parse the output file into a frame of floats.

    The table returned by the parent class is converted cell by cell:
    strings containing 'FAILED' become NaN, everything else is passed
    through `float`.

    Args:
        outfile (str):
            The file to parse.

    Returns:
        pd.DataFrame:
            The converted table.
    """
    parsed = super(ChemAxonFeaturizer, self)._parse_outfile(outfile)

    def fix_failed(inp):
        """ Convert one cell to a float, or NaN if it is marked FAILED. """
        failed = isinstance(inp, str) and 'FAILED' in inp
        return np.nan if failed else float(inp)

    return parsed.applymap(fix_failed)
|
433
|
|
|
|
|
434
|
|
|
|
|
435
|
1 |
|
class ChemAxonAtomFeaturizer(ChemAxonBaseFeaturizer, AtomTransformer, |
|
|
|
|
|
|
436
|
|
|
BatchTransformer): |
|
437
|
1 |
|
_optimal_feats = ['acceptormultiplicity', 'aliphaticatom', 'aromaticatom', |
|
438
|
|
|
'aromaticelectrophilicityorder', 'asymmetricatom', |
|
439
|
|
|
'atomicpolarizability', 'chainatom', 'chargedensity', |
|
440
|
|
|
'chiralcenter', 'distancedegree', 'donormultiplicity', |
|
441
|
|
|
'eccentricity', 'electrondensity', |
|
442
|
|
|
'electrophiliclocalizationenergy', 'hindrance', |
|
443
|
|
|
'hmochargedensity', 'hmoelectrondensity', |
|
444
|
|
|
'hmoelectrophilicityorder', |
|
445
|
|
|
'hmoelectrophiliclocalizationenergy', |
|
446
|
|
|
'hmonucleophilicityorder', |
|
447
|
|
|
'hmonucleophiliclocalizationenergy', 'ioncharge', |
|
448
|
|
|
'largestatomringsize', 'nucleophilicityorder', |
|
449
|
|
|
'nucleophiliclocalizationenergy', 'oen', |
|
450
|
|
|
'pichargedensity', 'ringatom', 'ringcountofatom', |
|
451
|
|
|
'stericeffectindex', 'totalchargedensity'] |
|
452
|
|
|
|
|
453
|
1 |
|
_h_inc_feats = ['acc', 'atomicpolarizability', 'charge', 'distancedegree', |
|
454
|
|
|
'don', 'eccentricity', 'hindrance', 'largestatomringsize', |
|
455
|
|
|
'oen', 'ringcountofatom', 'smallestatomringsize', |
|
456
|
|
|
'stericeffectindex'] |
|
457
|
|
|
|
|
458
|
1 |
|
@property
def name(self):
    """ str: The fixed identifier of this featurizer, 'cx_atom'. """
    return 'cx_atom'
|
461
|
|
|
|
|
462
|
1 |
|
@property
def minor_axis(self):
    """ The result of `_feature_index()` for the selected features. """
    index = self._feature_index()
    return index
|
465
|
|
|
|
|
466
|
1 |
|
def _transform_atom(self, atom): |
|
467
|
|
|
raise NotImplementedError('Cannot calculate per atom with ChemAxon') |
|
468
|
|
|
|
|
469
|
1 |
|
def _parse_outfile(self, outfile):
    """ Parse the output file into per-atom values.

    Every cell of the table returned by the parent class is split on ';'
    and each piece is converted to a number. The values are written into a
    NaN-filled array of length `max_atoms`.

    Args:
        outfile (str):
            The file to parse.

    Returns:
        pd.Panel:
            The padded arrays, with axes 1 and 2 swapped.
    """
    table = super(ChemAxonAtomFeaturizer, self)._parse_outfile(outfile)

    # tokens with a fixed meaning; anything else is tried as a float
    special = {'': np.nan, 'false': 0, 'true': 1}

    def parse_string(s):
        """ Convert one token to a number, or NaN if it is not numeric. """
        if s in special:
            return special[s]
        try:
            return float(s)
        except ValueError:
            return np.nan

    def to_padded(s):
        """ Convert one cell to a NaN-padded array of length `max_atoms`. """
        padded = np.full(self.max_atoms, np.nan)
        values = np.array([parse_string(tok) for tok in str(s).split(';')])
        padded[:len(values)] = values
        return padded

    table = table.applymap(to_padded)
    return pd.Panel(table.values.tolist()).swapaxes(1, 2)
|
493
|
|
|
|
|
494
|
|
|
|
|
495
|
1 |
|
class ChemAxonNMRPredictor(ChemAxonBaseFeaturizer, BatchTransformer, |
|
|
|
|
|
|
496
|
|
|
AtomTransformer): |
|
497
|
|
|
|
|
498
|
1 |
|
_feat_columns = {'cnmr': ['cnmr'], 'hnmr': ['hnmr']} |
|
499
|
1 |
|
_optimal_feats = ['cnmr'] |
|
500
|
|
|
|
|
501
|
1 |
|
@property
def name(self):
    """ str: The fixed identifier of this predictor, 'cx_nmr'. """
    return 'cx_nmr'
|
504
|
|
|
|
|
505
|
1 |
|
def _transform_atom(self, atom): |
|
506
|
|
|
raise NotImplementedError('ChemAxon cannot predict for atoms.') |
|
507
|
|
|
|
|
508
|
1 |
|
def monitor_progress(self, filename):
    """ Count the peak assignment sections written to the output so far.

    Args:
        filename (str):
            The output file being written.

    Returns:
        int:
            The number of lines that are exactly
            '##PEAKASSIGNMENTS=(XYMA)' followed by CRLF.
    """
    marker = b'##PEAKASSIGNMENTS=(XYMA)\r\n'
    # the context manager closes the handle; a bare `open` inside the
    # generator left that to the garbage collector
    with open(filename, 'rb') as f:
        return sum(1 for line in f if line == marker)
|
511
|
|
|
|
|
512
|
1 |
|
@property
def minor_axis(self):
    """ pd.Index: The selected features, as an index named 'shift'. """
    selected = self.features
    return pd.Index(selected, name='shift')
|
515
|
|
|
|
|
516
|
1 |
|
@property
def features(self):
    """ list<str>: The NMR calculation to run, ['cnmr'] or ['hnmr']. """
    return self._features


@features.setter
def features(self, val):
    """ Select the nucleus to predict shifts for.

    Args:
        val (str):
            'c' for the 'cnmr' calculation, 'h' for the 'hnmr' calculation,
            or 'optimal' for the class default in `_optimal_feats`.

    Raises:
        NotImplementedError:
            For any other value.
    """
    if val == 'c':
        self._features = ['cnmr']
    elif val == 'h':
        self._features = ['hnmr']
    elif val == 'optimal':
        # 'optimal' is the default of the inherited `__init__`; rejecting it
        # made it impossible to construct the predictor without arguments
        self._features = list(self._optimal_feats)
    else:
        raise NotImplementedError('Feature {} not implemented'.format(val))
|
528
|
|
|
|
|
529
|
1 |
|
def _parse_outfile(self, outfile):
    """ Parse the predicted shifts from the output file.

    The file is scanned for '##PEAKASSIGNMENTS=(XYMA)' lines. The rows that
    follow, up to the next '##END=' line, are read as
    '(shift,<int>,<letter>,<idx,idx,...>)', and the shift is stored for each
    listed atom index of the current molecule.

    Args:
        outfile (str):
            The file to parse.

    Returns:
        pd.Panel:
            Wraps an array of shape (n_mols, max_atoms, 1). Entries that
            are never assigned keep the fill value of `nanarray`.

    Raises:
        ValueError:
            If a row inside a peak assignment section cannot be parsed.
    """
    n_mols = self.monitor_progress(outfile)
    res = nanarray((n_mols, self.max_atoms, 1))
    # Raw bytes literal, so the backslashes reach the regex engine untouched.
    # The decimal point is matched literally and is optional, so integer
    # shifts parse as well.
    regex = re.compile(br'\((-?\d+(?:\.\d+)?),\d+,[A-Z],<([0-9,]+)>\)\r\n')

    mol_idx = 0

    with open(outfile, 'rb') as f:
        # loop through the file - inner loop will also advance the pointer
        for l in f:
            if l == b'##PEAKASSIGNMENTS=(XYMA)\r\n':
                for row in f:
                    if row == b'##END=\r\n':
                        break
                    LOGGER.debug('Row to parse: %s', row)
                    match = regex.match(row)
                    if match is None:
                        # name the offending row instead of failing with an
                        # AttributeError on `None.groups()`
                        raise ValueError(
                            'Could not parse peak assignment row: '
                            '{!r}'.format(row))
                    shift, idxs = match.groups()
                    shift = float(shift)
                    for atom_idx in (int(idx) for idx in idxs.split(b',')):
                        res[mol_idx, atom_idx] = shift
                mol_idx += 1
    # NOTE(review): pd.Panel was removed in pandas 0.25. It is kept because
    # replacing it would change the return type callers receive.
    return pd.Panel(res)
|
553
|
|
|
|
|
554
|
1 |
|
def transform(self, inp):
    """ Transform the input with the parent class and return the result's
    `.T` attribute.

    Args:
        inp:
            Passed unchanged to the parent `transform`.
    """
    result = super(ChemAxonNMRPredictor, self).transform(inp)
    return result.T
|
556
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing
`__init__.py` files in your module folders. Make sure that you place one file in each sub-folder.