1
|
|
|
#! /usr/bin/env python |
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
|
|
""" |
7
|
|
|
## skchem.descriptors.atom |
8
|
|
|
|
9
|
|
|
Module specifying atom based descriptor generators. |
10
|
|
|
""" |
11
|
|
|
|
12
|
|
|
import logging |
13
|
|
|
import subprocess |
14
|
|
|
import tempfile |
15
|
|
|
|
16
|
|
|
import pandas as pd |
|
|
|
|
17
|
|
|
import numpy as np |
|
|
|
|
18
|
|
|
|
19
|
|
|
LOGGER = logging.getLogger(__file__) |
20
|
|
|
from ..io import write_sdf, read_sdf |
|
|
|
|
21
|
|
|
from .. import core |
22
|
|
|
|
23
|
|
|
# Names of the per-atom descriptors requested from ChemAxon, in the order
# they are listed in this module.
atom_feats = [
    'acceptormultiplicity',
    'aliphaticatom',
    'aromaticatom',
    'aromaticelectrophilicityorder',
    'asymmetricatom',
    'atomicpolarizability',
    'chainatom',
    'chargedensity',
    'chiralcenter',
    'distancedegree',
    'donormultiplicity',
    'eccentricity',
    'electrondensity',
    'electrophiliclocalizationenergy',
    'hindrance',
    'hmochargedensity',
    'hmoelectrondensity',
    'hmoelectrophilicityorder',
    'hmoelectrophiliclocalizationenergy',
    'hmonucleophilicityorder',
    'hmonucleophiliclocalizationenergy',
    'ioncharge',
    'largestatomringsize',
    'nucleophilicityorder',
    'nucleophiliclocalizationenergy',
    'oen',
    'pichargedensity',
    'ringatom',
    'ringcountofatom',
    'stericeffectindex',
    'totalchargedensity',
]
30
|
|
|
|
31
|
|
|
# TODO: fix averagemicrospeciescharge |
|
|
|
|
32
|
|
|
# TODO: fix logd logp logs |
|
|
|
|
33
|
|
|
# TODO: oen (orbital electronegativity) - sigma + pi |
|
|
|
|
34
|
|
|
# TODO: water accessible surface area |
|
|
|
|
35
|
|
|
|
36
|
|
|
class ChemAxonFeatureCalculator(object):
    """Calculate molecule-level descriptors by shelling out to ``cxcalc``.

    The selected descriptor names are stored on ``self.index`` and are
    appended verbatim to the ``cxcalc`` command line.

    Args:
        feat_set (str or list<str> or tuple<str>):
            The descriptors to calculate.
            - 'all' for every name in ``_all_feats``
            - 'optimal' for the names in ``_optimal_feats``
            - a single identifier from ``_all_feats`` as a `str`
            - a list or tuple of identifiers from ``_all_feats``

    Raises:
        NotImplementedError: if ``feat_set`` (or any member of it) is not a
            known descriptor name.
    """

    _optimal_feats = ['acceptorcount', 'accsitecount', 'aliphaticatomcount', 'aliphaticbondcount',
                      'aliphaticringcount', 'aromaticatomcount', 'aromaticbondcount', 'aromaticringcount',
                      'asymmetricatomcount', 'averagemolecularpolarizability', 'axxpol', 'ayypol', 'azzpol',
                      'balabanindex', 'bondcount', 'carboaliphaticringcount', 'carboaromaticringcount',
                      'carboringcount', 'chainatomcount', 'chainbondcount', 'charge', 'chiralcentercount',
                      'connectedgraph', 'cyclomaticnumber', 'dipole', 'donorcount', 'donorsitecount',
                      'doublebondstereoisomercount', 'dreidingenergy', 'formalcharge', 'fragmentcount', 'fsp3',
                      'fusedaliphaticringcount', 'fusedaromaticringcount', 'fusedringcount', 'hararyindex',
                      'heteroaliphaticringcount', 'heteroaromaticringcount', 'heteroringcount', 'hlb',
                      'hmopienergy', 'hyperwienerindex', 'largestringsize', 'largestringsystemsize',
                      'markushenumerationcount', 'maximalprojectionarea', 'maximalprojectionradius',
                      'maximalprojectionsize', 'minimalprojectionarea', 'minimalprojectionradius',
                      'minimalprojectionsize', 'mmff94energy', 'molpol', 'pienergy', 'plattindex', 'psa',
                      'randicindex', 'refractivity', 'resonantcount', 'ringatomcount', 'ringbondcount',
                      'ringcount', 'ringsystemcount', 'rotatablebondcount', 'smallestatomringsize',
                      'smallestringsize', 'smallestringsystemsize', 'stereodoublebondcount',
                      'stereoisomercount', 'szegedindex', 'tetrahedralstereoisomercount', 'vdwsa', 'volume',
                      'wateraccessiblesurfacearea', 'wienerindex', 'wienerpolarity']

    _all_feats = ['acc', 'acceptor', 'acceptorcount', 'acceptormultiplicity', 'acceptorsitecount',
                  'acceptortable', 'accsitecount', 'aliphaticatom', 'aliphaticatomcount', 'aliphaticbondcount',
                  'aliphaticringcount', 'aliphaticringcountofsize', 'aromaticatom', 'aromaticatomcount',
                  'aromaticbondcount', 'aromaticelectrophilicityorder', 'aromaticnucleophilicityorder',
                  'aromaticringcount', 'aromaticringcountofsize', 'asa', 'asymmetricatom',
                  'asymmetricatomcount', 'asymmetricatoms', 'atomicpolarizability', 'atompol',
                  'averagemicrospeciescharge', 'averagemolecularpolarizability', 'averagepol', 'avgpol',
                  'axxpol', 'ayypol', 'azzpol', 'balabanindex', 'bondcount', 'canonicalresonant',
                  'canonicaltautomer', 'carboaliphaticringcount', 'carboaromaticringcount', 'carboringcount',
                  'chainatom', 'chainatomcount', 'chainbondcount', 'charge', 'chargedensity',
                  'chargedistribution', 'chiralcenter', 'chiralcentercount', 'chiralcenters',
                  'connectedgraph', 'cyclomaticnumber', 'dipole', 'distancedegree', 'don', 'donor',
                  'donorcount', 'donormultiplicity', 'donorsitecount', 'donortable', 'donsitecount',
                  'doublebondstereoisomercount', 'dreidingenergy', 'eccentricity', 'electrondensity',
                  'electrophilicityorder', 'electrophiliclocalizationenergy', 'enumerationcount',
                  'enumerations', 'formalcharge', 'fragmentcount', 'fsp3', 'fusedaliphaticringcount',
                  'fusedaromaticringcount', 'fusedringcount', 'generictautomer', 'hararyindex',
                  'hasvalidconformer', 'hbda', 'hbonddonoracceptor', 'heteroaliphaticringcount',
                  'heteroaromaticringcount', 'heteroringcount', 'hindrance', 'hlb', 'hmochargedensity',
                  'hmoelectrondensity', 'hmoelectrophilicityorder', 'hmoelectrophiliclocalizationenergy',
                  'hmohuckel', 'hmohuckeleigenvalue', 'hmohuckeleigenvector', 'hmohuckelorbitals',
                  'hmohuckeltable', 'hmolocalizationenergy', 'hmonucleophilicityorder',
                  'hmonucleophiliclocalizationenergy', 'hmopienergy', 'huckel', 'huckeleigenvalue',
                  'huckeleigenvector', 'huckelorbitals', 'huckeltable', 'hyperwienerindex', 'ioncharge',
                  'isoelectricpoint', 'largestatomringsize', 'largestringsize', 'largestringsystemsize',
                  'localizationenergy', 'logd', 'logdcalculator', 'logp', 'logpcalculator', 'logs',
                  'majormicrospecies', 'majorms', 'majorms2', 'majortautomer', 'markushenumerationcount',
                  'markushenumerations', 'maximalprojectionarea', 'maximalprojectionradius',
                  'maximalprojectionsize', 'minimalprojectionarea', 'minimalprojectionradius',
                  'minimalprojectionsize', 'mmff94energy', 'molecularpolarizability', 'molecularsurfacearea',
                  'molpol', 'moststabletautomer', 'msa', 'msacc', 'msdon', 'name', 'nucleophilicityorder',
                  'nucleophiliclocalizationenergy', 'oen', 'orbitalelectronegativity', 'pi',
                  'pichargedensity', 'pienergy', 'pka', 'pkacalculator', 'pkat', 'plattindex', 'pol',
                  'polarizability', 'polarsurfacearea', 'psa', 'randicindex', 'randommarkushenumerations',
                  'refractivity', 'resonantcount', 'resonants', 'ringatom', 'ringatomcount', 'ringbondcount',
                  'ringcount', 'ringcountofatom', 'ringcountofsize', 'ringsystemcount',
                  'ringsystemcountofsize', 'rotatablebondcount', 'smallestatomringsize', 'smallestringsize',
                  'smallestringsystemsize', 'stereodoublebondcount', 'stereoisomercount',
                  'stericeffectindex', 'sterichindrance', 'szegedindex', 'tautomercount', 'tautomers',
                  'tetrahedralstereoisomercount', 'tholepolarizability', 'topanal', 'topologyanalysistable',
                  'totalchargedensity', 'tpol', 'tpolarizability', 'vdwsa', 'volume',
                  'wateraccessiblesurfacearea', 'wienerindex', 'wienerpolarity']

    def __init__(self, feat_set='all'):
        # ``self.index`` is always a fresh list: it is concatenated onto the
        # command-line argument list later, which would fail for a tuple, and
        # copying keeps callers from mutating the class-level lists.
        if feat_set == 'all':
            self.index = list(self._all_feats)
        elif feat_set == 'optimal':
            self.index = list(self._optimal_feats)
        elif feat_set in self._all_feats:
            self.index = [feat_set]
        elif isinstance(feat_set, (list, tuple)):
            valid = np.array([feat in self._all_feats for feat in feat_set])
            if not all(valid):
                raise NotImplementedError(
                    'Descriptor \'{}\' not available.'.format(np.array(feat_set)[~valid]))
            self.index = list(feat_set)
        else:
            raise NotImplementedError('Feature set {} not available.'.format(feat_set))

    def transform(self, obj):
        """Calculate the selected descriptors for ``obj``.

        Args:
            obj: a ``core.Mol``, a ``pd.Series`` of molecules, a
                ``pd.DataFrame`` with a ``structure`` column, or a list/tuple
                of molecules.

        Returns:
            The first row of the result table for a single ``core.Mol``;
            otherwise the table returned by ``_transform_series``.

        Raises:
            NotImplementedError: if ``obj`` is of an unsupported type.
        """
        if isinstance(obj, core.Mol):
            # Wrap in a list so the molecule is always one element of the
            # series, however pandas would treat the bare object.
            return self._transform_series(pd.Series([obj], dtype=object)).iloc[0]
        elif isinstance(obj, pd.Series):
            return self._transform_series(obj)
        elif isinstance(obj, pd.DataFrame):
            return self._transform_series(obj.structure)
        elif isinstance(obj, (tuple, list)):
            # A plain sequence has no pandas index (``list.index`` is a
            # method), so promote it to a Series first.
            return self._transform_series(pd.Series(list(obj), dtype=object))
        else:
            raise NotImplementedError

    def _transform_series(self, series):
        """Run ``cxcalc`` on a series of molecules and read back its table.

        The molecules are written to a temporary SDF file with ``write_sdf``,
        ``cxcalc`` is invoked with the selected descriptor names, and the
        tab-separated output is loaded with pandas.

        Args:
            series (pd.Series): the molecules to process.

        Returns:
            pd.DataFrame: the ``cxcalc`` output, re-indexed with
            ``series.index``.

        Raises:
            RuntimeError: if the ``cxcalc`` output cannot be read.
        """
        with tempfile.NamedTemporaryFile(suffix='.sdf') as in_file, \
                tempfile.NamedTemporaryFile() as out_file:
            # write mols to file
            write_sdf(series, in_file.name)
            args = ['cxcalc', in_file.name, '-o', out_file.name] + list(self.index)

            LOGGER.info('Running: %s', ' '.join(args))

            # call command line
            return_code = subprocess.call(args)
            if return_code != 0:
                LOGGER.warning('cxcalc exited with status %s', return_code)

            try:
                finished = pd.read_table(out_file.name).set_index('id')
            except Exception as err:
                # Previously the failure was swallowed and resurfaced as an
                # AttributeError on ``None``; report the real cause instead.
                raise RuntimeError(
                    'Could not read cxcalc output (exit status {}): {}'.format(return_code, err))

        finished.index = series.index
        return finished
139
|
|
|
|
140
|
|
|
class ChemAxonAtomFeatureCalculator(object):
    """Calculate per-atom descriptors by shelling out to ``cxcalc``.

    Each descriptor value is read as a ``;``-separated string with one entry
    per atom, and is padded with NaN up to ``max_atoms``.
    """

    _all_feats = ['acceptormultiplicity', 'aliphaticatom', 'aromaticatom', 'aromaticelectrophilicityorder',
                  'asymmetricatom', 'atomicpolarizability', 'chainatom', 'chargedensity', 'chiralcenter',
                  'distancedegree', 'donormultiplicity', 'eccentricity', 'electrondensity',
                  'electrophiliclocalizationenergy', 'hindrance', 'hmochargedensity', 'hmoelectrondensity',
                  'hmoelectrophilicityorder', 'hmoelectrophiliclocalizationenergy', 'hmonucleophilicityorder',
                  'hmonucleophiliclocalizationenergy', 'ioncharge', 'largestatomringsize',
                  'nucleophilicityorder', 'nucleophiliclocalizationenergy', 'oen', 'pichargedensity',
                  'ringatom', 'ringcountofatom', 'stericeffectindex', 'totalchargedensity']

    # NOTE(review): 'acc', 'charge', 'don' and 'smallestatomringsize' are not
    # in ``_all_feats`` above, so they can only be selected via the
    # 'h_inclusive' set, not individually - confirm this is intended.
    _h_inc_feats = ['acc', 'atomicpolarizability', 'charge', 'distancedegree', 'don',
                    'eccentricity', 'hindrance', 'largestatomringsize', 'oen',
                    'ringcountofatom', 'smallestatomringsize', 'stericeffectindex']

    def __init__(self, feat_set='all', include_hs=False, max_atoms=75):
        """
        Args:
            feat_set (str or list<str>):
                The feature sets to calculate.
                - a single identifier as a `str`
                - a list (or tuple) of identifiers
                - 'h_inclusive' (or its alias 'h_inc') for those that also
                  calculate for Hs.
                - 'all' for all

            include_hs (bool):
                Stored on the instance; not otherwise used by this class.

            max_atoms:
                - The maximum number of atoms available.

        Raises:
            NotImplementedError: if ``feat_set`` (or any member of it) is not
                a known descriptor name.
        """
        self.max_atoms = max_atoms
        self.include_hs = include_hs

        # ``self.index`` is always a fresh list because it is concatenated
        # onto the command-line argument list and exposed via feature_names.
        if feat_set in self._all_feats:
            self.index = [feat_set]
        elif feat_set in ('h_inclusive', 'h_inc'):
            self.index = list(self._h_inc_feats)
        elif feat_set == 'all':
            self.index = list(self._all_feats)
        elif isinstance(feat_set, (list, tuple)):
            valid = np.array([feat in self._all_feats for feat in feat_set])
            if not all(valid):
                raise NotImplementedError(
                    'Descriptor \'{}\' not available.'.format(np.array(feat_set)[~valid]))
            # ``feature_names`` is a read-only property backed by ``index``,
            # so only ``index`` is assigned here.
            self.index = list(feat_set)
        else:
            raise NotImplementedError('{} feature set is not available'.format(feat_set))

    @property
    def feature_names(self):
        """The descriptor names that will be requested from ``cxcalc``."""
        return self.index

    def transform(self, obj):
        """Calculate the selected atom descriptors for ``obj``.

        Args:
            obj: a ``core.Mol``, a ``pd.Series`` of molecules, a
                ``pd.DataFrame`` with a ``structure`` column, or a list/tuple
                of molecules.

        Raises:
            NotImplementedError: for a single ``core.Atom`` or any
                unsupported type.
        """
        if isinstance(obj, core.Atom):
            return self._transform_atom(obj)
        elif isinstance(obj, core.Mol):
            return self._transform_mol(obj)
        elif isinstance(obj, pd.Series):
            return self._transform_series(obj)
        elif isinstance(obj, pd.DataFrame):
            return self._transform_series(obj.structure)
        elif isinstance(obj, (tuple, list)):
            # A plain sequence has no pandas index (``list.index`` is a
            # method), so promote it to a Series first.
            return self._transform_series(pd.Series(list(obj), dtype=object))
        else:
            raise NotImplementedError

    def _transform_atom(self, atom):
        """Always raises: descriptors are only calculated per molecule."""
        raise NotImplementedError('Cannot calculate atom wise with Chemaxon')

    def _transform_mol(self, mol):
        """Transform a single molecule by wrapping it in a one-element series."""
        ser = pd.Series([mol], name=mol.name)
        res = self._transform_series(ser)
        return res.iloc[0]

    @staticmethod
    def _parse_token(token):
        """Convert one per-atom entry: '' -> NaN, 'false'/'true' -> 0/1, else float."""
        token = token.strip()
        if token == '':
            return np.nan
        if token == 'false':
            return 0
        if token == 'true':
            return 1
        return float(token)

    def _to_padded(self, value):
        """Expand one table cell into a NaN-padded array of length ``max_atoms``.

        Args:
            value: a ``;``-separated string with one entry per atom, or a
                scalar that pandas has already parsed (treated as a single
                entry).

        Raises:
            ValueError: if the cell holds more than ``max_atoms`` entries.
        """
        padded = np.full(self.max_atoms, np.nan)
        if hasattr(value, 'split'):
            parsed = [self._parse_token(token) for token in value.split(';')]
        else:
            # pandas parses single-entry columns to numbers/bools/NaN.
            parsed = [float(value)]
        if len(parsed) > self.max_atoms:
            raise ValueError('Got {} atom values but max_atoms is {}.'.format(len(parsed), self.max_atoms))
        padded[:len(parsed)] = parsed
        return padded

    def _transform_series(self, series):
        """Run ``cxcalc`` on a series of molecules and collect per-atom values.

        Args:
            series (pd.Series): the molecules to process.

        Returns:
            A ``pd.Panel`` with axes (molecule, atom_idx, cx_atom_desc) when
            the installed pandas provides a working ``Panel``. Otherwise a
            ``pd.DataFrame`` indexed by a (molecule, atom_idx) MultiIndex with
            one column per descriptor.
        """
        with tempfile.NamedTemporaryFile(suffix='.sdf') as in_file, \
                tempfile.NamedTemporaryFile() as out_file:
            # write mols to file
            write_sdf(series, in_file.name)
            args = ['cxcalc', in_file.name, '-o', out_file.name] + list(self.index)

            LOGGER.info('Running: %s', ' '.join(args))

            # call command line
            return_code = subprocess.call(args)
            if return_code != 0:
                LOGGER.warning('cxcalc exited with status %s', return_code)
            finished = pd.read_table(out_file.name).set_index('id')

        desc_names = pd.Index(finished.columns, name='cx_atom_desc')
        atom_idx = pd.Index(range(self.max_atoms), name='atom_idx')

        # One row per molecule, one cell per descriptor; each cell becomes a
        # vector of length max_atoms, giving shape (mol, descriptor, atom).
        padded = [[self._to_padded(cell) for cell in row] for _, row in finished.iterrows()]
        values = np.array(padded, dtype=float).reshape(len(finished), len(desc_names), self.max_atoms)

        # Some pandas releases expose a dummy ``Panel`` placeholder, so check
        # for a real method instead of just the attribute.
        panel_cls = getattr(pd, 'Panel', None)
        if panel_cls is not None and hasattr(panel_cls, 'swapaxes'):
            res = panel_cls(values, items=series.index, major_axis=desc_names, minor_axis=atom_idx)
            return res.swapaxes(1, 2)  # to be consistent with AtomFeatureCalculator

        # Panel is unavailable: return the same data as a 2D frame.
        by_atom = values.swapaxes(1, 2)  # (mol, atom, descriptor)
        rows = pd.MultiIndex.from_product([series.index, atom_idx],
                                          names=[series.index.name, 'atom_idx'])
        flat = by_atom.reshape(len(finished) * self.max_atoms, len(desc_names))
        return pd.DataFrame(flat, index=rows, columns=desc_names)
246
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.