1
|
|
|
#! /usr/bin/env python |
|
|
|
|
2
|
|
|
# |
3
|
|
|
# Copyright (C) 2015-2016 Rich Lewis <[email protected]> |
4
|
|
|
# License: 3-clause BSD |
5
|
|
|
|
6
|
1 |
|
""" |
7
|
|
|
# skchem.io.sdf |
8
|
|
|
|
9
|
|
|
Defining input and output operations for sdf files. |
10
|
|
|
""" |
11
|
|
|
|
12
|
1 |
|
from functools import wraps |
13
|
1 |
|
import warnings |
14
|
|
|
|
15
|
1 |
|
from rdkit import Chem |
|
|
|
|
16
|
1 |
|
import pandas as pd |
|
|
|
|
17
|
|
|
|
18
|
1 |
|
from ..core import Mol |
19
|
1 |
|
from ..utils import Suppressor, squeeze |
20
|
|
|
|
21
|
|
|
|
22
|
1 |
|
def _drop_props(row):
    """Clear every property stored on the molecule in ``row.structure``.

    Args:
        row: An object exposing the molecule as ``row.structure``; the
            molecule must provide ``props.keys()`` and ``ClearProp(name)``.
    """
    mol = row.structure
    # Take a snapshot of the names before clearing anything: removing
    # entries while iterating over a live view of them can skip items or
    # raise, depending on how the view is implemented.
    for prop in list(mol.props.keys()):
        mol.ClearProp(prop)
25
|
|
|
|
26
|
|
|
|
27
|
1 |
|
def _set_props(row, cols):
    """Copy the values of the given columns onto the row's molecule as props.

    Args:
        row: A row exposing the molecule as ``row.structure`` and the column
            values through ``row[col]``.
        cols: The column labels to copy.
    """
    for col in cols:
        # both the key and the value are stringified before being stored
        key, value = str(col), str(row[col])
        row.structure.SetProp(key, value)
30
|
|
|
|
31
|
|
|
|
32
|
1 |
|
def _set_name(row):
    """Name the row's molecule after the row label (``row.name``)."""
    # rdkit props can only be strs
    label = str(row.name)
    row.structure.name = label
34
|
|
|
|
35
|
|
|
|
36
|
1 |
|
def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a `pd.DataFrame`.

    The function wraps the RDKit `ForwardSDMolSupplier` object.

    Args:
        sdf (str or file-like):
            The location of data to load as a file path, or a file-like object.
            A path is opened (in binary mode) and closed by this function; a
            file-like object is left open for the caller.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False.
            Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start.
            Default is `None` (skip none).
        skipfooter (int):
            The number of molecules to skip from the end.
            Default is `None` (skip none).
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they
            are extracted to the DataFrame.
            Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure` field
            and indexed by molecule name. The frame is passed through
            `squeeze(data, axis=1)` before being returned.

    Raises:
        ValueError:
            If a molecule could not be decoded and `error_bad_mol` is True.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """

    # nmols is actually the index to cutoff. If we skip some at start, we need
    # to add this number.  With no limit requested (nmols is None) there is
    # nothing to shift, and adding to None would raise a TypeError.
    if skipmols and nmols is not None:
        nmols += skipmols

    # remember whether the handle is ours, so that only handles opened here
    # are closed here
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb')  # use read bytes for python 3 compatibility

    mol_supp = None
    mols = []

    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                if nmols and i >= nmols:
                    break

                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))
    finally:
        # drop the supplier before closing the handle it reads from, so that
        # it is never left holding a closed file
        mol_supp = None
        if opened_here:
            sdf.close()

    if skipfooter:
        mols = mols[:-skipfooter]

    idx = pd.Index([m.name for m in mols], name='batch')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)
        # now we have extracted the props, we can delete if required
        if not mol_props:
            data.apply(_drop_props, axis=1)

    data.index = idx
    return squeeze(data, axis=1)
133
|
|
|
|
134
|
|
|
|
135
|
1 |
|
def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing
            compounds to serialize.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame.
        args, kwargs:
            Arguments will be passed to RDKit SDWriter.

    Warn:
        This function modifies the molecules in `data`: all properties in
        the molecule dictionary are deleted if `mol_props` is `False`, and the
        column values are set as properties if `write_cols` is `True`.
        If `index_as_name` is `True` the molecules are renamed to their index
        value while writing; the names they had on entry are put back before
        the function returns.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # remember the current names so they can be restored after writing
    names = [m.name for m in data.structure]

    writer = Chem.SDWriter(sdf, *args, **kwargs)

    try:
        cols = list(data.columns.drop('structure'))

        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        for mol in data.structure:
            writer.write(mol)
    finally:
        try:
            # rdkit writer changes names sometimes
            for mol, name in zip(data.structure, names):
                mol.name = name
        finally:
            # close the writer so that buffered output is flushed, even if
            # writing failed part way through
            writer.close()
184
|
|
|
|
185
|
|
|
|
186
|
1 |
|
@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Method form of write_sdf for a Series: the Series itself is the data and
    # write_cols is always forced off (supplying it again is a TypeError).
    return write_sdf(self, *args, write_cols=False, **kwargs)
190
|
|
|
|
191
|
|
|
|
192
|
1 |
|
@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # Method form of write_sdf for a DataFrame: the frame itself is the data
    # and every other argument is forwarded untouched.
    outcome = write_sdf(self, *args, **kwargs)
    return outcome
196
|
|
|
|
197
|
1 |
|
# Attach the writers to the pandas containers as `to_sdf` methods.
pd.DataFrame.to_sdf = _to_sdf_df
pd.Series.to_sdf = _to_sdf_series
199
|
|
|
|
200
|
|
|
|
201
|
1 |
|
@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    # Alternate-constructor form of read_sdf; the class argument is unused and
    # everything else is forwarded untouched.
    loaded = read_sdf(*args, **kwargs)
    return loaded


# Attach the reader to DataFrame as the `from_sdf` classmethod.
pd.DataFrame.from_sdf = _from_sdf_df
208
|
|
|
|
209
|
|
|
|
210
|
1 |
|
@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    # Alternate-constructor form of read_sdf that yields only the structures;
    # the class argument is unused and everything else is forwarded untouched.
    loaded = read_sdf(*args, **kwargs)
    # NOTE(review): read_sdf returns `squeeze(data, axis=1)`, which presumably
    # already gives a Series when `structure` is the only column - confirm
    # against skchem.utils.squeeze.  In that case there is no `structure`
    # column to select, so hand the Series back as it is.
    if isinstance(loaded, pd.Series):
        return loaded
    return loaded.structure


# Attach the reader to Series as the `from_sdf` classmethod.  This has to be a
# module-level statement (not part of the function body above, after its
# return) for the registration to run on import.
pd.Series.from_sdf = _from_sdf_series
217
|
|
|
|
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.