1 | #! /usr/bin/env python |
||
0 ignored issues
–
show
|
|||
2 | # |
||
3 | # Copyright (C) 2015-2016 Rich Lewis <[email protected]> |
||
4 | # License: 3-clause BSD |
||
5 | |||
6 | 1 | """ |
|
7 | # skchem.io.sdf |
||
8 | |||
9 | Defining input and output operations for sdf files. |
||
10 | """ |
||
11 | |||
12 | 1 | from functools import wraps |
|
13 | 1 | import warnings |
|
14 | |||
15 | 1 | from rdkit import Chem |
|
0 ignored issues
–
show
The import
rdkit could not be resolved.
This can be caused by one of the following: 1. Missing Dependencies: This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml
before_commands:
- sudo pip install abc # Python2
- sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use
the command for the correct version.
2. Missing __init__.py files: This error could also result from missing `__init__.py` files in your module folders. |
|||
16 | 1 | import pandas as pd |
|
0 ignored issues
–
show
The import
pandas could not be resolved.
This can be caused by one of the following: 1. Missing Dependencies: This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml
before_commands:
- sudo pip install abc # Python2
- sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use
the command for the correct version.
2. Missing __init__.py files: This error could also result from missing `__init__.py` files in your module folders. |
|||
17 | |||
18 | 1 | from ..core import Mol |
|
19 | 1 | from ..utils import Suppressor, squeeze |
|
20 | |||
21 | |||
22 | 1 | def _drop_props(row): |
|
23 | 1 | for prop in row.structure.props.keys(): |
|
24 | 1 | row.structure.ClearProp(prop) |
|
25 | |||
26 | |||
27 | 1 | def _set_props(row, cols): |
|
28 | for i in cols: |
||
29 | row.structure.SetProp(str(i), str(row[i])) |
||
30 | |||
31 | |||
32 | 1 | def _set_name(row): |
|
33 | row.structure.name = str(row.name) # rdkit props can only be strs |
||
34 | |||
35 | |||
def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a `pd.DataFrame`.

    The function wraps the RDKit `ForwardSDMolSupplier` object.

    Args:
        sdf (str or file-like):
            The location of data to load as a file path, or a file-like object.
            A file opened here from a path is closed before returning; a
            file-like object passed in is left open for the caller.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start.
            Default is `None` (skip nothing).
        skipfooter (int):
            The number of molecules to skip from the end.
            Default is `None` (skip nothing).
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they
            are extracted to the DataFrame.
            Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure` field.
            The result is passed through `squeeze(data, axis=1)` before being
            returned.

    Raises:
        ValueError:
            If a molecule fails to parse and `error_bad_mol` is `True`.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """

    # nmols is actually the index to cutoff. If we skip some at start, we need
    # to add this number. Only do so when a limit was actually requested:
    # adding to the default `None` would raise a TypeError.
    if skipmols and nmols is not None:
        nmols += skipmols

    # remember whether the handle is ours, so that we only close what we opened
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb')  # use read bytes for python 3 compatibility

    mols = []

    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                if nmols and i >= nmols:
                    break

                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))
    finally:
        if opened_here:
            sdf.close()

    if skipfooter:
        mols = mols[:-skipfooter]

    idx = pd.Index([m.name for m in mols], name='batch')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)
        # now we have extracted the props, we can delete if required
        if not mol_props:
            data.apply(_drop_props, axis=1)

    data.index = idx
    return squeeze(data, axis=1)
133 | |||
134 | |||
def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing
            compounds to serialize.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame.
        args, kwargs:
            Arguments will be passed to RDKit SDWriter.

    Warn:
        This function will delete all properties in the molecule dictionary
        if `mol_props` is `False`, and will set the written columns as
        properties on the molecules if `write_cols` is `True`. The names of
        the compounds are temporarily replaced by the index while writing if
        `index_as_name` is `True`, and are restored afterwards.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # remember the current names so they can be put back after writing
    names = [m.name for m in data.structure]

    cols = list(data.columns.drop('structure'))

    writer = Chem.SDWriter(sdf, *args, **kwargs)

    try:
        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        data.structure.apply(writer.write)
    finally:
        # close the writer so that everything written is flushed to the
        # target, even when serializing one of the molecules failed
        writer.close()

        # rdkit writer changes names sometimes
        for mol, name in zip(data.structure, names):
            mol.name = name
184 | |||
185 | |||
@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Method form of `write_sdf` for a series: the series itself is the data
    # to write, and writing columns as props is always switched off.
    return write_sdf(self, *args, write_cols=False, **kwargs)
190 | |||
191 | |||
@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # Method form of `write_sdf` for a data frame: the frame itself is the
    # data to write, everything else is forwarded unchanged.
    outcome = write_sdf(self, *args, **kwargs)
    return outcome


# Attach the writers to the pandas classes as `to_sdf` methods.
pd.DataFrame.to_sdf = _to_sdf_df
pd.Series.to_sdf = _to_sdf_series
199 | |||
200 | |||
@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    # Alternate constructor: the class argument is ignored and every other
    # argument is forwarded to `read_sdf`.
    loaded = read_sdf(*args, **kwargs)
    return loaded


# Attach the reader to the pandas data frame class as `from_sdf`.
pd.DataFrame.from_sdf = _from_sdf_df
208 | |||
209 | |||
@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    # Alternate constructor: the class argument is ignored, the remaining
    # arguments are forwarded to `read_sdf`, and only the `structure` field
    # of what it returns is kept.
    # NOTE(review): `read_sdf` returns a squeezed result; if that can already
    # be a series (presumably when there is only the structure column), this
    # attribute lookup would fail - confirm against `squeeze`.
    loaded = read_sdf(*args, **kwargs)
    return loaded.structure


# Attach the reader to the pandas series class as `from_sdf`.
pd.Series.from_sdf = _from_sdf_series
217 |
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.