Completed
Push — master ( 1baf98...c0c140 )
by Rich
01:33
created

_from_sdf()   A

Complexity

Conditions 1

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 1
c 2
b 0
f 0
dl 0
loc 5
rs 9.4285
1
#! /usr/bin/env python
2
#
3
# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
4
# License: 3-clause BSD
5
6
"""
7
# skchem.io.sdf
8
9
Defining input and output operations for sdf files.
10
"""
11
12
from functools import wraps
13
import warnings
14
15
from rdkit import Chem
0 ignored issues
show
Configuration introduced by
The import rdkit could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
16
import pandas as pd
0 ignored issues
show
Configuration introduced by
The import pandas could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
17
18
from ..core import Mol
19
from ..utils import Suppressor
20
21
def _drop_props(row):
    """Remove every property from the molecule held in `row.structure`.

    Args:
        row: A row-like object whose `structure` attribute exposes a `props`
            mapping and a `ClearProp` method.
    """
    # Iterate over a snapshot of the keys: clearing a property while walking
    # a live view of the same mapping can fail or skip entries.
    for prop in list(row.structure.props.keys()):
        row.structure.ClearProp(prop)
24
25
def _set_props(row, cols):
    """Copy the values of `cols` from `row` onto the molecule as properties.

    Args:
        row: A row-like object supporting `row[col]` lookup, with the
            molecule available as `row.structure`.
        cols: The labels of the entries to store on the molecule.
    """
    # Both the key and the value are stringified before being stored, as
    # rdkit props can only be str.
    stringified = ((str(col), str(row[col])) for col in cols)
    for key, value in stringified:
        row.structure.SetProp(key, value)
28
29
def _set_name(row):
    """Rename the molecule in `row.structure` to the string form of `row.name`.

    Args:
        row: A row-like object with a `name` attribute and a `structure`
            attribute holding the molecule to rename.
    """
    # The name is stringified first, as rdkit props can only be strs.
    label = str(row.name)
    row.structure.name = label
31
32
def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a pandas dataframe.

    The function wraps the RDKit ForwardSDMolSupplier object.

    Args:
        sdf (str or file-like):
            The location of data to load, as a file path, or a file-like object.
            A path is opened in binary mode and closed again once reading has
            finished; a file-like object is left open for the caller.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False.
            Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Records that fail to parse still count towards this limit.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start.
            Default is `None`, which skips nothing.
        skipfooter (int):
            The number of successfully parsed molecules to drop from the end.
            Default is `None`, which drops nothing.
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they are
            extracted to the dataframe.
            Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure` field
            and indexed by molecule name.

    Raises:
        ValueError:
            If a molecule fails to parse and `error_bad_mol` is True.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """

    # nmols is actually the index to cut off at.  If we skip some at the
    # start, the cutoff has to be shifted by the same amount.  Only shift when
    # a limit was actually requested: `None + int` raises a TypeError.
    if skipmols and nmols is not None:
        nmols += skipmols

    # Remember whether the handle is ours, so that we only close files that
    # this function opened itself.
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility

    mol_supp = None
    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            mols = []

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                if nmols and i >= nmols:
                    break

                # rdkit returns None if it fails to parse a molecule.  Raise,
                # warn or silently skip it depending on the flags.
                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))

            if skipfooter:
                mols = mols[:-skipfooter]
    finally:
        # Drop the supplier before closing the stream it reads from, then
        # release the file handle if it was opened here.
        mol_supp = None
        if opened_here:
            sdf.close()

    idx = pd.Index([m.name for m in mols], name='name')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)
        # now we have extracted the props, we can delete if required
        if not mol_props:
            data.apply(_drop_props, axis=1)

    data.index = idx
    return data
131
132
def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing compounds
            to serialize. A Series is treated as the `structure` column itself.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame. Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's SDWriter.

    Warn:
        This function will change the names of the compounds if the
        `index_as_name` argument is `True`, and will delete all properties in
        the molecule dictionary if `mol_props` is `False`.
    """

    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    writer = Chem.SDWriter(sdf, *args, **kwargs)

    try:
        # every column apart from the molecules themselves is a candidate prop
        cols = list(data.columns.drop('structure'))

        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        data.structure.apply(writer.write)
    finally:
        # Always close the writer so that buffered records are flushed to the
        # destination, even if preparing or writing a molecule fails.
        writer.close()
176
177
178
# Series flavour of `to_sdf`: forwards to `write_sdf` with `write_cols`
# forced to False.  `wraps` copies the metadata (including the docstring) of
# `write_sdf` onto this function.
@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    return write_sdf(self, *args, write_cols=False, **kwargs)
182
183
# DataFrame flavour of `to_sdf`: forwards every argument to `write_sdf`
# unchanged.  `wraps` copies the metadata (including the docstring) of
# `write_sdf` onto this function.
@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    written = write_sdf(self, *args, **kwargs)
    return written
187
188
# Attach the writers to the pandas classes so that frames and series gain a
# `to_sdf` method as a side effect of importing this module.
pd.DataFrame.to_sdf = _to_sdf_df
pd.Series.to_sdf = _to_sdf_series
190
191
192
# Alternate constructor for DataFrame: the class argument is ignored and all
# other arguments are forwarded to `read_sdf`.  `wraps` copies the metadata
# (including the docstring) of `read_sdf` onto this function.
@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    frame = read_sdf(*args, **kwargs)
    return frame


# Expose the constructor as `pd.DataFrame.from_sdf`.
pd.DataFrame.from_sdf = _from_sdf_df
199
200
# Alternate constructor for Series: the class argument is ignored, the file
# is read with `read_sdf` and only the `structure` column is returned.
# `wraps` copies the metadata (including the docstring) of `read_sdf` onto
# this function.
@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    frame = read_sdf(*args, **kwargs)
    return frame.structure


# Expose the constructor as `pd.Series.from_sdf`.
pd.Series.from_sdf = _from_sdf_series
207