Completed
Push — master ( a47c44...8cd148 )
by Rich
01:15
created

_to_sdf_df()   A

Complexity

Conditions 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 4
rs 10
1
#! /usr/bin/env python
2
#
3
# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
4
# License: 3-clause BSD
5
6
"""
7
# skchem.io.sdf
8
9
Defining input and output operations for sdf files.
10
"""
11
12
from functools import wraps
13
import warnings
14
15
from rdkit import Chem
0 ignored issues
show
Configuration introduced by
The import rdkit could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
16
import pandas as pd
0 ignored issues
show
Configuration introduced by
The import pandas could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
17
18
from ..core import Mol
19
from ..utils import Suppressor
20
21
def _drop_props(row):
22
    for prop in row.structure.props.keys():
23
        row.structure.ClearProp(prop)
24
25
def _set_props(row, cols):
26
    for i in cols:
27
        row.structure.SetProp(str(i), str(row[i])) # rdkit props can only be str
28
29
def _set_name(row):
30
    row.structure.name = str(row.name) # rdkit props can only be strs
31
32
def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a pandas dataframe.

    The function wraps the RDKit ForwardSDMolSupplier object.

    Args:
        sdf (str or file-like):
            The location of data to load, as a file path, or a file-like object.
            A path is opened (in binary mode) and closed by this function; a
            file-like object is left open for the caller to manage.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False. Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start. Default is `None`
            (skip nothing).
        skipfooter (int):
            The number of molecules to skip from the end. Default is `None`
            (skip nothing).
        read_props (bool):
            Whether to extract the properties of the molecules into columns
            of the dataframe. Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they are
            extracted to the dataframe. Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's ForwardSDMolSupplier.

    Returns:
        pd.DataFrame: A dataframe indexed by molecule name, with the
            molecules in a `structure` column and, if `read_props` is set,
            one further column per property.

    Raises:
        ValueError: If a molecule fails to parse and `error_bad_mol` is True.

    """

    # `stop` is the record index at which to stop reading.  Records skipped
    # at the start still advance the index, so they are added on.  `nmols`
    # may legitimately be None (read everything) while `skipmols` is set.
    stop = nmols
    if stop is not None and skipmols:
        stop += skipmols

    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility

    mol_supp = None
    mols = []

    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                # explicit None check so that nmols=0 reads nothing
                if stop is not None and i >= stop:
                    break

                # rdkit returns None if it fails to parse a molecule.  Raise,
                # warn, or silently skip it depending on the flags.
                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))
    finally:
        # Drop the supplier before closing the file it reads from, in case
        # it still touches the stream while being torn down.
        del mol_supp
        if opened_here:
            sdf.close()

    if skipfooter:
        mols = mols[:-skipfooter]

    idx = pd.Index([m.name for m in mols], name='name')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([mol.props for mol in mols])
        data = pd.concat([data, props], axis=1)

    # now we have extracted the props, we can delete if required
    if not mol_props:
        data.apply(_drop_props, axis=1)

    data.index = idx
    return data
122
123
def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing compounds
            to serialize.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame. Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's SDWriter.

    Warn:
        This function will change the names of the compounds if the
        `index_as_name` argument is `True`, and will delete all properties in
        the molecule dictionary if `mol_props` is `False`.
    """

    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # Resolve the property columns before touching the output, so that a frame
    # without a `structure` column fails before the writer is created.
    cols = list(data.columns.drop('structure'))

    writer = Chem.SDWriter(sdf, *args, **kwargs)

    try:
        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        data.structure.apply(writer.write)
    finally:
        # Always close the writer so buffered records are flushed to the
        # output, even if writing a molecule fails part way through.
        writer.close()
167
168
169
@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Series flavour of `write_sdf`: a Series has no extra columns, so column
    # writing is always switched off.  (`wraps` replaces this function's
    # metadata with that of `write_sdf`, hence a comment rather than a
    # docstring.)
    result = write_sdf(self, *args, write_cols=False, **kwargs)
    return result
173
174
@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # DataFrame flavour of `write_sdf`: forwards the frame and every argument
    # unchanged.  (`wraps` replaces this function's metadata with that of
    # `write_sdf`, hence a comment rather than a docstring.)
    frame = self
    return write_sdf(frame, *args, **kwargs)
178
179
# Monkey-patch pandas at import time: expose the writer as `Series.to_sdf`.
pd.Series.to_sdf = _to_sdf_series
180
# Monkey-patch pandas at import time: expose the writer as `DataFrame.to_sdf`.
pd.DataFrame.to_sdf = _to_sdf_df
181
182
183
@classmethod
def _from_sdf(_, *args, **kwargs):

    """ Build a DataFrame from an sdf file.

    Alternate-constructor wrapper: the class argument is ignored and every
    other argument is forwarded unchanged to `read_sdf`.
    """

    frame = read_sdf(*args, **kwargs)
    return frame
189
190
# Monkey-patch pandas at import time: expose the reader as the classmethod
# `DataFrame.from_sdf`.
pd.DataFrame.from_sdf = _from_sdf
191