_to_sdf_df() - Code Metrics - Inspection of "Delete .DS_Store" - richlewis42/scikit-chem - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( a47c44...8cd148 )

by Rich

created 2016-04-15 17:15 UTC

_to_sdf_df() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	1
dl	0
loc	4
rs	10

#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

"""
# skchem.io.sdf

Defining input and output operations for sdf files.
"""

from functools import wraps
import warnings

from rdkit import Chem
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3
import pandas as pd
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3

from ..core import Mol
from ..utils import Suppressor

def _drop_props(row):
    """Remove every property stored on the molecule in ``row.structure``.

    Written to be used as the function of ``DataFrame.apply(..., axis=1)``.

    Args:
        row:
            A row-like object whose ``structure`` attribute exposes a
            ``props`` mapping and a ``ClearProp(name)`` method.
    """
    structure = row.structure
    # Snapshot the names before clearing: ClearProp changes the collection
    # being walked, and deleting from a live view while iterating is unsafe.
    for prop in list(structure.props.keys()):
        structure.ClearProp(prop)

def _set_props(row, cols):
    """Store the values of the columns `cols` as props of ``row.structure``.

    Args:
        row:
            A row-like object, indexable by column label, whose ``structure``
            attribute has a ``SetProp(key, value)`` method.
        cols (iterable):
            The column labels to copy onto the molecule.
    """
    for col in cols:
        # rdkit props can only be str, so stringify both label and value
        key, value = str(col), str(row[col])
        row.structure.SetProp(key, value)

def _set_name(row):
    """Name the molecule in ``row.structure`` after the row's own ``name``."""
    label = str(row.name)  # rdkit props can only be strs
    row.structure.name = label

def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a pandas dataframe.

    The function wraps the RDKit ForwardSDMolSupplier object.

    Args:
        sdf (str or file-like):
            The location of data to load, as a file path, or a file-like object.
            A path is opened (and closed again) by this function; a file-like
            object is left open for the caller to manage.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False. Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start. Default is `None`
            (skip nothing).
        skipfooter (int):
            The number of molecules to skip from the end. Default is `None`
            (skip nothing).
        read_props (bool):
            Whether the properties of each molecule should be extracted into
            columns of the dataframe. Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they are
            extracted to the dataframe. Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's ForwardSDMolSupplier.

    Returns:
        pd.DataFrame: A dataframe of type :pandas.core.frame.DataFrame:, with
        a `structure` column and an index called `name`.

    Raises:
        ValueError: If a molecule fails to parse and `error_bad_mol` is True.

    """

    # nmols is actually the index to cutoff.  If we skip some at start, we need
    # to add this number.  Only do so when a limit was given: `None` means
    # "no limit" and cannot be added to.
    if skipmols and nmols is not None:
        nmols += skipmols

    # remember whether the handle is ours, so we only close what we opened
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility

    mol_supp = None
    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            mols = []

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                if nmols and i >= nmols:
                    break

                # rdkit returns None if it fails to parse a molecule.  Raise
                # or warn as requested, and leave the molecule out either way.
                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))

            if skipfooter:
                mols = mols[:-skipfooter]
    finally:
        # let go of the supplier before closing the handle it reads from
        mol_supp = None
        if opened_here:
            sdf.close()

    idx = pd.Index([m.name for m in mols], name='name')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([mol.props for mol in mols])
        data = pd.concat([data, props], axis=1)

    # now we have extracted the props, we can delete if required
    if not mol_props:
        data.apply(_drop_props, axis=1)

    data.index = idx
    return data

def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing compounds
            to serialize. A series is treated as a frame with a single
            `structure` column.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame. Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's SDWriter.

    Warn:
        This function will change the names of the compounds if the
        `index_as_name` argument is `True`, and will delete all properties in
        the molecule dictionary if `mol_props` is `False`.
    """

    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    writer = Chem.SDWriter(sdf, *args, **kwargs)

    try:
        # every column except the structures themselves is a candidate prop
        cols = list(data.columns.drop('structure'))

        # existing props are dropped first, so that the column values set
        # below are not cleared again
        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        for mol in data.structure:
            writer.write(mol)
    finally:
        # close explicitly so the output is flushed even if writing fails
        # part way, rather than whenever the writer is garbage collected
        writer.close()


@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # write_sdf turns a series into a frame holding only a `structure`
    # column, so writing columns as props is always switched off here.
    return write_sdf(self, *args, write_cols=False, **kwargs)

@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # Method form of write_sdf: the frame itself is the data to write and
    # every other argument is handed on untouched.
    outcome = write_sdf(self, *args, **kwargs)
    return outcome

# Attach the writers to the pandas classes, so that `to_sdf` can be called as
# a method on any Series or DataFrame once this module has been imported.
pd.Series.to_sdf = _to_sdf_series
pd.DataFrame.to_sdf = _to_sdf_df


@classmethod
def _from_sdf(_, *args, **kwargs):

    """ Create a DataFrame from an sdf file.

    All arguments are handed on unchanged to `read_sdf`; the class the method
    is called on is ignored.
    """

    frame = read_sdf(*args, **kwargs)
    return frame

# Attach the reader to pandas as the `DataFrame.from_sdf` classmethod.
pd.DataFrame.from_sdf = _from_sdf


1			#! /usr/bin/env python
2			#
3			# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
4			# License: 3-clause BSD
5
6			"""
7			# skchem.io.sdf
8
9			Defining input and output operations for sdf files.
10			"""
11
12			from functools import wraps
13			import warnings
14
15			from rdkit import Chem
			0 ignored issues – show Configuration introduced 2016-01-19 16:21 UTC by Report Bug Copy Issue Report The import `rdkit` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
16			import pandas as pd
			0 ignored issues – show Configuration introduced 2016-01-19 16:21 UTC by Report Bug Copy Issue Report The import `pandas` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
17
18			from ..core import Mol
19			from ..utils import Suppressor
20
21			def _drop_props(row):
22			for prop in row.structure.props.keys():
23			row.structure.ClearProp(prop)
24
25			def _set_props(row, cols):
26			for i in cols:
27			row.structure.SetProp(str(i), str(row[i])) # rdkit props can only be str
28
29			def _set_name(row):
30			row.structure.name = str(row.name) # rdkit props can only be strs
31
32			def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
			0 ignored issues – show best-practice introduced 2016-04-14 16:48 UTC by Report Bug Copy Issue Report Too many arguments (8/5) Loading history... Comprehensibility introduced 2016-04-14 16:48 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (18/15). Loading history...
33			skipmols=None, skipfooter=None, read_props=True, mol_props=False,
34			*args, **kwargs):
35
36			"""Read an sdf file into a pandas dataframe.
37
38			The function wraps the RDKit ForwardSDMolSupplier object.
39
40			Args:
41			sdf (str or file-like):
42			The location of data to load, as a file path, or a file-like object.
43			error_bad_mol (bool):
44			Whether an error should be raised if a molecule fails to parse.
45			Default is False.
46			warn_bad_mol (bool):
47			Whether a warning should be output if a molecule fails to parse.
48			Default is True.
49			nmols (int):
50			The number of molecules to read. If `None`, read all molecules.
51			Default is `None`.
52			skipmols (int):
53			The number of molecules to skip at start. Default is `0`.
54			skipfooter (int):
55			The number of molecules to skip from the end. Default is `0`.
56			mol_props (bool):
57			Whether to keep properties in the molecule dictionary after they are
58			extracted to the dataframe. Default is `False`.
59			*args, **kwargs:
60			Arguments will be passed to rdkit's ForwardSDMolSupplier.
61
62			Returns:
63			pd.DataFrame: A dataframe of type :pandas.core.frame.DataFrame:.
64
65			"""
66
67			# nmols is actually the index to cutoff. If we skip some at start, we need
68			# to add this number
69			if skipmols:
70			nmols += skipmols
71
72			if isinstance(sdf, str):
73			sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility
74
75			# use the suppression context manager to not pollute our stdout with rdkit
76			# errors and warnings.
77			# perhaps this should be captured better by Mol etc.
78			with Suppressor():
79
80			mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)
81
82			mols = []
83
84			# single loop through sdf
85			for i, mol in enumerate(mol_supp):
86
87			if skipmols and i < skipmols:
88			continue
89
90			if nmols and i >= nmols:
91			break
92
93			# rdkit returns None if it fails to parse a molecule. We will raise
94			# errors unless force is used.
95			if mol is None:
96			msg = 'Molecule {} could not be decoded.'.format(i + 1)
97			if error_bad_mol:
98			raise ValueError(msg)
99			elif warn_bad_mol:
100			warnings.warn(msg)
101			continue
102
103			mols.append(Mol(mol))
104
105
106			if skipfooter:
107			mols = mols[:-skipfooter]
108
109			idx = pd.Index((m.name for m in mols), name='name')
110			data = pd.DataFrame(mols, columns=['structure'])
111
112			if read_props:
113			props = pd.DataFrame([mol.props for mol in mols])
114			data = pd.concat([data, props], axis=1)
115
116			# now we have extracted the props, we can delete if required
117			if not mol_props:
118			data.apply(_drop_props, axis=1)
119
120			data.index = idx
121			return data
122
123			def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
124			*args, **kwargs):
125
126			""" Write an sdf file from a dataframe.
127
128			Args:
129			data (pandas.Series or pandas.DataFrame):
130			Pandas data structure with a `structure` column containing compounds
131			to serialize.
132			sdf (str or file-like):
133			A file path or file-like object specifying where to write the
134			compound data.
135			write_cols (bool):
136			Whether columns should be written as props. Default `True`.
137			index_as_name (bool):
138			Whether to use index as the header, or the molecule's name.
139			Default is `True`.
140			mol_props (bool):
141			Whether to write properties in the Mol dictionary in addition to
142			fields in the frame.
143
144			Warn:
145			This function will change the names of the compounds if the
146			`index_as_name` argument is `True`, and will delete all properties in
147			the molecule dictionary if `mol_props` is `False`.
148			"""
149
150			if isinstance(data, pd.Series):
151			data = data.to_frame(name='structure')
152
153			writer = Chem.SDWriter(sdf, *args, **kwargs)
154
155			cols = list(data.columns.drop('structure'))
156
157			if not mol_props:
158			data.apply(_drop_props, axis=1)
159
160			if write_cols:
161			data.apply(_set_props, cols=cols, axis=1)
162
163			if index_as_name:
164			data.apply(_set_name, axis=1)
165
166			data.structure.apply(writer.write)
167
168
169			@wraps(write_sdf)
170			def _to_sdf_series(self, *args, **kwargs):
171
172			return write_sdf(self, write_cols=False, *args, **kwargs)
173
174			@wraps(write_sdf)
175			def _to_sdf_df(self, *args, **kwargs):
176
177			return write_sdf(self, *args, **kwargs)
178
179			pd.Series.to_sdf = _to_sdf_series
180			pd.DataFrame.to_sdf = _to_sdf_df
181
182
183			@classmethod
184			def _from_sdf(_, *args, **kwargs):
185
186			""" Create a DataFrame from an sdf file """
187
188			return read_sdf(*args, **kwargs)
189
190			pd.DataFrame.from_sdf = _from_sdf
191

Push — master ( a47c44...8cd148 )

_to_sdf_df() A

Complexity

Size

Duplication

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

richlewis42 / scikit-chem

Push — master ( a47c44...8cd148 )

_to_sdf_df() A

Complexity

Size

Duplication

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing __init__.py files

2. Missing __init__.py files