_from_sdf() - Code Metrics - Inspection of "added to_smiles functions" - richlewis42/scikit-chem - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 1baf98...c0c140 )

by Rich

created 2016-06-07 13:25 UTC

_from_sdf() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	2
Bugs	0	Features	0

Metric	Value
cc	1
c	2
b	0
f	0
dl	0
loc	5
rs	9.4285

#! /usr/bin/env python
#
# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

"""
# skchem.io.sdf

Defining input and output operations for sdf files.
"""

from functools import wraps
import warnings

from rdkit import Chem
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3
import pandas as pd
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3

from ..core import Mol
from ..utils import Suppressor

def _drop_props(row):
    for prop in row.structure.props.keys():
        row.structure.ClearProp(prop)

def _set_props(row, cols):
    for i in cols:
        row.structure.SetProp(str(i), str(row[i])) # rdkit props can only be str

def _set_name(row):
    row.structure.name = str(row.name) # rdkit props can only be strs

def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a pandas dataframe.

    The function wraps the RDKit ForwardSDMolSupplier object.

    Args:
        sdf (str or file-like):
            The location of data to load, as a file path, or a file-like object.
            A path is opened (in binary mode) and closed by this function; a
            file-like object is left open for the caller to manage.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False.
            Default is True.
        nmols (int):
            The number of entries to read after any skipped ones.  Entries
            that fail to parse still count towards this number.  If `None`,
            read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start.
            Default is `None` (skip nothing).
        skipfooter (int):
            The number of molecules to drop from the end of those that were
            successfully read.
            Default is `None` (drop nothing).
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they are
            extracted to the dataframe.
            Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure` field
            and indexed by molecule name.

    Raises:
        ValueError:
            If a molecule fails to parse and `error_bad_mol` is True.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """

    # nmols is actually the index to cutoff.  If we skip some at start, we need
    # to add this number.  With no limit requested (None) there is nothing to
    # shift, and adding to None would raise a TypeError.
    if skipmols and nmols is not None:
        nmols += skipmols

    # Remember whether the handle is ours so that only files opened here are
    # closed here.
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility

    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        # NOTE(review): warnings.warn below is issued inside this context;
        # confirm Suppressor does not swallow it as well.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            mols = []

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                if nmols and i >= nmols:
                    break

                # rdkit returns None if it fails to parse a molecule.  Raise,
                # warn or silently skip depending on the flags.
                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))

            if skipfooter:
                mols = mols[:-skipfooter]
    finally:
        # The supplier has been fully consumed (or abandoned) by now, so the
        # underlying file is no longer needed.
        if opened_here:
            sdf.close()

    # The index is captured before any props are dropped from the molecules.
    idx = pd.Index([m.name for m in mols], name='name')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)
        # now we have extracted the props, we can delete if required
        if not mol_props:
            data.apply(_drop_props, axis=1)

    data.index = idx
    return data

def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing compounds
            to serialize.  A Series is treated as the `structure` column.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame.  Default is `False`.
        *args, **kwargs:
            Arguments will be passed to rdkit's SDWriter.

    Warn:
        This function will change the names of the compounds if the
        `index_as_name` argument is `True`, and will delete all properties in
        the molecule dictionary if `mol_props` is `False`.
    """

    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # Every column other than the molecules themselves is a candidate prop.
    cols = list(data.columns.drop('structure'))

    # The helpers below mutate the molecules in place; their return values
    # are not used.
    if not mol_props:
        data.apply(_drop_props, axis=1)

    if write_cols:
        data.apply(_set_props, cols=cols, axis=1)

    if index_as_name:
        data.apply(_set_name, axis=1)

    # Create the writer only once the molecules are prepared, and always close
    # it so buffered records are flushed to the destination even on error.
    writer = Chem.SDWriter(sdf, *args, **kwargs)
    try:
        for mol in data.structure:
            writer.write(mol)
    finally:
        writer.close()


# Series flavour of ``write_sdf``.  No docstring is given here because
# ``wraps`` replaces it with the one from ``write_sdf``.
@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Writing columns as props is always switched off for a Series.
    return write_sdf(self, *args, write_cols=False, **kwargs)

# DataFrame flavour of ``write_sdf``.  No docstring is given here because
# ``wraps`` replaces it with the one from ``write_sdf``.
@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # Straight pass-through: the frame itself is the data to serialize.
    outcome = write_sdf(self, *args, **kwargs)
    return outcome

# Attach the writers to the pandas classes so that both containers gain a
# ``to_sdf`` method as a side effect of importing this module.
pd.Series.to_sdf = _to_sdf_series
pd.DataFrame.to_sdf = _to_sdf_df


# Alternate constructor exposed as ``pd.DataFrame.from_sdf``.  No docstring is
# given here because ``wraps`` replaces it with the one from ``read_sdf``.
@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    # The class argument is ignored; every other argument goes to read_sdf.
    frame = read_sdf(*args, **kwargs)
    return frame

pd.DataFrame.from_sdf = _from_sdf_df

# Alternate constructor exposed as ``pd.Series.from_sdf``.  No docstring is
# given here because ``wraps`` replaces it with the one from ``read_sdf``.
@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    # The class argument is ignored; only the molecule column is returned.
    frame = read_sdf(*args, **kwargs)
    return frame.structure

pd.Series.from_sdf = _from_sdf_series


Push — master ( 1baf98...c0c140 )

_from_sdf() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1			#! /usr/bin/env python
2			#
3			# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
4			# License: 3-clause BSD
5
6			"""
7			# skchem.io.sdf
8
9			Defining input and output operations for sdf files.
10			"""
11
12			from functools import wraps
13			import warnings
14
15			from rdkit import Chem
			0 ignored issues – show Configuration introduced 2016-01-19 16:21 UTC by Report Bug Copy Issue Report The import `rdkit` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
16			import pandas as pd
			0 ignored issues – show Configuration introduced 2016-01-19 16:21 UTC by Report Bug Copy Issue Report The import `pandas` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
17
18			from ..core import Mol
19			from ..utils import Suppressor
20
21			def _drop_props(row):
22			for prop in row.structure.props.keys():
23			row.structure.ClearProp(prop)
24
25			def _set_props(row, cols):
26			for i in cols:
27			row.structure.SetProp(str(i), str(row[i])) # rdkit props can only be str
28
29			def _set_name(row):
30			row.structure.name = str(row.name) # rdkit props can only be strs
31
32			def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
			0 ignored issues – show best-practice introduced 2016-04-14 16:48 UTC by Report Bug Copy Issue Report Too many arguments (8/5) Loading history... Comprehensibility introduced 2016-04-14 16:48 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (18/15). Loading history...
33			skipmols=None, skipfooter=None, read_props=True, mol_props=False,
34			*args, **kwargs):
35
36			"""Read an sdf file into a pandas dataframe.
37
38			The function wraps the RDKit ForwardSDMolSupplier object.
39
40			Args:
41			sdf (str or file-like):
42			The location of data to load, as a file path, or a file-like object.
43			error_bad_mol (bool):
44			Whether an error should be raised if a molecule fails to parse.
45			Default is False.
46			warn_bad_mol (bool):
47			Whether a warning should be output if a molecule fails to parse.
48			Default is True.
49			nmols (int):
50			The number of molecules to read. If `None`, read all molecules.
51			Default is `None`.
52			skipmols (int):
53			The number of molecules to skip at start.
54			Default is `0`.
55			skipfooter (int):
56			The number of molecules to skip from the end.
57			Default is `0`.
58			read_props (bool):
59			Whether to read the properties into the data frame.
60			Default is `True`.
61			mol_props (bool):
62			Whether to keep properties in the molecule dictionary after they are
63			extracted to the dataframe.
64			Default is `False`.
65			*args, **kwargs:
66			Arguments will be passed to rdkit's ForwardSDMolSupplier.
67
68			Returns:
69			pandas.DataFrame:
70			The loaded data frame, with Mols supplied in the `structure` field.
71
72			See also:
73			rdkit.Chem.SDForwardMolSupplier
74			skchem.read_smiles
75			"""
76
77			# nmols is actually the index to cutoff. If we skip some at start, we need
78			# to add this number
79			if skipmols:
80			nmols += skipmols
81
82			if isinstance(sdf, str):
83			sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility
84
85			# use the suppression context manager to not pollute our stdout with rdkit
86			# errors and warnings.
87			# perhaps this should be captured better by Mol etc.
88			with Suppressor():
89
90			mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)
91
92			mols = []
93
94			# single loop through sdf
95			for i, mol in enumerate(mol_supp):
96
97			if skipmols and i < skipmols:
98			continue
99
100			if nmols and i >= nmols:
101			break
102
103			# rdkit returns None if it fails to parse a molecule. We will raise
104			# errors unless force is used.
105			if mol is None:
106			msg = 'Molecule {} could not be decoded.'.format(i + 1)
107			if error_bad_mol:
108			raise ValueError(msg)
109			elif warn_bad_mol:
110			warnings.warn(msg)
111			continue
112
113			mols.append(Mol(mol))
114
115
116			if skipfooter:
117			mols = mols[:-skipfooter]
118
119			idx = pd.Index((m.name for m in mols), name='name')
120			data = pd.DataFrame(mols, columns=['structure'])
121
122			if read_props:
123			props = pd.DataFrame([{k: v for (k, v) in mol.props.items()} for mol in mols])
124			data = pd.concat([data, props], axis=1)
125			# now we have extracted the props, we can delete if required
126			if not mol_props:
127			data.apply(_drop_props, axis=1)
128
129			data.index = idx
130			return data
131
132			def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
133			*args, **kwargs):
134
135			""" Write an sdf file from a dataframe.
136
137			Args:
138			data (pandas.Series or pandas.DataFrame):
139			Pandas data structure with a `structure` column containing compounds
140			to serialize.
141			sdf (str or file-like):
142			A file path or file-like object specifying where to write the
143			compound data.
144			write_cols (bool):
145			Whether columns should be written as props. Default `True`.
146			index_as_name (bool):
147			Whether to use index as the header, or the molecule's name.
148			Default is `True`.
149			mol_props (bool):
150			Whether to write properties in the Mol dictionary in addition to
151			fields in the frame.
152
153			Warn:
154			This function will change the names of the compounds if the
155			`index_as_name` argument is `True`, and will delete all properties in
156			the molecule dictionary if `mol_props` is `False`.
157			"""
158
159			if isinstance(data, pd.Series):
160			data = data.to_frame(name='structure')
161
162			writer = Chem.SDWriter(sdf, *args, **kwargs)
163
164			cols = list(data.columns.drop('structure'))
165
166			if not mol_props:
167			data.apply(_drop_props, axis=1)
168
169			if write_cols:
170			data.apply(_set_props, cols=cols, axis=1)
171
172			if index_as_name:
173			data.apply(_set_name, axis=1)
174
175			data.structure.apply(writer.write)
176
177
178			@wraps(write_sdf)
179			def _to_sdf_series(self, *args, **kwargs):
180
181			return write_sdf(self, write_cols=False, *args, **kwargs)
182
183			@wraps(write_sdf)
184			def _to_sdf_df(self, *args, **kwargs):
185
186			return write_sdf(self, *args, **kwargs)
187
188			pd.Series.to_sdf = _to_sdf_series
189			pd.DataFrame.to_sdf = _to_sdf_df
190
191
192			@classmethod
193			@wraps(read_sdf)
194			def _from_sdf_df(_, *args, **kwargs):
195
196			return read_sdf(*args, **kwargs)
197
198			pd.DataFrame.from_sdf = _from_sdf_df
199
200			@classmethod
201			@wraps(read_sdf)
202			def _from_sdf_series(_, *args, **kwargs):
203
204			return read_sdf(*args, **kwargs).structure
205
206			pd.Series.from_sdf = _from_sdf_series
207

richlewis42 / scikit-chem

Push — master ( 1baf98...c0c140 )

_from_sdf() A

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing __init__.py files

2. Missing __init__.py files