write_sdf() - Code Metrics - richlewis42/scikit-chem - Measure and Improve Code Quality continuously with Scrutinizer

write_sdf() C
last analyzed 2016-09-01 14:43 UTC

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	1
CRAP Score	46.8345

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
c	1
b	0
f	0
dl	0
loc	49
ccs	1
cts	15
cp	0.0667
rs	5.5
cc	7
crap	46.8345

#! /usr/bin/env python

#
# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

"""
# skchem.io.sdf

Defining input and output operations for sdf files.
"""

from functools import wraps
import warnings

from rdkit import Chem
# [Scrutinizer hint text captured with the listing; not part of the module]
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3
import pandas as pd
# [Scrutinizer hint text captured with the listing; not part of the module]
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3

from ..core import Mol
from ..utils import Suppressor, squeeze


def _drop_props(row):
    """Clear every property stored on the molecule in ``row.structure``.

    Args:
        row: A row-like object exposing the molecule as ``row.structure``.
            The molecule must provide ``props.keys()`` and ``ClearProp``.

    Returns:
        None. The molecule is modified in place.
    """
    mol = row.structure
    # Snapshot the property names before clearing: if `props.keys()` is a
    # live view onto the molecule's properties, clearing while iterating it
    # would skip entries or raise "changed size during iteration".
    for prop in list(mol.props.keys()):
        mol.ClearProp(prop)


def _set_props(row, cols):
    """Copy the values of ``cols`` from ``row`` onto its molecule as props.

    Both the property name and the value are converted with ``str`` before
    being handed to ``SetProp`` on ``row.structure``.
    """
    mol = row.structure
    for col in cols:
        key, value = str(col), str(row[col])
        mol.SetProp(key, value)


def _set_name(row):
    """Set the molecule's name to the string form of ``row.name``."""
    # rdkit props can only be strs, so stringify the label first
    label = str(row.name)
    row.structure.name = label


def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
             skipmols=None, skipfooter=None, read_props=True, mol_props=False,
             *args, **kwargs):

    """Read an sdf file into a `pd.DataFrame`.

    The function wraps the RDKit `ForwardSDMolSupplier` object.

    Args:
        sdf (str or file-like):
            The location of data to load as a file path, or a file-like object.
            A path is opened (and closed again) by this function; a file-like
            object is left open for the caller to manage.
        error_bad_mol (bool):
            Whether an error should be raised if a molecule fails to parse.
            Default is False.
        warn_bad_mol (bool):
            Whether a warning should be output if a molecule fails to parse.
            Only consulted when `error_bad_mol` is False.
            Default is True.
        nmols (int):
            The number of molecules to read. If `None`, read all molecules.
            Default is `None`.
        skipmols (int):
            The number of molecules to skip at start.
            Default is `None` (skip nothing).
        skipfooter (int):
            The number of molecules to skip from the end.
            Default is `None` (skip nothing).
        read_props (bool):
            Whether to read the properties into the data frame.
            Default is `True`.
        mol_props (bool):
            Whether to keep properties in the molecule dictionary after they
            are extracted to the DataFrame.
            Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit ForwardSDMolSupplier.

    Returns:
        pandas.DataFrame:
            The loaded data frame, with Mols supplied in the `structure` field,
            passed through `squeeze(data, axis=1)` before being returned.

    Raises:
        ValueError:
            If a molecule fails to parse and `error_bad_mol` is True.

    See also:
        rdkit.Chem.ForwardSDMolSupplier
        skchem.read_smiles
    """

    # `stop` is the supplier index at which to stop reading.  If we skip some
    # molecules at the start, the cutoff has to move by the same amount.  Only
    # shift when a limit was actually given: `None + int` is a TypeError.
    stop = nmols
    if skipmols and nmols is not None:
        stop = nmols + skipmols

    # Only close handles this function opened itself.
    opened_here = isinstance(sdf, str)
    if opened_here:
        sdf = open(sdf, 'rb')  # use read bytes for python 3 compatibility

    try:
        # use the suppression context manager to not pollute our stdout with
        # rdkit errors and warnings.
        # perhaps this should be captured better by Mol etc.
        with Suppressor():

            mol_supp = Chem.ForwardSDMolSupplier(sdf, *args, **kwargs)

            mols = []

            # single loop through sdf
            for i, mol in enumerate(mol_supp):

                if skipmols and i < skipmols:
                    continue

                if stop and i >= stop:
                    break

                if mol is None:
                    msg = 'Molecule {} could not be decoded.'.format(i + 1)
                    if error_bad_mol:
                        raise ValueError(msg)
                    elif warn_bad_mol:
                        warnings.warn(msg)
                    continue

                mols.append(Mol(mol))

            if skipfooter:
                mols = mols[:-skipfooter]
    finally:
        if opened_here:
            sdf.close()

    idx = pd.Index([m.name for m in mols], name='batch')
    data = pd.DataFrame(mols, columns=['structure'])

    if read_props:
        props = pd.DataFrame([dict(mol.props.items()) for mol in mols])
        data = pd.concat([data, props], axis=1)
        # now we have extracted the props, we can delete if required
        if not mol_props:
            data.apply(_drop_props, axis=1)

    # The index is assigned last so the positional concat above lines up.
    data.index = idx
    return squeeze(data, axis=1)


def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
              *args, **kwargs):

    """ Write an sdf file from a dataframe.

    Args:
        data (pandas.Series or pandas.DataFrame):
            Pandas data structure with a `structure` column containing
            compounds to serialize.
        sdf (str or file-like):
            A file path or file-like object specifying where to write the
            compound data.
        write_cols (bool):
            Whether columns should be written as props. Default `True`.
        index_as_name (bool):
            Whether to use index as the header, or the molecule's name.
            Default is `True`.
        mol_props (bool):
            Whether to write properties in the Mol dictionary in addition to
            fields in the frame. Default is `False`.
        args, kwargs:
            Arguments will be passed to RDKit SDWriter.

    Warn:
        This function will delete all properties in the molecule dictionary
        if `mol_props` is `False`, and will set the frame's columns as
        properties on the molecules if `write_cols` is `True`.  Molecule
        names are temporarily replaced by the index when `index_as_name` is
        `True`; the original names are restored before returning.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name='structure')

    # Remember the current names so they can be put back afterwards.
    names = [m.name for m in data.structure]

    cols = list(data.columns.drop('structure'))

    writer = Chem.SDWriter(sdf, *args, **kwargs)

    try:
        if not mol_props:
            data.apply(_drop_props, axis=1)

        if write_cols:
            data.apply(_set_props, cols=cols, axis=1)

        if index_as_name:
            data.apply(_set_name, axis=1)

        data.structure.apply(writer.write)
    finally:
        try:
            # rdkit writer changes names sometimes; restore them even when
            # writing failed part-way through.
            for mol, name in zip(data.structure, names):
                mol.name = name
        finally:
            # Close the writer explicitly so buffered records are flushed to
            # the target rather than relying on garbage collection.
            writer.close()


@wraps(write_sdf)
def _to_sdf_series(self, *args, **kwargs):
    # Series flavour of `write_sdf`: forwards everything unchanged, except
    # that `write_cols` is always passed as False.
    return write_sdf(self, *args, write_cols=False, **kwargs)


@wraps(write_sdf)
def _to_sdf_df(self, *args, **kwargs):
    # DataFrame flavour of `write_sdf`: the frame itself is the `data`
    # argument and every other argument is forwarded untouched.
    outcome = write_sdf(self, *args, **kwargs)
    return outcome

# Attach the writers to the pandas classes so that `to_sdf` is available as
# a method on Series and DataFrame (module-level side effect on import).
pd.Series.to_sdf = _to_sdf_series
pd.DataFrame.to_sdf = _to_sdf_df


@classmethod
@wraps(read_sdf)
def _from_sdf_df(_, *args, **kwargs):
    # Alternate-constructor style entry point: the class argument is ignored
    # and all remaining arguments go straight to `read_sdf`.
    loaded = read_sdf(*args, **kwargs)
    return loaded

# Expose the reader as `pd.DataFrame.from_sdf` (module-level side effect).
pd.DataFrame.from_sdf = _from_sdf_df


@classmethod
@wraps(read_sdf)
def _from_sdf_series(_, *args, **kwargs):
    # The class argument is ignored; the data is loaded with `read_sdf` and
    # only its `structure` attribute is handed back.
    # NOTE(review): this relies on the `read_sdf` result exposing
    # `.structure` -- confirm this holds when `squeeze` collapses the frame.
    loaded = read_sdf(*args, **kwargs)
    return loaded.structure

# Expose the structure-only reader as `pd.Series.from_sdf` (module-level
# side effect).
pd.Series.from_sdf = _from_sdf_series


write_sdf() C
last analyzed 2016-09-01 14:43 UTC

Complexity

Size

Duplication

Code Coverage

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1		#! /usr/bin/env python
		0 ignored issues – show Bug introduced 2016-07-01 13:01 UTC by Report Bug Copy Issue Report There seems to be a cyclic import (skchem -> skchem.data -> skchem.data.datasets -> skchem.data.datasets.muller_ames -> skchem.data.converters.muller_ames). Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug. Loading history... Bug introduced 2016-08-25 16:29 UTC by Report Bug Copy Issue Report There seems to be a cyclic import (skchem.core.conformer -> skchem.core.mol). Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug. Loading history... Bug introduced 2016-08-25 16:29 UTC by Report Bug Copy Issue Report There seems to be a cyclic import (skchem.core.atom -> skchem.core.bond -> skchem.core.mol). Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug. Loading history... Bug introduced 2016-08-25 16:29 UTC by Report Bug Copy Issue Report There seems to be a cyclic import (skchem.core.atom -> skchem.core.bond). Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug. Loading history... Bug introduced 2016-08-25 16:29 UTC by Report Bug Copy Issue Report There seems to be a cyclic import (skchem.core.bond -> skchem.core.mol). Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug. Loading history...
2		#
3		# Copyright (C) 2015-2016 Rich Lewis <[email protected]>
4		# License: 3-clause BSD
5
6	1	"""
7		# skchem.io.sdf
8
9		Defining input and output operations for sdf files.
10		"""
11
12	1	from functools import wraps
13	1	import warnings
14
15	1	from rdkit import Chem
		0 ignored issues – show Configuration introduced 2016-01-19 16:21 UTC by Report Bug Copy Issue Report The import `rdkit` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
16	1	import pandas as pd
		0 ignored issues – show Configuration introduced 2016-01-19 16:21 UTC by Report Bug Copy Issue Report The import `pandas` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
17
18	1	from ..core import Mol
19	1	from ..utils import Suppressor, squeeze
20
21
22	1	def _drop_props(row):
23	1	for prop in row.structure.props.keys():
24	1	row.structure.ClearProp(prop)
25
26
27	1	def _set_props(row, cols):
28		for i in cols:
29		row.structure.SetProp(str(i), str(row[i]))
30
31
32	1	def _set_name(row):
33		row.structure.name = str(row.name) # rdkit props can only be strs
34
35
36	1	def read_sdf(sdf, error_bad_mol=False, warn_bad_mol=True, nmols=None,
		0 ignored issues – show best-practice introduced 2016-04-14 16:48 UTC by Report Bug Copy Issue Report Too many arguments (8/5) Loading history... Comprehensibility introduced 2016-04-14 16:48 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (18/15). Loading history...
37		skipmols=None, skipfooter=None, read_props=True, mol_props=False,
38		args, *kwargs):
39
40		"""Read an sdf file into a `pd.DataFrame`.
41
42		The function wraps the RDKit `ForwardSDMolSupplier` object.
43
44		Args:
45		sdf (str or file-like):
46		The location of data to load as a file path, or a file-like object.
47		error_bad_mol (bool):
48		Whether an error should be raised if a molecule fails to parse.
49		Default is False.
50		warn_bad_mol (bool):
51		Whether a warning should be output if a molecule fails to parse.
52		Default is True.
53		nmols (int):
54		The number of molecules to read. If `None`, read all molecules.
55		Default is `None`.
56		skipmols (int):
57		The number of molecules to skip at start.
58		Default is `0`.
59		skipfooter (int):
60		The number of molecules to skip from the end.
61		Default is `0`.
62		read_props (bool):
63		Whether to read the properties into the data frame.
64		Default is `True`.
65		mol_props (bool):
66		Whether to keep properties in the molecule dictionary after they
67		are extracted to the DataFrame.
68		Default is `False`.
69		args, kwargs:
70		Arguments will be passed to RDKit ForwardSDMolSupplier.
71
72		Returns:
73		pandas.DataFrame:
74		The loaded data frame, with Mols supplied in the `structure` field.
75
76		See also:
77		rdkit.Chem.SDForwardMolSupplier
78		skchem.read_smiles
79		"""
80
81		# nmols is actually the index to cutoff. If we skip some at start, we need
82		# to add this number
83	1	if skipmols:
84		nmols += skipmols
85
86	1	if isinstance(sdf, str):
87	1	sdf = open(sdf, 'rb') # use read bytes for python 3 compatibility
88
89		# use the suppression context manager to not pollute our stdout with rdkit
90		# errors and warnings.
91		# perhaps this should be captured better by Mol etc.
92	1	with Suppressor():
93
94	1	mol_supp = Chem.ForwardSDMolSupplier(sdf, args, *kwargs)
95
96	1	mols = []
97
98		# single loop through sdf
99	1	for i, mol in enumerate(mol_supp):
100
101	1	if skipmols and i < skipmols:
102		continue
103
104	1	if nmols and i >= nmols:
105		break
106
107	1	if mol is None:
108	1	msg = 'Molecule {} could not be decoded.'.format(i + 1)
109	1	if error_bad_mol:
110	1	raise ValueError(msg)
111		elif warn_bad_mol:
112		warnings.warn(msg)
113		continue
114
115	1	mols.append(Mol(mol))
116
117	1	if skipfooter:
118		mols = mols[:-skipfooter]
119
120	1	idx = pd.Index((m.name for m in mols), name='batch')
121	1	data = pd.DataFrame(mols, columns=['structure'])
122
123	1	if read_props:
124	1	props = pd.DataFrame([{k: v for (k, v) in mol.props.items()}
125		for mol in mols])
126	1	data = pd.concat([data, props], axis=1)
127		# now we have extracted the props, we can delete if required
128	1	if not mol_props:
129	1	data.apply(_drop_props, axis=1)
130
131	1	data.index = idx
132	1	return squeeze(data, axis=1)
133
134
135	1	def write_sdf(data, sdf, write_cols=True, index_as_name=True, mol_props=False,
136		args, *kwargs):
137
138		""" Write an sdf file from a dataframe.
139
140		Args:
141		data (pandas.Series or pandas.DataFrame):
142		Pandas data structure with a `structure` column containing
143		compounds to serialize.
144		sdf (str or file-like):
145		A file path or file-like object specifying where to write the
146		compound data.
147		write_cols (bool):
148		Whether columns should be written as props. Default `True`.
149		index_as_name (bool):
150		Whether to use index as the header, or the molecule's name.
151		Default is `True`.
152		mol_props (bool):
153		Whether to write properties in the Mol dictionary in addition to
154		fields in the frame.
155
156		Warn:
157		This function will change the names of the compounds if the
158		`index_as_name` argument is `True`, and will delete all properties in
159		the molecule dictionary if `mol_props` is `False`.
160		"""
161		if isinstance(data, pd.Series):
162		data = data.to_frame(name='structure')
163
164		names = [m.name for m in data.structure]
165
166		writer = Chem.SDWriter(sdf, args, *kwargs)
167
168		cols = list(data.columns.drop('structure'))
169
170		if not mol_props:
171		data.apply(_drop_props, axis=1)
172
173		if write_cols:
174		data.apply(_set_props, cols=cols, axis=1)
175
176		if index_as_name:
177		data.apply(_set_name, axis=1)
178
179		data.structure.apply(writer.write)
180
181		# rdkit writer changes names sometimes
182		for mol, name in zip(data.structure, names):
183		mol.name = name
184
185
186	1	@wraps(write_sdf)
187		def _to_sdf_series(self, args, *kwargs):
188
189		return write_sdf(self, write_cols=False, args, *kwargs)
190
191
192	1	@wraps(write_sdf)
193		def _to_sdf_df(self, args, *kwargs):
194
195		return write_sdf(self, args, *kwargs)
196
197	1	pd.Series.to_sdf = _to_sdf_series
198	1	pd.DataFrame.to_sdf = _to_sdf_df
199
200
201	1	@classmethod
202	1	@wraps(read_sdf)
203		def _from_sdf_df(_, args, *kwargs):
204
205		return read_sdf(args, *kwargs)
206
207	1	pd.DataFrame.from_sdf = _from_sdf_df
208
209
210	1	@classmethod
211	1	@wraps(read_sdf)
212		def _from_sdf_series(_, args, *kwargs):
213
214		return read_sdf(args, *kwargs).structure
215
216		pd.Series.from_sdf = _from_sdf_series
217

richlewis42 / scikit-chem

write_sdf() C last analyzed 2016-09-01 14:43 UTC

Complexity

Size

Duplication

Code Coverage

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

write_sdf() C
last analyzed 2016-09-01 14:43 UTC

2. Missing __init__.py files

2. Missing __init__.py files