Converter.process_splits() - Code Metrics - Inspection of "added first pass at datasets" - richlewis42/scikit-chem - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 4d243d...e4a84f )

by Rich

created 2016-06-12 20:22 UTC

Converter.process_splits() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
c	1
b	0
f	1
dl	0
loc	21
rs	7.8867
cc	6

#! /usr/bin/env python
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
#
# Copyright (C) 2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

import warnings
import logging
import os
import functools
from collections import namedtuple

import numpy as np
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import pandas as pd
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

import h5py
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
from fuel.datasets import H5PYDataset
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

from ... import filters
from ... import descriptors
from ... import cross_validation
from ... import standardizers

logger = logging.getLogger(__name__)



# Description of one feature to calculate for the dataset:
#   fper       -- object whose `transform` method is called on the molecules
#   key        -- name the feature is stored under in the data file
#   axis_names -- labels given to the dimensions of the stored array
Feature = namedtuple('Feature', ['fper', 'key', 'axis_names'])

# Features used by `Converter.run` when none are passed explicitly.
DEFAULT_FEATURES = (
    Feature(fper=descriptors.MorganFingerprinter(),
            key='X_morg',
            axis_names=['batch', 'features']),
    Feature(fper=descriptors.PhysicochemicalFingerprinter(),
            key='X_pc',
            axis_names=['batch', 'features']),
    Feature(fper=descriptors.AtomFeatureCalculator(),
            key='A',
            axis_names=['batch', 'atom_idx', 'features']),
    Feature(fper=descriptors.GraphDistanceCalculator(),
            key='G',
            axis_names=['batch', 'atom_idx', 'atom_idx']))

# Description of one filter:
#   filter -- callable applied to each molecule (through `Series.apply`)
#   kwargs -- keyword arguments forwarded to that callable
Filter = namedtuple('Filter', ['filter', 'kwargs'])

# Filters used by `Converter.filter` when none are passed explicitly.
DEFAULT_FILTERS = (
    Filter(filters.is_organic, {}),
    Filter(filters.n_atoms, {'above': 5, 'below': 75}),
    Filter(filters.mass, {'below': 1000})
)

# Standardizer used by `Converter.standardize` when none is passed explicitly.
DEFAULT_STANDARDIZER = standardizers.ChemAxonStandardizer(keep_failed=True)

class Converter(object):
    """ Create a fuel dataset from molecules and targets.

    The arguments described here are the ones accepted by `run`.

    Args:
        ms (pd.Series):
            The molecules of the dataset.
        y (pd.Series or pd.DataFrame):
            The target labels of the dataset.
        output_path (str):
            The path to which the dataset should be saved.
        features (list[Feature]):
            The features to calculate. Defaults are provided.
        splits (dict):
            A dictionary of different splits provided.
            The keys should be the split name, and values an array of indices.
            Alternatively, if `contiguous` is `True`, the keys should be
            the split name, and the values a tuple of start and stop.
            If `None`, use `skchem.cross_validation.SimThresholdSplit`.
        contiguous (bool):
            Whether the splits are expressed as (start, stop) ranges rather
            than as arrays of indices.
    """


    def __init__(self, directory, output_directory, output_filename='default.h5'):

        raise NotImplemented


    def run(self, ms, y, output_path,
            features=DEFAULT_FEATURES, splits=None, contiguous=False):
        """ Build the dataset file from molecules and targets.

        Creates the h5 file, then writes the splits, the molecules, the
        targets and every requested feature to it, in that order.

        Args:
            ms (pd.Series):
                The molecules of the dataset.
            y (pd.Series or pd.DataFrame):
                The target labels of the dataset.
            output_path (str):
                The path to which the dataset should be saved.
            features (list[Feature]):
                The features to calculate.
            splits (dict):
                The splits to use.  If empty or `None`, splits are generated
                with `create_splits` and the data is realigned to the index
                it returns.
            contiguous (bool):
                Whether splits are (start, stop) ranges rather than index
                arrays.
        """
        self.contiguous = contiguous
        self.output_path = output_path
        self.features = features
        # the targets are exposed under the name 'y' alongside the features
        self.feature_names = [feat.key for feat in self.features] + ['y']

        self.create_file(output_path)

        if not splits:
            splits, idx = self.create_splits(ms)
            # `idx` holds index labels, so select by label; `.ix` no longer
            # exists in current pandas.
            ms, y = ms.loc[idx], y.loc[idx]

        split_dict = self.process_splits(splits)

        self.save_splits(split_dict)
        self.save_molecules(ms)
        self.save_targets(y)
        self.save_features(ms)

    def create_file(self, path):
        """ Create (or truncate) the h5 file and keep a handle to it.

        Args:
            path (str):
                The location of the file to create.

        Returns:
            h5py.File: the open file, also stored as `self.data_file`.
        """
        # log the path actually opened; `self.output_path` is only set by `run`
        logger.info('Creating h5 file at %s...', path)
        self.data_file = h5py.File(path, 'w')
        return self.data_file

    def filter(self, data, filters=DEFAULT_FILTERS):
        """ Filter the compounds according to the usual filters.

        Args:
            data (pd.Series or pd.DataFrame):
                The molecules, or a frame holding them in a `structure`
                column.
            filters (iterable[Filter]):
                The filters to apply.  A compound is kept only if every
                filter accepts it.

        Returns:
            pd.Series or pd.DataFrame: the entries of `data` that passed
            all filters.  With no filters, `data` is returned unchanged.
        """
        logger.info('Filtering %s compounds', len(data))
        if isinstance(data, pd.DataFrame):
            ms = data.structure
        else:
            ms = data

        masks = [ms.apply(spec.filter, **spec.kwargs) for spec in filters]
        if not masks:
            # nothing to combine; `reduce` would fail on an empty sequence
            logger.info('Filtered out %s compounds', 0)
            return data

        keep = functools.reduce(lambda a, b: a & b, masks)

        logger.info('Filtered out %s compounds', (~keep).sum())

        return data[keep]


    def standardize(self, data, standardizer=DEFAULT_STANDARDIZER):
        """ Standardize the compounds.

        Args:
            data:
                The compounds to standardize; must support `len`.
            standardizer:
                Object whose `transform` method performs the standardization.

        Returns:
            Whatever `standardizer.transform(data)` returns.
        """
        logger.info('Standardizing %s compounds', len(data))
        return standardizer.transform(data)


    def save_molecules(self, mols):
        """ Write the molecules and their SMILES strings to the data file.

        The molecules are stored under the key 'structure' and their utf-8
        encoded SMILES under the key 'smiles'.  Warnings emitted while
        writing are suppressed.

        Args:
            mols (pd.Series):
                The molecules to save; each must provide `to_smiles`.
        """
        logger.info('Writing molecules to file...')
        destination = self.data_file.filename
        logger.debug('Writing %s molecules to %s', len(mols), destination)

        def encoded_smiles(mol):
            return mol.to_smiles().encode('utf-8')

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            mols.to_hdf(destination, 'structure')
            smiles = mols.apply(encoded_smiles)
            smiles.to_hdf(destination, 'smiles')


    def save_targets(self, y):
        """ Save the targets to the data file.

        The targets are written under '/targets/<name>', where the name is
        taken from the series name, else the name of the frame's columns,
        else 'targets'.  A soft link 'y' is then pointed at the stored
        values and its dimensions are labelled.

        Args:
            y (pd.Series or pd.DataFrame):
                The targets to save.
        """
        y_name = getattr(y, 'name', None)
        if not y_name:
            # an unnamed Series has no `columns`, so look it up defensively
            y_name = getattr(getattr(y, 'columns', None), 'name', None)
        if not y_name:
            y_name = 'targets'

        logger.info('Writing %s', y_name)
        logger.debug('Writing targets of shape %s to %s', y.shape, self.data_file.filename)

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            y.to_hdf(self.data_file.filename, '/targets/' + y_name)

        if isinstance(y, pd.Series):
            self.data_file['y'] = h5py.SoftLink('/targets/{}/values'.format(y_name))
            self.data_file['y'].dims[0].label = 'batch'

        elif isinstance(y, pd.DataFrame):
            self.data_file['y'] = h5py.SoftLink('/targets/{}/block0_values'.format(y_name))
            self.data_file['y'].dims[0].label = 'batch'
            # the second axis holds the tasks; previously dims[0] was
            # labelled twice and 'batch' was overwritten
            self.data_file['y'].dims[1].label = 'task'

    def save_features(self, ms):
        """ Calculate and store every feature in `self.features`.

        Args:
            ms (pd.Series):
                The molecules to calculate the features for.
        """
        logger.debug('Saving features')
        for feature in self.features:
            self._save_feature(ms, feature)

    def _save_feature(self, ms, feat):
        """ Calculate a single feature and write it to the data file.

        The result of `feat.fper.transform` is stored under
        'features/<key>', a soft link named after the key is pointed at the
        stored values, and the dimensions of that link are labelled with
        `feat.axis_names`.

        Args:
            ms (pd.Series):
                The molecules to calculate the feature for.
            feat (Feature):
                The feature to calculate.
        """
        logger.info('Calculating %s', feat.key)

        has_third_axis = len(feat.axis_names) > 2

        values = feat.fper.transform(ms)
        if has_third_axis:
            # panel serialize backwards for some reason...
            values = values.transpose(2, 1, 0)

        logger.debug('Writing features with shape %s to %s', values.shape, self.data_file.filename)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            values.to_hdf(self.data_file.filename, 'features/{}'.format(feat.key))

        link_target = '/features/{}/block0_values'.format(feat.key)
        self.data_file[feat.key] = h5py.SoftLink(link_target)

        self.data_file[feat.key].dims[0].label = feat.axis_names[0]
        self.data_file[feat.key].dims[1].label = feat.axis_names[1]
        if has_third_axis:
            self.data_file[feat.key].dims[2].label = feat.axis_names[2]

    def create_splits(self, ms, contiguous=True):
        """ Create a split dict for fuel from mols, using SimThresholdSplit.

        Args:
            ms (pd.Series):
                The molecules to use to design the splits.
            contiguous (bool):
                Whether the split should be contiguous.  This allows for more
                efficient loading times.  This usually is the appropriate if
                there are no other splits for the dataset, and will reorder
                the dataset.  NOTE: this argument is currently not consulted;
                the `self.contiguous` attribute (set by `run`) decides.
        Returns:
            (dict, idx)
                The split dict, and the index to align the data with.  For
                contiguous splits the values are `(start, stop)` tuples with
                an exclusive stop; otherwise they are arrays of positions.
        """

        logger.info('Creating Similarity Threshold splits...')
        cv = cross_validation.SimThresholdSplit(ms, memory_optimized=True)

        train, valid, test = cv.split((70, 15, 15))

        def bool_to_index(ser):
            """ Positions at which a boolean series is set. """
            return np.nonzero(ser.values)[0]

        if self.contiguous:
            # tag every molecule with its split, then sort so that each split
            # occupies one contiguous run of positions
            dset = pd.Series(0, ms.index)
            dset[train] = 0
            dset[valid] = 1
            dset[test] = 2
            dset = dset.sort_values()
            idx = dset.index

            def start_stop(positions):
                """ Half-open (start, stop) range covering the positions. """
                # stop is exclusive, so step one past the last position;
                # otherwise the final example of each split is lost
                return min(positions), max(positions) + 1

            splits = {}
            for name, code in (('train', 0), ('valid', 1), ('test', 2)):
                positions = bool_to_index(dset == code)
                logger.debug('%s positions: %s', name, positions)
                splits[name] = start_stop(positions)

        else:

            idx = ms.index

            splits = {
                'train': bool_to_index(train),
                'valid': bool_to_index(valid),
                'test': bool_to_index(test)
            }

        return splits, idx

    def process_splits(self, splits, contiguous=False):
        """ Build the split dict expected by fuel from the provided splits.

        Args:
            splits (dict):
                Maps split name to a `(start, stop)` tuple when
                `self.contiguous` is set, otherwise to an array of indices.
            contiguous (bool):
                Not consulted; `self.contiguous` selects the behaviour.

        Returns:
            dict: maps split name to a dict that maps every name in
            `self.feature_names` to a `(start, stop, reference)` tuple.
            For non-contiguous splits the indices are written to the data
            file as '<split>_indices' and referenced with start and stop
            set to -1.
        """
        logger.info('Creating split array.')

        if self.contiguous:
            logger.debug('Contiguous splits.')
            return {
                name: {feat: (start, stop, h5py.Reference())
                       for feat in self.feature_names}
                for name, (start, stop) in splits.items()
            }

        processed = {}
        for name, indices in splits.items():
            dataset_name = '{}_indices'.format(name).encode('utf-8')
            logger.debug('Saving %s to %s', dataset_name, self.data_file.filename)
            self.data_file[dataset_name] = indices
            reference = self.data_file[dataset_name].ref
            processed[name] = {feat: (-1, -1, reference)
                               for feat in self.feature_names}

        return processed

    def save_splits(self, split_dict):
        """ Store the splits as the 'split' attribute of the data file.

        Args:
            split_dict (dict):
                The split dict, as returned by `process_splits`.
        """
        logger.info('Producing dataset splits...')
        split_array = H5PYDataset.create_split_array(split_dict)
        logger.debug('split: %s', split_array)

        logger.info('Saving splits...')
        with warnings.catch_warnings():
            # silence warnings emitted while writing the attribute
            warnings.simplefilter('ignore')
            self.data_file.attrs['split'] = split_array

    @classmethod
    def convert(cls, **kwargs):
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
        kwargs.setdefault('directory', os.getcwd())
        kwargs.setdefault('output_directory', os.getcwd())

        return cls(**kwargs).output_path,

    @classmethod
    def fill_subparser(cls, subparser):
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
        return cls.convert


Push — master ( 4d243d...e4a84f )

Converter.process_splits() B

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1			#! /usr/bin/env python
			0 ignored issues – show Coding Style introduced 2016-06-12 20:23 UTC by Report Bug Copy Issue Report This module should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions. Loading history...
2			#
3			# Copyright (C) 2016 Rich Lewis <[email protected]>
4			# License: 3-clause BSD
5
6			import warnings
7			import logging
8			import os
9			import functools
10			from collections import namedtuple
11
12			import numpy as np
			0 ignored issues – show Configuration introduced 2016-06-12 20:23 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
13			import pandas as pd
			0 ignored issues – show Configuration introduced 2016-06-12 20:23 UTC by Report Bug Copy Issue Report The import `pandas` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
14
15			import h5py
			0 ignored issues – show Configuration introduced 2016-06-12 20:23 UTC by Report Bug Copy Issue Report The import `h5py` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
16			from fuel.datasets import H5PYDataset
			0 ignored issues – show Configuration introduced 2016-06-12 20:23 UTC by Report Bug Copy Issue Report The import `fuel.datasets` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
17
18			from ... import filters
19			from ... import descriptors
20			from ... import cross_validation
21			from ... import standardizers
22
23			logger = logging.getLogger(__name__)
			0 ignored issues – show Coding Style Naming introduced 2016-06-12 20:23 UTC by Report Bug Copy Issue Report The name `logger` does not conform to the constant naming conventions (`(([A-Z_][A-Z0-9_]*)|(__.*__))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
24
25
26			Feature = namedtuple('Feature', ['fper', 'key', 'axis_names'])
27
28			DEFAULT_FEATURES = (
29			Feature(fper=descriptors.MorganFingerprinter(),
30			key='X_morg',
31			axis_names=['batch', 'features']),
32			Feature(fper=descriptors.PhysicochemicalFingerprinter(),
33			key='X_pc',
34			axis_names=['batch', 'features']),

richlewis42 / scikit-chem

Push — master ( 4d243d...e4a84f )

Converter.process_splits() B

Complexity

Size

Duplication

Importance

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing __init__.py files

2. Missing __init__.py files

2. Missing __init__.py files

2. Missing __init__.py files