NMRShiftDB2Converter.get_spectra() - Code Metrics - richlewis42/scikit-chem - Measure and Improve Code Quality continuously with Scrutinizer

NMRShiftDB2Converter.get_spectra() C
last analyzed 2016-09-01 14:43 UTC

↳ Parent: NMRShiftDB2Converter

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	1
CRAP Score	43.8155

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
c	1
b	0
f	1
dl	0
loc	31
ccs	1
cts	11
cp	0.0909
rs	5.5
cc	7
crap	43.8155

2 Methods

Rating	Name	Duplication	Size	Complexity
A	NMRShiftDB2Converter.is_spectrum()	0	2	2
A	NMRShiftDB2Converter.index_pair()	0	2	1

#! /usr/bin/env python
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
#
# Copyright (C) 2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

import os
import logging
import itertools
from collections import defaultdict

import pandas as pd
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import numpy as np
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
from sklearn import metrics
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3

from .base import Converter, default_pipeline, contiguous_order
from ... import io
from ... import utils
from ...cross_validation import SimThresholdSplit

LOGGER = logging.getLogger(__file__)

class NMRShiftDB2Converter(Converter):

    """ Convert the nmrshiftdb2 SDF file into a dataset of 13c shifts per atom. """

    def __init__(self, directory, output_directory, output_filename='nmrshiftdb2.h5'):

        """ Run the whole conversion of the raw sdf file.

        Reads ``nmrshiftdb2.sdf`` from `directory`, extracts the spectra,
        merges and averages the duplicated ones, keeps the molecules that have
        a 13c spectrum, splits them into train, valid and test sets and passes
        the result to ``self.run``.

        Args:
            directory (str):
                Directory that contains ``nmrshiftdb2.sdf``.
            output_directory (str):
                Directory used to build the output path given to ``self.run``.
            output_filename (str):
                File name used to build the output path given to ``self.run``.
        """

        # NOTE(review): Converter.__init__ is not called here - confirm that
        # this is intentional before adding a super() call.

        output_path = os.path.join(output_directory, output_filename)
        input_path = os.path.join(directory, 'nmrshiftdb2.sdf')
        data = self.parse_data(input_path)

        ys = self.get_spectra(data)
        ys = self.process_spectra(ys)
        ys = self.combine_duplicates(ys)

        # the statistics are logged while the duplicates are still lists,
        # i.e. before they are averaged away
        self.log_dists(ys)
        self.log_duplicates(ys)
        ys = self.squash_duplicates(ys)

        # only the rows that have a 13c spectrum are kept (right join)
        c13s = self.to_frame(ys.loc[ys['13c'].notnull(), '13c'])
        data = data[['structure']].join(c13s, how='right')

        ms, y = data.structure, data.drop('structure', axis=1)

        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        y.columns.name = 'shifts'

        cv = SimThresholdSplit(min_threshold=0.6, block_width=4000, n_jobs=-1).fit(ms)

        # presumably a 70 / 15 / 15 ratio for train / valid / test - TODO confirm
        train, valid, test = cv.split((70, 15, 15))

        (ms, y, train, valid, test) = contiguous_order((ms, y, train, valid, test), (train, valid, test))

        splits = (('train', train), ('valid', valid), ('test', test))

        self.run(ms, y, output_path=output_path, splits=splits)

    @staticmethod
    def parse_data(filepath):

        """ Load the raw sdf file as a frame indexed by integer nmrshiftdb2 id.

        The column names are passed through ``utils.free_to_snail`` and the
        rows are sorted by index before the frame is returned.
        """

        LOGGER.info('Reading file: %s', filepath)
        frame = io.read_sdf(filepath, removeHs=False, warn_bad_mol=False)

        # index the molecules by their database id
        ids = frame['nmrshiftdb2 ID'].astype(int)
        frame.index = ids
        frame.index.name = 'nmrshiftdb2_id'

        renamed = frame.columns.to_series().apply(utils.free_to_snail)
        frame.columns = renamed

        frame = frame.sort_index()
        LOGGER.info('Read %s molecules.', len(frame))
        return frame

    @staticmethod
    def get_spectra(data):

        """ Retrieves spectra from raw data.

        Keeps the columns whose name contains one of the known isotope labels
        and replaces their names by a two level column index.  Each name is
        split on ``_``; the first token is discarded, the second becomes the
        first level and the third, converted to int, the second level.

        Args:
            data (pd.DataFrame):
                The raw data, as returned by `parse_data`.

        Returns:
            pd.DataFrame:
                The spectrum columns only, with a `pd.MultiIndex` as columns.
        """

        LOGGER.info('Retrieving spectra from raw data...')
        isotopes = (
            '1h',
            '11b',
            '13c',
            '15n',
            '17o',
            '19f',
            '29si',
            '31p',
            '33s',
            '73ge',
            '195pt',
        )

        def is_spectrum(col_name):
            """ Whether the column name contains one of the isotope labels. """
            return any(isotope in col_name for isotope in isotopes)

        def index_pair(tokens):
            """ Build the column key from the tokens of a column name. """
            return tokens[0], int(tokens[1])

        spectrum_cols = [c for c in data if is_spectrum(c)]
        spectra = data[spectrum_cols]

        spectra.columns = pd.MultiIndex.from_tuples(
            [index_pair(name.split('_')[1:]) for name in spectra.columns])

        return spectra

    @staticmethod
    def process_spectra(data):

        """ Turn the string representations found in sdf file into a dictionary. """

        def spectrum_dict(spectrum_string):
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
            if not isinstance(spectrum_string, str):
                return np.nan # no spectra are still nan
            if spectrum_string == '':
                return np.nan # empty spectra are nan
            sigs = spectrum_string.strip().strip('|').strip().split('|') # extract signals
            sig_tup = [tuple(s.split(';')) for s in sigs] # take tuples as (signal, coupling, atom)
            return {int(s[2]): float(s[0]) for s in sig_tup} # make spectrum a dictionary of atom to signal


        return data.applymap(spectrum_dict)

    @staticmethod
    def combine_duplicates(data):

        """ Collect duplicate spectra into one dictionary. All shifts are collected into lists. """

        def aggregate_dicts(ds):
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
            res = defaultdict(list)
            for d in ds:

                if not isinstance(d, dict): continue

                for k, v in d.items():

                    res[k].append(v)
            return dict(res) if len(res) else np.nan

        return data.groupby(level=0, axis=1).apply(lambda s: s.apply(aggregate_dicts, axis=1))

    @staticmethod
    def squash_duplicates(data):

        """ Take the mean of all the duplicates.  This is where we could do a bit more checking. """

        def squash(d):
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
            if not isinstance(d, dict):
                return np.nan
            else:
                return {k: np.mean(v) for k, v in d.items()}

        return data.applymap(squash)

    @staticmethod
    def to_frame(data):

        """ Convert a series of dictionaries to a dataframe. """
        res = pd.DataFrame(data.tolist(), index=data.index)
        res.columns.name = 'atom_idx'
        return res

    @staticmethod
    def extract_duplicates(data, kind='13c'):

        """ Get all 13c duplicates.  """

        def is_duplicate(ele):
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
            if not isinstance(ele, dict):
                return False
            else:
                return len(list(ele.values())[0]) > 1

        return data.loc[data[kind].apply(is_duplicate), kind]

    @staticmethod
    def log_dists(data):

        """ Log the number of spectra and the number of shifts per column.

        Args:
            data (pd.DataFrame):
                Frame of spectrum dictionaries (or NaN).
        """

        def n_spect(ele):
            """ Whether the cell holds a spectrum. """
            return isinstance(ele, dict)

        def n_shifts(ele):
            """ The number of entries of the spectrum, 0 if there is none. """
            return len(ele) if isinstance(ele, dict) else 0

        def log_message(func):
            """ Format the column totals of `func` applied to every cell. """
            # column-wise Series.map is the element-wise equivalent of the
            # deprecated DataFrame.applymap and exists in every pandas version
            totals = data.apply(lambda column: column.map(func)).sum().to_dict()
            return '  '.join('{k}: {v}'.format(k=k, v=v) for k, v in totals.items())

        LOGGER.info('Number of spectra: %s', log_message(n_spect))
        LOGGER.info('Extracted shifts: %s', log_message(n_shifts))


    def log_duplicates(self, data):

        """ Log how well the duplicated 1h and 13c spectra agree with each other.

        For both kinds, every pair of values recorded for the same key of a
        duplicated spectrum is collected, and the number of pairs, the MAE,
        the MSE and the r2 score between the pair members are logged.

        Args:
            data (pd.DataFrame):
                Frame of dictionaries mapping a key to a list of values.
        """

        for kind in '1h', '13c':
            dups = self.extract_duplicates(data, kind)
            LOGGER.info('Number of duplicate %s spectra: %s', kind, len(dups))

            # chain.from_iterable avoids the quadratic cost of summing lists
            pairs = list(itertools.chain.from_iterable(
                itertools.combinations(shifts, 2)
                for spectrum in dups
                for shifts in spectrum.values()))

            # the count is an integer, so it is logged with %d rather than %f
            LOGGER.info('Number of duplicate %s pairs: %d', kind, len(pairs))

            if not pairs:
                # nothing to compare; the metrics below need at least one pair
                continue

            res = pd.DataFrame(pairs)
            LOGGER.info('MAE for duplicate %s: %.4f', kind, metrics.mean_absolute_error(res[0], res[1]))
            LOGGER.info('MSE for duplicate %s: %.4f', kind, metrics.mean_squared_error(res[0], res[1]))
            LOGGER.info('r2 for duplicate %s: %.4f', kind, metrics.r2_score(res[0], res[1]))


if __name__ == '__main__':
    # Script entry point: switch on debug level logging, then run the conversion.
    logging.basicConfig(level=logging.DEBUG)
    LOGGER.info('Converting NMRShiftDB2 Dataset...')
    # `convert` is not defined in this class; presumably it is inherited from
    # Converter - TODO confirm.
    NMRShiftDB2Converter.convert()


NMRShiftDB2Converter.get_spectra() C
last analyzed 2016-09-01 14:43 UTC

Complexity

Size

Duplication

Code Coverage

Importance

2 Methods

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1		#! /usr/bin/env python
		0 ignored issues – show Coding Style introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report This module should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions. Loading history...
2		#
3		# Copyright (C) 2016 Rich Lewis <[email protected]>
4		# License: 3-clause BSD
5
6	1	import os
7	1	import logging
8	1	import itertools
9	1	from collections import defaultdict
10
11	1	import pandas as pd
		0 ignored issues – show Configuration introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The import `pandas` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
12	1	import numpy as np
		0 ignored issues – show Configuration introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
13	1	from sklearn import metrics
		0 ignored issues – show Configuration introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The import `sklearn` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
14
15	1	from .base import Converter, default_pipeline, contiguous_order
16	1	from ... import io
17	1	from ... import utils
18	1	from ...cross_validation import SimThresholdSplit
19
20	1	LOGGER = logging.getLogger(__file__)
21
22	1	class NMRShiftDB2Converter(Converter):
		0 ignored issues – show Coding Style introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report This class should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions. Loading history...
23
24	1	def __init__(self, directory, output_directory, output_filename='nmrshiftdb2.h5'):
		0 ignored issues – show Bug introduced 2016-08-03 02:42 UTC by Report Bug Copy Issue Report The `__init__` method of the super-class `Converter` is not called. It is generally advisable to initialize the super-class by calling its `__init__` method: class SomeParent: def __init__(self): self.x = 1 class SomeChild(SomeParent): def __init__(self): # Initialize the super class SomeParent.__init__(self) Loading history... Comprehensibility introduced 2016-08-05 16:20 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (17/15). Loading history...
25
26		output_path = os.path.join(output_directory, output_filename)
27		input_path = os.path.join(directory, 'nmrshiftdb2.sdf')
28		data = self.parse_data(input_path)
29
30		ys = self.get_spectra(data)
		0 ignored issues – show Coding Style Naming introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The name `ys` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{2,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
31		ys = self.process_spectra(ys)
		0 ignored issues – show Coding Style Naming introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The name `ys` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{2,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
32		ys = self.combine_duplicates(ys)

richlewis42 / scikit-chem

NMRShiftDB2Converter.get_spectra() C last analyzed 2016-09-01 14:43 UTC

Complexity

Size

Duplication

Code Coverage

Importance

2 Methods

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

NMRShiftDB2Converter.get_spectra() C
last analyzed 2016-09-01 14:43 UTC

2. Missing __init__.py files

2. Missing __init__.py files

2. Missing __init__.py files