NMRShiftDB2Converter - Code Metrics - richlewis42/scikit-chem - Measure and Improve Code Quality continuously with Scrutinizer

NMRShiftDB2Converter B
last analyzed 2016-09-01 14:43 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	168
Duplicated Lines	0 %

Test Coverage

Coverage

12.5%

Importance

Changes	1
Bugs	0	Features	1

Metric	Value
wmc	40
c	1
b	0
f	1
dl	0
loc	168
ccs	12
cts	96
cp	0.125
rs	8.2608

19 Methods

Rating	Name	Size	Complexity
B	log_dists()	14	6
A	is_spectrum()	2	2
B	__init__()	28	1
A	index_pair()	2	1
A	is_duplicate()	5	2
B	combine_duplicates()	14	7
A	squash_duplicates()	12	4
A	log_duplicates()	10	4
A	log_message()	2	2
B	process_spectra()	15	6
B	aggregate_dicts()	7	5
A	squash()	5	3
A	parse_data()	13	1
B	spectrum_dict()	8	5
A	to_frame()	7	1
A	n_shifts()	2	2
C	get_spectra()	31	7
A	n_spect()	2	1
A	extract_duplicates()	12	3

How to fix Complexity

#! /usr/bin/env python
# NOTE(review): this is the linter's generic docstring example (the same
# snippet is quoted in the "This module should have a docstring" advice
# further down); it does not appear to belong to the converter module itself.
class SomeClass:
    def some_method(self):
        """Do x and return foo."""
#
# Copyright (C) 2016 Rich Lewis <[email protected]>
# License: 3-clause BSD

import os
import logging
import itertools
from collections import defaultdict

import pandas as pd
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3
import numpy as np
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3
from sklearn import metrics
# .scrutinizer.yml
# before_commands:
#     - sudo pip install abc # Python2
#     - sudo pip3 install abc # Python3

from .base import Converter, default_pipeline, contiguous_order
from ... import io
from ... import utils
from ...cross_validation import SimThresholdSplit

# Module-level logger shared by every method of the converter.
# NOTE(review): the logger is named after ``__file__`` (a path) rather than
# the conventional ``__name__`` -- confirm this is intentional.
LOGGER = logging.getLogger(__file__)

class NMRShiftDB2Converter(Converter):
    """Convert the raw NMRShiftDB2 SDF file into a dataset of 13C shifts.

    Constructing an instance performs the whole conversion: the SDF file is
    parsed, the per-isotope spectra are extracted and parsed into
    ``{atom index: shift}`` dictionaries, duplicate spectra of the same
    isotope are merged and averaged, and the 13C shifts are handed, together
    with the structures and a train/valid/test split, to ``self.run``.
    """

    def __init__(self, directory, output_directory, output_filename='nmrshiftdb2.h5'):
        """Run the conversion.

        Args:
            directory: Directory containing the input file
                ``nmrshiftdb2.sdf``.
            output_directory: Directory part of the output path.
            output_filename: File name part of the output path.
        """
        # NOTE(review): ``Converter.__init__`` is not invoked here -- confirm
        # that the base class needs no initialisation before ``self.run``.
        output_path = os.path.join(output_directory, output_filename)
        input_path = os.path.join(directory, 'nmrshiftdb2.sdf')
        data = self.parse_data(input_path)

        ys = self.get_spectra(data)
        ys = self.process_spectra(ys)
        ys = self.combine_duplicates(ys)

        # Statistics are logged while the duplicates are still kept as lists,
        # i.e. before they are averaged away by ``squash_duplicates``.
        self.log_dists(ys)
        self.log_duplicates(ys)
        ys = self.squash_duplicates(ys)

        # Only molecules with a 13C spectrum are kept (right join on the
        # shift table).
        c13s = self.to_frame(ys.loc[ys['13c'].notnull(), '13c'])
        data = data[['structure']].join(c13s, how='right')

        ms, y = data.structure, data.drop('structure', axis=1)

        pipeline = default_pipeline()
        ms, y = pipeline.transform_filter(ms, y)

        y.columns.name = 'shifts'

        cv = SimThresholdSplit(min_threshold=0.6, block_width=4000, n_jobs=-1).fit(ms)

        # Split sizes are given as percentages: 70 / 15 / 15.
        train, valid, test = cv.split((70, 15, 15))

        (ms, y, train, valid, test) = contiguous_order(
            (ms, y, train, valid, test), (train, valid, test))

        splits = (('train', train), ('valid', valid), ('test', test))

        self.run(ms, y, output_path=output_path, splits=splits)

    @staticmethod
    def parse_data(filepath):
        """Read the raw SDF data file.

        Args:
            filepath: Path of the SDF file to read.

        Returns:
            A frame indexed by the integer ``nmrshiftdb2 ID`` (index name
            ``nmrshiftdb2_id``), sorted by that index, whose column names
            have been passed through ``utils.free_to_snail``.
        """
        LOGGER.info('Reading file: %s', filepath)
        data = io.read_sdf(filepath, removeHs=False, warn_bad_mol=False)
        data.index = data['nmrshiftdb2 ID'].astype(int)
        data.index.name = 'nmrshiftdb2_id'
        data.columns = data.columns.to_series().apply(utils.free_to_snail)
        data = data.sort_index()
        LOGGER.info('Read %s molecules.', len(data))
        return data

    @staticmethod
    def get_spectra(data):
        """Retrieve the spectrum columns from the raw data.

        A column counts as a spectrum when its name contains one of the
        known isotope labels.  Spectrum column names are expected to look
        like ``<prefix>_<isotope>_<n>``; they are replaced by a two-level
        column index ``(isotope, n)`` with ``n`` converted to ``int``.

        Args:
            data: Frame as returned by ``parse_data``.

        Returns:
            The spectrum columns of ``data`` under the two-level index.
        """
        LOGGER.info('Retrieving spectra from raw data...')
        isotopes = (
            '1h',
            '11b',
            '13c',
            '15n',
            '17o',
            '19f',
            '29si',
            '31p',
            '33s',
            '73ge',
            '195pt',
        )

        def is_spectrum(col_name):
            """Return True if the column name mentions a known isotope."""
            return any(isotope in col_name for isotope in isotopes)

        def index_pair(parts):
            """Turn ``[isotope, n, ...]`` into the key ``(isotope, int(n))``."""
            return parts[0], int(parts[1])

        spectrum_cols = [c for c in data if is_spectrum(c)]
        data = data[spectrum_cols]

        # Drop the leading name part and keep (isotope, running number).
        data.columns = pd.MultiIndex.from_tuples(
            [index_pair(name.split('_')[1:]) for name in data.columns])

        return data

    @staticmethod
    def process_spectra(data):
        """Turn the string spectra found in the SDF file into dictionaries.

        Args:
            data: Frame of spectrum strings as returned by ``get_spectra``.

        Returns:
            A frame of the same shape whose cells are either
            ``{atom index: shift}`` dictionaries or NaN.
        """

        def spectrum_dict(spectrum_string):
            """Parse one ``shift;coupling;atom|...`` string into a dict.

            Non-string cells and strings with no signals yield NaN.
            """
            if not isinstance(spectrum_string, str):
                return np.nan  # missing spectra stay NaN
            # Remove surrounding whitespace and the leading/trailing
            # separators before splitting into signals.
            cleaned = spectrum_string.strip().strip('|').strip()
            if not cleaned:
                # Empty, whitespace-only or separator-only spectra carry no
                # signals; treating them as NaN avoids an IndexError below.
                return np.nan
            # Each signal is a (shift, coupling, atom) triple.
            signals = [tuple(sig.split(';')) for sig in cleaned.split('|')]
            # Map atom index -> shift.
            return {int(sig[2]): float(sig[0]) for sig in signals}

        return data.applymap(spectrum_dict)

    @staticmethod
    def combine_duplicates(data):
        """Collect duplicate spectra of each isotope into one dictionary.

        The columns are grouped by their first index level (the isotope) and
        each row's dictionaries are merged so that every atom index maps to
        the list of all shifts recorded for it.

        Args:
            data: Frame of dictionaries as returned by ``process_spectra``.

        Returns:
            A frame with one column per isotope whose cells are
            ``{atom index: [shift, ...]}`` dictionaries, or NaN when the row
            has no spectrum for that isotope.
        """

        def aggregate_dicts(dicts):
            """Merge several ``{atom: shift}`` dicts into ``{atom: [shifts]}``."""
            res = defaultdict(list)
            for d in dicts:
                if not isinstance(d, dict):
                    continue  # NaN placeholder, nothing to merge
                for atom, shift in d.items():
                    res[atom].append(shift)
            return dict(res) if res else np.nan

        return data.groupby(level=0, axis=1).apply(
            lambda s: s.apply(aggregate_dicts, axis=1))

    @staticmethod
    def squash_duplicates(data):
        """Replace each list of duplicate shifts by its mean.

        This is where we could do a bit more checking.

        Args:
            data: Frame as returned by ``combine_duplicates``.

        Returns:
            A frame of the same shape whose cells are
            ``{atom index: mean shift}`` dictionaries or NaN.
        """

        def squash(d):
            """Average the shift lists of one spectrum; non-dicts become NaN."""
            if not isinstance(d, dict):
                return np.nan
            return {atom: np.mean(shifts) for atom, shifts in d.items()}

        return data.applymap(squash)

    @staticmethod
    def to_frame(data):
        """Convert a series of dictionaries to a dataframe.

        Args:
            data: Series whose values are dictionaries.

        Returns:
            A frame with the index of ``data`` and one column per dictionary
            key; the column axis is named ``atom_idx``.
        """
        res = pd.DataFrame(data.tolist(), index=data.index)
        res.columns.name = 'atom_idx'
        return res

    @staticmethod
    def extract_duplicates(data, kind='13c'):
        """Get all spectra of the given kind that have duplicates.

        Args:
            data: Frame as returned by ``combine_duplicates``.
            kind: Isotope column to inspect.

        Returns:
            The entries of column ``kind`` flagged as duplicated.
        """

        def is_duplicate(ele):
            """Return True if the first shift list holds more than one value."""
            if not isinstance(ele, dict):
                return False
            # NOTE(review): only the first atom's shift list is inspected;
            # confirm every atom of a duplicated spectrum has several shifts.
            return len(list(ele.values())[0]) > 1

        return data.loc[data[kind].apply(is_duplicate), kind]

    @staticmethod
    def log_dists(data):
        """Log the number of spectra and of extracted shifts per column.

        Args:
            data: Frame whose cells are dictionaries or NaN.
        """

        def n_spect(ele):
            """Return True for a cell holding a spectrum (a dict)."""
            return isinstance(ele, dict)

        def n_shifts(ele):
            """Return the number of shifts in a cell, 0 for non-dicts."""
            return len(ele) if isinstance(ele, dict) else 0

        def log_message(func):
            """Format the per-column sum of ``func`` as ``'col: total'`` pairs."""
            totals = data.applymap(func).sum().to_dict()
            return '  '.join('{k}: {v}'.format(k=k, v=v) for k, v in totals.items())

        LOGGER.info('Number of spectra: %s', log_message(n_spect))
        LOGGER.info('Extracted shifts: %s', log_message(n_shifts))

    def log_duplicates(self, data):
        """Log agreement statistics between duplicate 1H and 13C shifts.

        For each kind, every pair of shifts recorded for the same atom is
        compared and the MAE, MSE and r2 between the pair members are logged.

        Args:
            data: Frame as returned by ``combine_duplicates``.
        """
        for kind in '1h', '13c':
            dups = self.extract_duplicates(data, kind)
            LOGGER.info('Number of duplicate %s spectra: %s', kind, len(dups))

            # All 2-combinations of the shifts recorded for each atom,
            # flattened in one linear pass.
            pairs = list(itertools.chain.from_iterable(
                itertools.combinations(shifts, 2)
                for spectrum in dups
                for shifts in spectrum.values()))

            LOGGER.info('Number of duplicate %s pairs: %d', kind, len(pairs))
            if not pairs:
                # Without pairs the frame below has no columns 0 and 1 and
                # the metrics are undefined.
                continue

            res = pd.DataFrame(pairs)
            LOGGER.info('MAE for duplicate %s: %.4f', kind,
                        metrics.mean_absolute_error(res[0], res[1]))
            LOGGER.info('MSE for duplicate %s: %.4f', kind,
                        metrics.mean_squared_error(res[0], res[1]))
            LOGGER.info('r2 for duplicate %s: %.4f', kind,
                        metrics.r2_score(res[0], res[1]))


# Script entry point: configure logging, then start the conversion.
if __name__ == '__main__':
    # DEBUG level so that the converter's INFO progress messages are emitted.
    logging.basicConfig(level=logging.DEBUG)
    LOGGER.info('Converting NMRShiftDB2 Dataset...')
    # NOTE(review): ``convert`` is not defined in this file; presumably it is
    # inherited from ``Converter`` -- confirm it supplies the constructor
    # arguments.
    NMRShiftDB2Converter.convert()


NMRShiftDB2Converter B
last analyzed 2016-09-01 14:43 UTC

Complexity

Size/Duplication

Test Coverage

Importance

19 Methods

How to fix Complexity

Complex Class

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1		#! /usr/bin/env python
		0 ignored issues – show Coding Style introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report This module should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions. Loading history...
2		#
3		# Copyright (C) 2016 Rich Lewis <[email protected]>
4		# License: 3-clause BSD
5
6	1	import os
7	1	import logging
8	1	import itertools
9	1	from collections import defaultdict
10
11	1	import pandas as pd
		0 ignored issues – show Configuration introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The import `pandas` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
12	1	import numpy as np
		0 ignored issues – show Configuration introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
13	1	from sklearn import metrics
		0 ignored issues – show Configuration introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The import `sklearn` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
14
15	1	from .base import Converter, default_pipeline, contiguous_order
16	1	from ... import io
17	1	from ... import utils
18	1	from ...cross_validation import SimThresholdSplit
19
20	1	LOGGER = logging.getLogger(__file__)
21
22	1	class NMRShiftDB2Converter(Converter):
		0 ignored issues – show Coding Style introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report This class should have a docstring. The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods: class SomeClass: def some_method(self): """Do x and return foo.""" If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions. Loading history...
23
24	1	def __init__(self, directory, output_directory, output_filename='nmrshiftdb2.h5'):
		0 ignored issues – show Bug introduced 2016-08-03 02:42 UTC by Report Bug Copy Issue Report The `__init__` method of the super-class `Converter` is not called. It is generally advisable to initialize the super-class by calling its `__init__` method: class SomeParent: def __init__(self): self.x = 1 class SomeChild(SomeParent): def __init__(self): # Initialize the super class SomeParent.__init__(self) Loading history... Comprehensibility introduced 2016-08-05 16:20 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (17/15). Loading history...
25
26		output_path = os.path.join(output_directory, output_filename)
27		input_path = os.path.join(directory, 'nmrshiftdb2.sdf')
28		data = self.parse_data(input_path)
29
30		ys = self.get_spectra(data)
		0 ignored issues – show Coding Style Naming introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The name `ys` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{2,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
31		ys = self.process_spectra(ys)
		0 ignored issues – show Coding Style Naming introduced 2016-07-18 17:48 UTC by Report Bug Copy Issue Report The name `ys` does not conform to the variable naming conventions (`[a-z_][a-z0-9_]{2,30}$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
32		ys = self.combine_duplicates(ys)

richlewis42 / scikit-chem

NMRShiftDB2Converter B last analyzed 2016-09-01 14:43 UTC

Complexity

Size/Duplication

Test Coverage

Importance

19 Methods

How to fix Complexity

Complex Class

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

NMRShiftDB2Converter B
last analyzed 2016-09-01 14:43 UTC

2. Missing __init__.py files

2. Missing __init__.py files

2. Missing __init__.py files