mandos.analysis.prepping - Code Metrics - Inspection of "build(deps): bump pyarrow from 3.0.0 to 5.0.0" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — dependabot/pip/pyarrow-5.0.0 ( 101caa...cfe875 )

unknown

created 2021-08-02 23:40 UTC

mandos.analysis.prepping A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	86
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	68
dl	0
loc	86
rs	10
c	0
b	0
f	0
wmc	11

3 Methods

Rating	Name	Size	Complexity
A	MatrixPrep.ecfp_matrix()	23	4
A	MatrixPrep.from_files()	11	3
A	MatrixPrep.create()	18	4

"""
Preparation of similarity matrices: loading, combining, rescaling, and ECFP-based calculation.
"""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import TypeVar, Mapping, Sequence, List

import numpy as np

import pandas as pd

from typeddfs import BaseDf

from typeddfs.df_errors import UnsupportedOperationError


from mandos import logger
from mandos.analysis.io_defns import SimilarityDfLongForm, SimilarityDfShortForm
from mandos.entries.searcher import InputFrame
from mandos.model.rdkit_utils import RdkitUtils

# Type variable bounded to typeddfs ``BaseDf``; not referenced by the code visible in this module.
T = TypeVar("T", bound=BaseDf)



@dataclass(frozen=True, repr=True)
class MatrixPrep:
    """
    Combines short-form similarity matrices into one long-form frame and rescales the values.

    Attributes:
        kind: Passed as the first argument to ``to_long_form`` for every matrix
        normalize: If true, min-max scale the values to the range [0, 1]
        log: If true, take ``log10`` of the values and then min-max scale the result
        invert: If true, negate the values before any other transformation
    """

    kind: str
    normalize: bool
    log: bool
    invert: bool

    def from_files(self, paths: Sequence[Path]) -> SimilarityDfLongForm:
        """
        Reads one short-form matrix per path and combines them with :meth:`create`.

        Each matrix is keyed by its file name with the final suffix removed.
        Paths that yield the same key overwrite each other; the last one wins.

        Args:
            paths: Files to read with ``SimilarityDfShortForm.read_file``

        Returns:
            The combined and transformed long-form frame

        Raises:
            OSError, UnsupportedOperationError, ValueError: Re-raised unchanged
                (after logging the offending path) if a file fails to load
        """
        dct = {}
        for p in paths:
            key = p.with_suffix("").name
            try:
                dct[key] = SimilarityDfShortForm.read_file(p)
            except (OSError, UnsupportedOperationError, ValueError):
                logger.error(f"Failed to load matrix at {str(p)}")
                raise
        return self.create(dct)

    def create(self, key_to_mx: Mapping[str, SimilarityDfShortForm]) -> SimilarityDfLongForm:
        """
        Converts every matrix to long form, concatenates them, and transforms the ``value`` column.

        The transformations are applied in this order, each only if its flag is set:
        ``invert`` (negation), ``normalize`` (min-max scaling), ``log`` (log10 then min-max scaling).

        Args:
            key_to_mx: Mapping from a key to its short-form matrix; the key is passed
                       as the second argument to ``to_long_form``

        Returns:
            The long-form frame with the transformed ``value`` column
        """
        df = SimilarityDfLongForm(
            pd.concat([mx.to_long_form(self.kind, key) for key, mx in key_to_mx.items()])
        )
        vals = df["value"]
        if self.invert:
            vals = -vals
        if self.normalize:
            vals = self._min_max_scale(vals)
        if self.log:
            # Calculate the log, then normalize again.
            # We can't take the log before normalization because we might have negative values.
            # NOTE(review): log10 maps 0 to -inf (and negatives to NaN), so the smallest
            # normalized value is non-finite after this step -- confirm the desired handling.
            vals = self._min_max_scale(vals.map(np.log10))
        df["value"] = vals
        return SimilarityDfLongForm.convert(df)

    @classmethod
    def ecfp_matrix(cls, df: InputFrame, radius: int, n_bits: int) -> SimilarityDfShortForm:
        """
        Builds a square matrix of Jaccard similarities between ECFP fingerprints.

        One fingerprint is computed per structure from ``df.get_structures()``.
        The resulting matrix is symmetric with 1.0 on the diagonal, and both its
        index and its columns are taken from the ``inchikey`` column of ``df``.

        Args:
            df: Input frame providing ``inchikey`` and ``get_structures()``
            radius: Passed to ``RdkitUtils.ecfp``
            n_bits: Passed to ``RdkitUtils.ecfp``

        Returns:
            The similarity matrix in short form
        """
        keys = df["inchikey"]
        on_bits = [
            RdkitUtils.ecfp(c, radius=radius, n_bits=n_bits).list_on for c in df.get_structures()
        ]
        the_rows = cls._pairwise_jaccard(on_bits)
        short = SimilarityDfShortForm(the_rows)
        short["inchikey"] = keys
        short = short.set_index("inchikey")
        short.columns = keys
        return SimilarityDfShortForm.convert(short)

    @staticmethod
    def _min_max_scale(vals: pd.Series) -> pd.Series:
        """Linearly rescales ``vals`` so that its minimum maps to 0 and its maximum to 1."""
        lo, hi = vals.min(), vals.max()
        span = hi - lo
        if span == 0:
            # All values are equal: avoid 0/0 and map everything to 0.
            return (vals - lo).astype(float)
        return (vals - lo) / span

    @staticmethod
    def _pairwise_jaccard(on_bits: Sequence) -> List[List[float]]:
        """Returns the full symmetric matrix of Jaccard indices between collections of on-bits."""
        sets = [frozenset(bits) for bits in on_bits]
        n = len(sets)
        # Start from all ones so that the diagonal (self-similarity) is already correct.
        rows: List[List[float]] = [[1.0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1, n):
                union = sets[i] | sets[j]
                # Two empty sets are identical, so treat them as fully similar (avoids 0/0).
                score = len(sets[i] & sets[j]) / len(union) if union else 1.0
                # Each pair is computed once and mirrored across the diagonal.
                rows[i][j] = rows[j][i] = score
        return rows


# Names exported by a star-import of this module.
__all__ = ["MatrixPrep"]


1			"""
2			X.
3			"""
4			from __future__ import annotations
5			from dataclasses import dataclass
6			from pathlib import Path
7			from typing import TypeVar, Mapping, Sequence, List
8
9			import numpy as np
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Unable to import 'numpy' Loading history...
10			import pandas as pd
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
11			from typeddfs import BaseDf
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
12			from typeddfs.df_errors import UnsupportedOperationError
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs.df_errors' Loading history...
13
14			from mandos import logger
15			from mandos.analysis.io_defns import SimilarityDfLongForm, SimilarityDfShortForm
16			from mandos.entries.searcher import InputFrame
17			from mandos.model.rdkit_utils import RdkitUtils
18
19			T = TypeVar("T", bound=BaseDf)
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Class name "T" doesn't conform to PascalCase naming style ('[^\\W\\da-z][^\\W_]+$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
20
21
22			@dataclass(frozen=True, repr=True)
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
23			class MatrixPrep:
24			kind: str
25			normalize: bool
26			log: bool
27			invert: bool
28
29			def from_files(self, paths: Sequence[Path]) -> SimilarityDfLongForm:
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
30			dct = {}
31			for p in paths:
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "p" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
32			key = p.with_suffix("").name
33			try:
34			mx = SimilarityDfShortForm.read_file(p)
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "mx" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
35			dct[key] = mx
36			except (OSError, UnsupportedOperationError, ValueError):
37			logger.error(f"Failed to load matrix at {str(p)}")
38			raise
39			return self.create(dct)
40
41			def create(self, key_to_mx: Mapping[str, SimilarityDfShortForm]) -> SimilarityDfLongForm:
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
42			df = SimilarityDfLongForm(
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
43			pd.concat([mx.to_long_form(self.kind, key) for key, mx in key_to_mx.items()])
44			)
45			vals = df["value"]
46			if self.invert:
47			vals = -vals
48			if self.normalize:
49			mn, mx = vals.min(), vals.max()
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "mn" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "mx" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
50			vals = (vals - mn) / (mn - mx)
51			if self.log:
52			# this is a bit stupid, but calc the log then normalize again
53			# we can't take the log before normalization because we might have negative values
54			vals = vals.map(np.log10)
55			mn, mx = vals.min(), vals.max()
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "mn" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Variable name "mx" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
56			vals = (vals - mn) / (mn - mx)
57			df["value"] = vals
58			return SimilarityDfLongForm.convert(df)
59
60			@classmethod
61			def ecfp_matrix(cls, df: InputFrame, radius: int, n_bits: int) -> SimilarityDfShortForm:
			0 ignored issues – show Comprehensibility introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (17/15). Loading history... introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history... Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
62			# TODO: This is inefficient and long
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
63			indices = range(len(df))
64			keys = df["inchikey"]
65			on_bits = [
66			RdkitUtils.ecfp(c, radius=radius, n_bits=n_bits).list_on for c in df.get_structures()
67			]
68			the_rows: List[List[float]] = []
69			for i, row_key, row_print in zip(indices, keys, on_bits):
			0 ignored issues – show Unused Code introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report The variable `row_key` seems to be unused. Loading history...
70			for j, col_key, col_print in zip(indices, keys, on_bits):
			0 ignored issues – show Unused Code introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report The variable `col_key` seems to be unused. Loading history...
71			the_row = []
72			if i < j:
73			jaccard = len(row_print.intersection(col_print)) / len(
74			row_print.union(col_print)
75			)
76			the_row.append(jaccard)
77			the_rows.append(the_row)
78			short = SimilarityDfShortForm(the_rows)
79			short["inchikey"] = keys
80			short = short.set_index("inchikey")
81			short.columns = keys
82			return SimilarityDfShortForm.convert(short)
83
84
85			__all__ = ["MatrixPrep"]
86

dmyersturnbull / mandos

Push — dependabot/pip/pyarrow-5.0.0 ( 101caa...cfe875 )

mandos.analysis.prepping A

Complexity

Size/Duplication

Importance

3 Methods

Duplication Side-by-Side

Filter issues like