mandos.model.correlation_math - Code Metrics - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

mandos.model.correlation_math A
last analyzed 2021-01-25 23:07 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	241
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	139
dl	0
loc	241
rs	9.92
c	0
b	0
f	0
wmc	31

17 Methods

Rating	Name	Size	Complexity
A	AffinityFunctions.identity()	7	1
A	AffinityFunctions.rho_cm()	13	1
A	AffinityMatrix.index_names()	3	1
A	AffinityMatrix.all_pairs()	16	3
A	AffinityMatrix.from_function()	21	4
A	AffinityMatrix.restrict()	4	1
A	AffinityFunctions.pearson()	23	3
A	AffinityFunctions.random_uniform()	7	1
A	ConfusionMatrix.cols()	4	1
A	AffinityMatrix.cols()	4	1
A	AffinityFunctions.jaccard()	9	2
A	AffinityMatrix.non_self_pairs()	9	1
A	AffinityMatrix.rows()	4	1
A	AffinityMatrix.from_triples()	7	2
A	AffinityFunctions.bit_string_jaccard()	11	2
A	ConfusionMatrix.rows()	4	1
A	AffinityFunctions.negative_minkowski()	32	5

"""
TODO This module will probably be deleted.
"""

from __future__ import annotations
from collections import defaultdict
import logging
from typing import Sequence, Mapping, Set, TypeVar, Callable, SupportsFloat, Optional, Type, List

from typing import Tuple as Tup

import numpy as np

import pandas as pd

from typeddfs import TypedDfs, TypedDf


# Logger obtained under the fixed name "mandos" (not ``__name__``).
logger = logging.getLogger("mandos")
# A pair of labels; used as the key type of the mappings returned by
# ``AffinityMatrix.all_pairs`` and ``AffinityMatrix.non_self_pairs``.
CompoundCompoundPair = Tup[str, str]


# Type of the values compared by the function passed to ``AffinityMatrix.from_function``.
T = TypeVar("T")

# Generic element/argument type used in the ``AffinityFunctions`` signatures.
Z = TypeVar("Z")



class ConfusionMatrix(TypedDf):
    """
    A typed data frame whose index and column labels can be read as strings
    through the ``rows`` and ``cols`` properties.
    """

    @property
    def rows(self):
        """
        Returns:
            The index labels, in order, each converted with ``str``.
        """
        return list(map(str, self.index.tolist()))

    @property
    def cols(self):
        """
        Returns:
            The column labels, in order, each converted with ``str``.
        """
        return list(map(str, self.columns.tolist()))


class AffinityFunctions:
    """
    These are functions that can be passed into ``AffinityMatrix.from_function``.

    Every classmethod is a factory: it returns a two-argument callable
    whose ``__name__`` is set to a short label describing the measure.
    """

    @classmethod
    def jaccard(cls) -> Callable[[Set[Z], Set[Z]], float]:
        """
        Jaccard index between two sets.

        Returns:
            A function returning ``len(a & b) / len(a | b)``,
            or NaN if both sets are empty.
        """

        def x(a_set: Set[Z], b_set: Set[Z]) -> float:
            union = a_set.union(b_set)
            if not union:
                # Both sets are empty: the ratio is undefined.
                return float("NaN")
            return len(a_set.intersection(b_set)) / len(union)

        x.__name__ = "jaccard"
        return x

    @classmethod
    def bit_string_jaccard(cls) -> Callable[[str, str], float]:
        """
        Jaccard index between the sets of "on" positions of two bit strings.

        A position is "on" if ``int(character) == 1``.

        Returns:
            A function returning the Jaccard index of the on-bit positions,
            or NaN if neither string has any on bit (including two empty strings).
        """

        def x(a: str, b: str) -> float:
            on_bits_a = {i for i, v in enumerate(a) if int(v) == 1}
            on_bits_b = {i for i, v in enumerate(b) if int(v) == 1}
            union = on_bits_a.union(on_bits_b)
            if not union:
                # Covers empty strings and all-zero strings alike;
                # the latter used to raise ZeroDivisionError.
                return float("NaN")
            return len(on_bits_a.intersection(on_bits_b)) / len(union)

        x.__name__ = "bit_string_jaccard"
        return x

    @classmethod
    def identity(cls) -> Callable[[str, str], float]:
        """
        Exact-match affinity.

        Returns:
            A function returning 1.0 if ``a == b`` and 0.0 otherwise.
        """

        def x(a: str, b: str) -> float:
            return float(a == b)

        x.__name__ = "identity"
        return x

    @classmethod
    def random_uniform(cls, state: np.random.RandomState) -> Callable[[Z, Z], float]:
        """
        Random affinity that ignores its arguments.

        Args:
            state: The random state that each call draws from

        Returns:
            A function returning ``state.uniform(0, 1)`` on every call.
        """

        def x(a: Z, b: Z) -> float:
            return state.uniform(0, 1)

        x.__name__ = "random_uniform"
        return x

    @classmethod
    def rho_cm(cls, df: ConfusionMatrix) -> Callable[[str, str], float]:
        """
        Affinity derived from a confusion matrix.

        For a matrix ``d``, the one-directional score of ``(a, b)`` is ``d[a, b]``
        divided by the sum of row ``a`` over columns other than ``a``,
        and then by the sum of column ``b`` over rows other than ``b``.
        The returned function averages that score over ``df`` and its transpose.

        Args:
            df: The confusion matrix to look labels up in

        Returns:
            A function of two labels, named ``"rho_cm"``.
        """

        def x(d: ConfusionMatrix, a: str, b: str) -> float:
            ij = d.at[a, b]
            # Builtin ``sum`` is what ``np.sum`` falls back to for a generator,
            # but without the deprecation warning.
            a_off_diag = sum(d.at[a, j] for j in d.cols if j != a)
            b_off_diag = sum(d.at[i, b] for i in d.rows if i != b)
            return ij / a_off_diag / b_off_diag

        def qq(a: str, b: str) -> float:
            return 0.5 * x(df, a, b) + 0.5 * x(ConfusionMatrix(df.transpose()), a, b)

        # Label the function that is actually returned (previously the helper ``x`` was labeled).
        qq.__name__ = "rho_cm"
        return qq

    @classmethod
    def pearson(
        cls, weights: Optional[Sequence[SupportsFloat]] = None
    ) -> Callable[[Sequence[SupportsFloat], Sequence[SupportsFloat]], float]:
        """
        Pearson correlation coefficient, possibly weighted.

        Both inputs are standardized with their (unweighted) mean and standard deviation;
        the result is the average of the element-wise products, weighted if ``weights`` is given.

        Args:
            weights: Optional per-element weights for the final average;
                     must have the same length as the compared sequences

        Returns:
            A function named ``"pearson"`` (no weights) or ``"weighted-pearson"``.
        """
        is_weighted = weights is not None
        # ``np.average`` needs ``weights=None`` for the unweighted case;
        # a scalar such as 1.0 raises TypeError because its shape differs from the data.
        weight_array = np.array(weights, dtype=np.float64) if is_weighted else None

        def x(a: Sequence[SupportsFloat], b: Sequence[SupportsFloat]) -> float:
            a, b = np.array(a, dtype=np.float64), np.array(b, dtype=np.float64)
            a = (a - a.mean()) / a.std()
            b = (b - b.mean()) / b.std()
            return np.average(a * b, weights=weight_array)

        # Decide on the flag, not on ``weights == 1.0``: comparing an array
        # yields an array whose truth value is ambiguous.
        x.__name__ = "weighted-pearson" if is_weighted else "pearson"
        return x

    @classmethod
    def negative_minkowski(
        cls, order: float, weights: Optional[Sequence[SupportsFloat]] = None
    ) -> Callable[[Sequence[SupportsFloat], Sequence[SupportsFloat]], float]:
        """
        Negated Minkowski distance, possibly weighted.

        If ``order==0``, this is defined to return the number of elements that differ (number of nonzero),
        negated; weights are not used in that case.
        If ``order`` is positive infinity, the negated maximum weighted absolute difference is returned.

        Args:
            order: The order (p) of the Minkowski distance
            weights: Optional per-element multipliers applied to the absolute differences

        Returns:
            A function named ``"minkowski<order>"`` (no weights) or ``"weighted-minkowski<order>"``.
        """
        is_weighted = weights is not None
        # A scalar 1.0 is a harmless no-op multiplier when no weights are given.
        weights = np.array(weights, dtype=np.float64) if is_weighted else 1.0

        def x(a: Sequence[SupportsFloat], b: Sequence[SupportsFloat]) -> float:
            a, b = np.array(a, dtype=np.float64), np.array(b, dtype=np.float64)
            if np.isposinf(order):
                return -np.max(np.abs(a - b) * weights)
            elif order == 0:
                return -float(np.count_nonzero(a - b))
            else:
                return -np.float_power(
                    np.sum(np.float_power(np.abs(a - b) * weights, order)), 1 / order
                )

        # Decide on the flag, not on ``weights == 1.0``: comparing an array
        # yields an array whose truth value is ambiguous.
        x.__name__ = ("weighted-minkowski" if is_weighted else "minkowski") + str(order)
        return x


class AffinityMatrix(TypedDf):
    """
    An affinity matrix of compound by compound.
    It has a single index, which is compound A,
    and the column labels are the compound B.
    """

    @classmethod
    def index_names(cls) -> List[str]:
        """
        Returns:
            The names of the index levels, which is just ``"name"``.
        """
        return ["name"]

    # @classmethod
    # def must_be_symmetric(cls) -> bool:
    #    return True

    @property
    def rows(self) -> List[str]:
        """
        Returns:
            The index labels, in order, each converted with ``str``.
        """
        return [str(s) for s in self.index.tolist()]

    @property
    def cols(self) -> List[str]:
        """
        Returns:
            The column labels, in order, each converted with ``str``.
        """
        return [str(s) for s in self.columns.tolist()]

    def restrict(self, to: Set[str]) -> AffinityMatrix:
        """
        Keeps only the rows and the columns whose labels are in ``to``.

        Args:
            to: The labels to keep

        Returns:
            A new AffinityMatrix containing the selected rows and columns.
        """
        # ``name`` is declared as the index (see ``index_names``), and plain pandas
        # column lookup does not see index levels; fall back to a column lookup
        # only if the frame still carries ``name`` as an ordinary column.
        if "name" in self.index.names:
            names = self.index.get_level_values("name")
        else:
            names = self["name"]
        df = self[names.isin(to)]
        df = df[[v for v in df.columns if v in to]]
        return AffinityMatrix.convert(df)

    def non_self_pairs(self) -> Mapping[CompoundCompoundPair, float]:
        """
        Gets the STRICT lower-triangular matrix, which excludes comparisons where the two labels are the same.

        The result is (n choose 2) matrix elements.

        Returns:
            A dict from comparison labels to their values in the matrix (i.e. (i, j)).
        """
        return {(a, b): value for (a, b), value in self.all_pairs().items() if a != b}

    def all_pairs(self) -> Mapping[CompoundCompoundPair, float]:
        """
        Gets the (non-strict) lower-triangular matrix.
        The result is (n(n+1)/2) matrix elements.

        Labels are converted with ``str`` and stripped of surrounding whitespace.

        Returns:
            A dict from comparison labels to their values in the matrix (i.e. (i, j)).
        """
        row_labels = [str(v).strip() for v in self.index.values]
        col_labels = [str(v).strip() for v in self.columns.values]
        values = {}
        for i, a in enumerate(row_labels):
            # Columns 0..i of row i form the lower triangle, diagonal included.
            for j in range(i + 1):
                # use (b, a) to get lower triangle
                values[(col_labels[j], a)] = self.iat[i, j]
        return values

    @classmethod
    def from_triples(cls, df: pd.DataFrame) -> AffinityMatrix:
        """
        Builds a Jaccard affinity matrix from a table of triples.

        Each row contributes the string ``object_id + "," + predicate`` to the set
        of its ``compound_lookup``; compounds are then compared by the Jaccard index of those sets.

        Args:
            df: A data frame with columns ``compound_lookup``, ``object_id``, and ``predicate``

        Returns:
            The compound-by-compound affinity matrix.
        """
        # TODO this function doesn't belong here
        dct = defaultdict(set)
        for row in df.itertuples():
            dct[row.compound_lookup].add(row.object_id + "," + row.predicate)
        return AffinityMatrix.from_function(dct, AffinityFunctions.jaccard())

    @classmethod
    def from_function(cls, items: Mapping[str, T], fn: Callable[[T, T], float]) -> AffinityMatrix:
        """
        Builds a matrix by applying ``fn`` to every ordered pair of items.

        Args:
            items: A mapping from label to the value passed to ``fn``
            fn: A function of two values returning their affinity;
                called as ``fn(row_value, column_value)``

        Returns:
            An AffinityMatrix indexed by ``name`` with one float64 column per label.
        """
        rows = []
        for a_label, a_value in items.items():
            rows.append(
                pd.Series(
                    {
                        "name": str(a_label),
                        **{
                            str(b_label): fn(a_value, b_value) for b_label, b_value in items.items()
                        },
                    }
                )
            )
        df = pd.DataFrame(rows)  # .astype(np.float64)

        def strip_float_suffix(v) -> str:
            # Only drop a trailing ".0"; replacing every ".0" would corrupt
            # labels such as "1.05" (which used to become "15").
            s = str(v)
            return s[:-2] if s.endswith(".0") else s

        df["name"] = df["name"].map(strip_float_suffix).astype(str)
        df = df.set_index("name")
        afm = AffinityMatrix.convert(df)
        # Cast column by column so that ``afm`` keeps the type returned by ``convert``.
        for c in afm.columns:
            afm[c] = afm[c].astype(np.float64)
        return afm


# Names exported by ``from ... import *``.
# NOTE(review): ``ConfusionMatrix`` is defined in this module and is the parameter
# type of ``AffinityFunctions.rho_cm``, but is not listed here — confirm whether that is intended.
__all__ = [
    "AffinityMatrix",
    "AffinityFunctions",
]


1			"""
2			TODO This module will probably be deleted.
3			"""
4
5			from __future__ import annotations
6			from collections import defaultdict
7			import logging
8			from typing import Sequence, Mapping, Set, TypeVar, Callable, SupportsFloat, Optional, Type, List
			Issue (Unused Code, introduced 2021-01-25 23:06 UTC): Unused Type imported from typing.
9			from typing import Tuple as Tup
10
11			import numpy as np
			Issue (introduced 2021-01-25 23:06 UTC): Unable to import 'numpy'.
12			import pandas as pd
			Issue (introduced 2021-01-25 23:06 UTC): Unable to import 'pandas'.
13			from typeddfs import TypedDfs, TypedDf
			Issue (introduced 2021-01-25 23:06 UTC): Unable to import 'typeddfs'. Issue (Unused Code, introduced 2021-01-25 23:06 UTC): Unused TypedDfs imported from typeddfs.
14
15			logger = logging.getLogger("mandos")
16			CompoundCompoundPair = Tup[str, str]
17
18
19			T = TypeVar("T")
			Issue (Coding Style / Naming, introduced 2021-01-25 23:06 UTC): Class name "T" doesn't conform to PascalCase naming style ('[^\\W\\da-z][^\\W_]+$' pattern). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site.
20			Z = TypeVar("Z")
			Issue (Coding Style / Naming, introduced 2021-01-25 23:06 UTC): Class name "Z" doesn't conform to PascalCase naming style ('[^\\W\\da-z][^\\W_]+$' pattern). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site.
21
22
23			class ConfusionMatrix(TypedDf):
			Issue (introduced 2021-01-25 23:06 UTC): Missing class docstring.
24			@property
25			def rows(self):
			Issue (Documentation, introduced 2021-01-25 23:06 UTC): Empty method docstring.
26			""""""
27			return [str(s) for s in self.index.tolist()]
28
29			@property
30			def cols(self):
			Issue (Documentation, introduced 2021-01-25 23:06 UTC): Empty method docstring.
31			""""""

dmyersturnbull / mandos

mandos.model.correlation_math A last analyzed 2021-01-25 23:07 UTC

Complexity

Size/Duplication

Importance

17 Methods

Duplication Side-by-Side

Filter issues like

mandos.model.correlation_math A
last analyzed 2021-01-25 23:07 UTC