Passed
Push — main ( 65730f...fad324 )
by Douglas
06:54 queued 02:27
created

mandos.analysis.distances._Inf.got()   A

Complexity

Conditions 2

Size

Total Lines 11
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 10
nop 4
dl 0
loc 11
rs 9.9
c 0
b 0
f 0
1
"""
2
Calculations of concordance between annotations.
3
"""
4
import abc
5
import enum
6
import math
7
import time
8
from collections import defaultdict
9
from typing import Collection, Sequence, Type, Union
10
11
import decorateme
0 ignored issues
show
introduced by
Unable to import 'decorateme'
Loading history...
12
import numpy as np
0 ignored issues
show
introduced by
Unable to import 'numpy'
Loading history...
13
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
14
from pocketutils.core.chars import Chars
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.chars'
Loading history...
15
from pocketutils.core.enums import CleverEnum
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.enums'
Loading history...
16
from pocketutils.tools.unit_tools import UnitTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.unit_tools'
Loading history...
17
18
from mandos.analysis import AnalysisUtils as Au
19
from mandos.analysis.io_defns import SimilarityDfLongForm, SimilarityDfShortForm
20
from mandos.model.hits import AbstractHit
21
22
# note that most of these math functions are much faster than their numpy counterparts
23
# if we're not broadcasting, it's almost always better to use them
24
# some are more accurate, too
25
# e.g. we're using fsum rather than sum
26
from mandos.model.utils.setup import logger
27
28
29
@decorateme.auto_repr_str()
class MatrixCalculator(metaclass=abc.ABCMeta):
    """
    Base class for algorithms that compute similarities from annotation hits.

    Subclasses are expected to override :meth:`calc_all`.
    """

    def calc_all(self, hits: Sequence[AbstractHit]) -> SimilarityDfLongForm:
        """
        Calculates similarities from a sequence of hits.

        Args:
            hits: The hits to compare

        Returns:
            A long-form similarity data frame

        Raises:
            NotImplementedError: Always, unless a subclass overrides this method
        """
        # ``NotImplemented`` is the binary-operator sentinel and is not callable;
        # ``NotImplemented()`` raised a confusing TypeError instead of the intended error
        raise NotImplementedError()
33
34
35
class _Inf:
    """
    Bookkeeping for a pairwise calculation: which pairs were seen, and progress logging.
    """

    def __init__(self, n: int):
        """
        Args:
            n: Expected total number of pairs; only used in the log message
        """
        self.n = n
        # pairs recorded so far (as passed to ``got``), start time, and running "nonzero" count
        self.used, self.t0, self.nonzeros = set(), time.monotonic(), 0

    def is_used(self, c1: str, c2: str) -> bool:
        """
        Returns True if the pair was already recorded, in either order.
        """
        return (c1, c2) in self.used or (c2, c1) in self.used

    def got(self, c1: str, c2: str, z: float) -> None:
        """
        Records the pair ``(c1, c2)`` with value ``z`` and logs progress every 5,000 pairs.

        A pair counts toward ``nonzeros`` only if ``c1 != c2`` and ``0 < z < 1``.

        Args:
            c1: First item of the pair
            c2: Second item of the pair
            z: The value calculated for the pair
        """
        self.used.add((c1, c2))
        # NaN fails both comparisons, so no separate NaN check is needed
        self.nonzeros += int(c1 != c2 and 0 < z < 1)
        count = self.i
        if count % 5000 == 0:
            # rarer milestones are logged at more prominent levels;
            # any multiple of 5,000 is also a multiple of 1,000, hence the "debug" fallback
            if count % 50000 == 0:
                level = "success"
            elif count % 10000 == 0:
                level = "info"
            else:
                level = "debug"
            self.log(level)

    @property
    def i(self) -> int:
        """
        The number of distinct pairs recorded so far.
        """
        return len(self.used)

    def _nonzero_percent(self) -> float:
        """
        Returns the percentage of recorded pairs counted as nonzero, or 0.0 if none were recorded.
        """
        recorded = self.i
        if recorded == 0:
            # avoids ZeroDivisionError when logging before any pair was recorded
            return 0.0
        return self.nonzeros / recorded * 100

    def log(self, level: str) -> None:
        """
        Logs the number of processed pairs, the elapsed time, and the nonzero count.

        Args:
            level: Name of the log level; upper-cased before being passed to the logger
        """
        delta = UnitTools.delta_time_to_str(time.monotonic() - self.t0, space=Chars.narrownbsp)
        logger.log(
            level.upper(),
            f"Processed {self.i:,}/{self.n:,} pairs in {delta};"
            + f" {self.nonzeros:,} ({self._nonzero_percent():.1f}%) are nonzero",
        )
66
67
68
class JPrimeMatrixCalculator(MatrixCalculator):
    """
    Calculates pairwise compound similarities ("J") per search key.

    For each pair of compounds, the per-data-source scores from :meth:`_jx`
    are averaged over the data sources that both compounds have hits in.
    """

    def calc_all(self, hits: Sequence[AbstractHit]) -> SimilarityDfLongForm:
        """
        Calculates one similarity matrix per search key and stacks them in long form.

        Args:
            hits: The hits to compare; grouped by their ``search_key``

        Returns:
            The concatenated long-form data frames, one per key, with kind ``"psi"``

        Raises:
            ValueError: From ``pd.concat`` if there are no keys
                        (presumably when ``hits`` is empty -- TODO confirm)
        """
        key_to_hit = Au.hit_multidict(hits, "search_key")
        logger.notice(f"Calculating J on {len(key_to_hit)} keys from {len(hits)} hits")
        dfs = []
        for key, key_hits in key_to_hit.items():
            df: SimilarityDfShortForm = self.calc_one(key, key_hits)
            dfs.append(df.to_long_form(kind="psi", key=key))
        return SimilarityDfLongForm(pd.concat(dfs))

    def calc_one(self, key: str, hits: Sequence[AbstractHit]) -> SimilarityDfShortForm:
        """
        Calculates the similarity of every pair of compounds for a single key.

        Each unordered pair is calculated once; a compound paired with itself gets 1.0.

        Args:
            key: The search key; only used in the log message
            hits: The hits for that key; grouped by their ``origin_inchikey``

        Returns:
            A short-form data frame built from the nested ``{c1: {c2: value}}`` dict
        """
        ik2hits = Au.hit_multidict(hits, "origin_inchikey")
        logger.info(f"Calculating J on {key} for {len(ik2hits)} compounds and {len(hits)} hits")
        data = defaultdict(dict)
        n_compounds = len(ik2hits)
        # every unordered pair is visited once, *including* each compound with itself,
        # so the total is k(k+1)/2; integer arithmetic avoids float rounding for large k
        inf = _Inf(n=n_compounds * (n_compounds + 1) // 2)
        for c1, hits1 in ik2hits.items():
            for c2, hits2 in ik2hits.items():
                if inf.is_used(c1, c2):
                    # already calculated as (c2, c1)
                    continue
                z = 1.0 if c1 == c2 else self._j_prime(hits1, hits2)
                data[c1][c2] = z
                inf.got(c1, c2, z)
        inf.log("notice")
        return SimilarityDfShortForm.from_dict(data)

    def _j_prime(self, hits1: Collection[AbstractHit], hits2: Collection[AbstractHit]) -> float:
        """
        Averages :meth:`_jx` over the data sources shared by both collections of hits.

        Returns:
            0.0 if either collection is empty, NaN if they share no data source,
            and otherwise the mean of the per-source scores
        """
        if len(hits1) == 0 or len(hits2) == 0:
            return 0.0
        sources = {h.data_source for h in hits1} & {h.data_source for h in hits2}
        if len(sources) == 0:
            return np.nan
        values = [
            self._jx(
                [h for h in hits1 if h.data_source == source],
                [h for h in hits2 if h.data_source == source],
            )
            for source in sources
        ]
        # fsum is used for accuracy; see the note near the imports
        return float(math.fsum(values) / len(values))

    def _jx(self, hits1: Collection[AbstractHit], hits2: Collection[AbstractHit]) -> float:
        """
        Returns the mean of ``_wedge / _vee`` over the weight pairs of the two collections.
        """
        pair_to_weights = Au.weights_of_pairs(hits1, hits2)
        # NOTE(review): raises ZeroDivisionError if there are no pairs or if ``_vee`` is 0;
        # presumably ``weights_of_pairs`` and ``elle`` rule that out -- TODO confirm
        values = [self._wedge(ca, cb) / self._vee(ca, cb) for ca, cb in pair_to_weights.values()]
        return float(math.fsum(values) / len(values))

    def _wedge(self, ca: float, cb: float) -> float:
        """
        Returns the geometric mean of ``Au.elle(ca)`` and ``Au.elle(cb)``.
        """
        return math.sqrt(Au.elle(ca) * Au.elle(cb))

    def _vee(self, ca: float, cb: float) -> float:
        """
        Returns ``Au.elle(ca) + Au.elle(cb)`` minus their geometric mean.
        """
        elle_a, elle_b = Au.elle(ca), Au.elle(cb)
        return elle_a + elle_b - math.sqrt(elle_a * elle_b)
119
120
121
class MatrixAlg(CleverEnum):
    """
    The available matrix calculation algorithms.
    """

    j = enum.auto()

    @property
    def clazz(self) -> Type[MatrixCalculator]:
        """
        The calculator class that implements this algorithm.
        """
        implementations = {MatrixAlg.j: JPrimeMatrixCalculator}
        return implementations[self]
127
128
129
@decorateme.auto_utils()
class MatrixCalculation:
    """
    Factory for matrix calculators.
    """

    @classmethod
    def create(cls, algorithm: Union[str, MatrixAlg]) -> MatrixCalculator:
        """
        Creates a new calculator for an algorithm.

        Args:
            algorithm: A ``MatrixAlg`` or a string, resolved with ``MatrixAlg.of``

        Returns:
            A new instance of the class given by the algorithm's ``clazz``
        """
        resolved = MatrixAlg.of(algorithm)
        calculator_type = resolved.clazz
        return calculator_type()
134
135
136
# names exported by ``from <this module> import *``
__all__ = ["MatrixCalculator", "JPrimeMatrixCalculator", "MatrixCalculation", "MatrixAlg"]
137