Passed
Push — main ( 2e1b6b...3a0c28 )
by Douglas
02:06
created

mandos.analysis.distances._Inf.__str__()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 1
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
"""
2
Calculations of concordance between annotation sets.
3
"""
4
import abc
5
import enum
6
import math
7
import time
8
from collections import defaultdict
9
from typing import Collection, Sequence, Type, Union
10
11
import decorateme
0 ignored issues
show
introduced by
Unable to import 'decorateme'
Loading history...
12
import numpy as np
0 ignored issues
show
introduced by
Unable to import 'numpy'
Loading history...
13
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
14
from pocketutils.core.chars import Chars
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.chars'
Loading history...
15
from pocketutils.core.enums import CleverEnum
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.enums'
Loading history...
16
from pocketutils.tools.unit_tools import UnitTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.unit_tools'
Loading history...
17
18
from mandos.analysis import AnalysisUtils as Au
19
from mandos.analysis.io_defns import SimilarityDfLongForm, SimilarityDfShortForm
20
from mandos.model.hits import AbstractHit
21
22
# Note: most of the `math` functions used here are much faster than their numpy counterparts.
23
# if we're not broadcasting, it's almost always better to use them
24
# some are more accurate, too
25
# e.g. we use math.fsum rather than the built-in sum
26
from mandos.model.utils.setup import logger
27
28
29
@decorateme.auto_repr_str()
class MatrixCalculator(metaclass=abc.ABCMeta):
    """
    Base class for calculators that turn hits into a long-form similarity table.

    Subclasses are expected to override :meth:`calc_all`.
    """

    def calc_all(self, hits: Sequence[AbstractHit]) -> SimilarityDfLongForm:
        """
        Calculates similarities from ``hits``.

        Args:
            hits: The hits to compute similarities from

        Returns:
            A long-form similarity dataframe

        Raises:
            NotImplementedError: Always, in this base class
        """
        # ``NotImplemented`` is the binary-operator sentinel, not an exception;
        # calling it raised a confusing ``TypeError`` instead of signalling
        # "override me".
        raise NotImplementedError()
0 ignored issues
show
Bug introduced by
NotImplemented does not seem to be callable.
Loading history...
Best Practice introduced by
NotImplemented raised - should raise NotImplementedError
Loading history...
33
34
35
class _Inf:
    """
    Progress tracker for a pairwise calculation.

    Remembers which pairs have been recorded, counts the off-diagonal pairs
    whose value lies strictly between 0 and 1, and logs progress periodically.
    """

    def __init__(self, n: int):
        """
        Args:
            n: The expected total number of pairs; only used in messages
        """
        self.n = n
        self.used, self.t0, self.nonzeros = set(), time.monotonic(), 0

    def is_used(self, c1: str, c2: str) -> bool:
        """
        Tells whether the pair was already recorded, in either order.
        """
        return (c1, c2) in self.used or (c2, c1) in self.used

    def got(self, c1: str, c2: str, z: float) -> None:
        """
        Records the value ``z`` for the pair ``(c1, c2)``.

        Logs at "info" every 50,000 recorded pairs and at "trace" every
        other 5,000.
        """
        self.used.add((c1, c2))
        # only off-diagonal, non-NaN values strictly inside (0, 1) are counted
        self.nonzeros += int(c1 != c2 and not np.isnan(z) and 0 < z < 1)
        if self.i % 50000 == 0:
            self.log("info")
        elif self.i % 5000 == 0:
            self.log("trace")

    @property
    def i(self) -> int:
        """
        The number of pairs recorded so far.
        """
        return len(self.used)

    def _percent_nonzero(self) -> float:
        """
        Returns the percentage of recorded pairs counted in ``nonzeros``.

        Returns 0.0 when nothing has been recorded yet, so that logging before
        the first pair (or after an empty calculation) cannot divide by zero.
        """
        done = self.i
        return self.nonzeros / done * 100 if done else 0.0

    def log(self, level: str) -> None:
        """
        Logs a progress message at ``level`` (upper-cased before use).
        """
        delta = UnitTools.delta_time_to_str(time.monotonic() - self.t0, space=Chars.narrownbsp)
        logger.log(
            level.upper(),
            f"Processed {self.i:,}/{self.n:,} pairs in {delta};"
            + f" {self.nonzeros:,} ({self._percent_nonzero():.1f}%) are nonzero",
        )

    def __repr__(self):
        return f"{self.__class__.__name__}({self.i}/{self.n})"

    def __str__(self):
        return repr(self)
68
69
70
class JPrimeMatrixCalculator(MatrixCalculator):
    """
    Calculates pairwise similarities between compounds from their hits.

    Hits are grouped by ``search_key``; within each key, compounds are
    identified by ``origin_inchikey`` and every unordered pair is scored.
    """

    def calc_all(self, hits: Sequence[AbstractHit]) -> SimilarityDfLongForm:
        """
        Calculates one similarity matrix per search key and stacks them.

        Args:
            hits: The hits, for any number of search keys

        Returns:
            The concatenation of the per-key long-form tables (kind ``"psi"``)
        """
        key_to_hit = Au.hit_multidict(hits, "search_key")
        logger.notice(f"Calculating J on {len(key_to_hit)} keys from {len(hits)} hits")
        dfs = []
        for key, key_hits in key_to_hit.items():
            df: SimilarityDfShortForm = self.calc_one(key, key_hits)
            dfs.append(df.to_long_form(kind="psi", key=key))
        return SimilarityDfLongForm(pd.concat(dfs))

    def calc_one(self, key: str, hits: Sequence[AbstractHit]) -> SimilarityDfShortForm:
        """
        Calculates the similarity matrix for a single search key.

        Each unordered pair of compounds is computed once; a compound paired
        with itself is assigned 1 without calling the similarity function.

        Args:
            key: The search key that ``hits`` belong to
            hits: The hits for that key

        Returns:
            A short-form table built from the nested ``{c1: {c2: value}}`` dict
        """
        ik2hits = Au.hit_multidict(hits, "origin_inchikey")
        logger.info(f"Calculating J on {key} for {len(ik2hits)} compounds and {len(hits)} hits")
        data = defaultdict(dict)
        n_compounds = len(ik2hits)
        # The loop below visits every unordered pair *including* the diagonal,
        # so the expected total is n(n+1)/2; using n(n-1)/2 made the progress
        # messages report more processed pairs than the stated total.
        inf = _Inf(n=n_compounds * (n_compounds + 1) // 2)
        for c1, hits1 in ik2hits.items():
            for c2, hits2 in ik2hits.items():
                if inf.is_used(c1, c2):
                    # already computed as (c2, c1)
                    continue
                z = 1 if c1 == c2 else self._j_prime(key, hits1, hits2)
                data[c1][c2] = z
                inf.got(c1, c2, z)
        inf.log("notice")
        return SimilarityDfShortForm.from_dict(data)

    def _j_prime(
        self, key: str, hits1: Collection[AbstractHit], hits2: Collection[AbstractHit]
    ) -> float:
        """
        Averages :meth:`_jx` over the data sources that both hit sets share.

        Returns:
            0 if either set is empty, NaN if the sets share no data source,
            and otherwise the mean of the per-source values
        """
        if len(hits1) == 0 or len(hits2) == 0:
            return 0.0
        sources = {h.data_source for h in hits1}.intersection({h.data_source for h in hits2})
        if len(sources) == 0:
            return np.nan
        values = [
            self._jx(
                key,
                [h for h in hits1 if h.data_source == source],
                [h for h in hits2 if h.data_source == source],
            )
            for source in sources
        ]
        return float(math.fsum(values) / len(values))

    def _jx(
        self, key: str, hits1: Collection[AbstractHit], hits2: Collection[AbstractHit]
    ) -> float:
        """
        Returns the mean of ``wedge / vee`` over the weight pairs of two hit sets.
        """
        # TODO -- for testing only
        # TODO: REMOVE ME!
        # NOTE(review): this key-specific weight transform is marked as
        # temporary by the original author; it is preserved here unchanged.
        if key in ["core.chemidplus.effects", "extra.chemidplus.specific-effects"]:
            hits1 = [h.copy(weight=np.power(10, -h.weight)) for h in hits1]
            hits2 = [h.copy(weight=np.power(10, -h.weight)) for h in hits2]
        pair_to_weights = Au.weights_of_pairs(hits1, hits2)
        # NOTE(review): assumes weights_of_pairs yields at least one pair and
        # that _vee is never 0 -- otherwise this divides by zero; confirm.
        values = [self._wedge(ca, cb) / self._vee(ca, cb) for ca, cb in pair_to_weights.values()]
        return float(math.fsum(values) / len(values))

    def _wedge(self, ca: float, cb: float) -> float:
        """
        Returns the geometric mean of ``Au.elle(ca)`` and ``Au.elle(cb)``.
        """
        return math.sqrt(Au.elle(ca) * Au.elle(cb))

    def _vee(self, ca: float, cb: float) -> float:
        """
        Returns ``elle(ca) + elle(cb)`` minus their geometric mean.
        """
        # evaluate Au.elle once per argument rather than twice
        elle_a, elle_b = Au.elle(ca), Au.elle(cb)
        return elle_a + elle_b - math.sqrt(elle_a * elle_b)
131
132
133
class MatrixAlg(CleverEnum):
    """
    The available matrix-calculation algorithms.
    """

    j = enum.auto()

    @property
    def clazz(self) -> Type[MatrixCalculator]:
        """
        Returns the calculator class that implements this algorithm.
        """
        implementations = {MatrixAlg.j: JPrimeMatrixCalculator}
        return implementations[self]
139
140
141
@decorateme.auto_utils()
class MatrixCalculation:
    """
    Factory for :class:`MatrixCalculator` instances.
    """

    @classmethod
    def create(cls, algorithm: Union[str, MatrixAlg]) -> MatrixCalculator:
        """
        Builds a new calculator for ``algorithm``.

        Args:
            algorithm: A ``MatrixAlg`` member, or a value that ``MatrixAlg.of`` resolves to one

        Returns:
            A fresh instance of the calculator class for that algorithm
        """
        resolved = MatrixAlg.of(algorithm)
        calculator_type = resolved.clazz
        return calculator_type()
146
147
148
# Names exported by a star-import of this module.
__all__ = ["MatrixCalculator", "JPrimeMatrixCalculator", "MatrixCalculation", "MatrixAlg"]
149