Passed
Push — dependabot/pip/pyarrow-5.0.0 ( 101caa...cfe875 )
by
unknown
01:39
created

SumWeightedCalc.for_pair()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
"""
2
Scoring (regression and enrichment) calculations.
3
"""
4
import abc
5
import enum
6
import math
7
from dataclasses import dataclass
0 ignored issues
show
Unused Code introduced by
Unused dataclass imported from dataclasses
Loading history...
8
from typing import Generic, Mapping, Sequence, Type, TypeVar, Union, Optional, Any, Tuple
9
10
import numpy as np
0 ignored issues
show
introduced by
Unable to import 'numpy'
Loading history...
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
from numpy.random import RandomState
0 ignored issues
show
introduced by
Unable to import 'numpy.random'
Loading history...
13
14
from mandos.analysis import AnalysisUtils as Au
15
from mandos.model import CleverEnum
16
from mandos.model.hits import AbstractHit, HitFrame, KeyPredObjSource, KeyPredObj
0 ignored issues
show
Unused Code introduced by
Unused KeyPredObjSource imported from mandos.model.hits
Loading history...
17
from mandos.model.hit_utils import HitUtils
0 ignored issues
show
Unused Code introduced by
Unused HitUtils imported from mandos.model.hit_utils
Loading history...
18
from mandos.analysis.io_defns import ScoreDf, EnrichmentDf
19
20
# Type of a single score value; bounded to int, float, or bool.
# Used below to parameterize EnrichmentCalculator and its ``scores`` mappings.
S = TypeVar("S", bound=Union[int, float, bool])
0 ignored issues
show
Coding Style Naming introduced by
Class name "S" doesn't conform to PascalCase naming style ('[^\\W\\da-z][^\\W_]+$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
21
22
23
class EnrichmentCalculator(Generic[S], metaclass=abc.ABCMeta):
    """
    Base class for algorithms that turn hits and per-compound scores into one value per pair.

    Subclasses implement ``for_pair`` (the calculation for the hits of a single pair)
    and ``alg_name`` (a short identifier for the algorithm).
    """

    def calc(
        self, hits: Sequence[AbstractHit], scores: Mapping[str, S]
    ) -> Mapping[KeyPredObj, float]:
        """
        Groups ``hits`` with ``Au.hit_multidict(hits, "to_key_pred_obj")`` and scores each group.

        Args:
            hits: The hits to group and score
            scores: Score values, passed through unchanged to ``for_pair``

        Returns:
            A dict mapping each group key to the value of ``for_pair`` for that group
        """
        pair_to_hits = Au.hit_multidict(hits, "to_key_pred_obj")
        # Each pair must be scored on its own hits only; passing the full
        # ``hits`` list here would give every pair the same value.
        return {
            pair: self.for_pair(the_hits, scores) for pair, the_hits in pair_to_hits.items()
        }

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the value for the hits of a single pair.

        Raises:
            NotImplementedError: Always, in this base class; subclasses override
        """
        raise NotImplementedError()

    @classmethod
    def alg_name(cls) -> str:
        """
        Returns a short name identifying the algorithm.

        Raises:
            NotImplementedError: Always, in this base class; subclasses override
        """
        raise NotImplementedError()
39
40
41
# noinspection PyAbstractClass
42
class _FoldCalculator(EnrichmentCalculator[bool]):
    """
    Marker base class for calculators parameterized with ``bool`` scores.
    Adds no behavior of its own; ``for_pair`` and ``alg_name`` remain unimplemented here.
    """
44
45
46
# noinspection PyAbstractClass
47
class _RegressCalculator(EnrichmentCalculator[float]):
    """
    Marker base class for calculators parameterized with ``float`` scores.
    Adds no behavior of its own; ``for_pair`` and ``alg_name`` remain unimplemented here.
    """
49
50
51
class AlphaCalculator(_RegressCalculator):
    """
    The "alpha" algorithm: a mean over data sources of a per-source mean term.
    """

    @classmethod
    def alg_name(cls) -> str:
        """
        Returns the identifier of this algorithm, ``"alpha"``.
        """
        return "alpha"

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Groups ``hits`` with ``Au.hit_multidict(hits, "data_source")`` and averages the
        per-group terms from ``_calc_term``.

        Args:
            hits: The hits for a single pair
            scores: Score values, looked up by ``hit.origin_inchikey``

        Returns:
            The mean of the per-source terms, as a Python float
        """
        source_to_hits = Au.hit_multidict(hits, "data_source")
        # np.mean needs a concrete sequence: a generator expression is wrapped
        # as a 0-d object array and cannot be averaged.
        terms = [
            self._calc_term(source_hits, scores) for source_hits in source_to_hits.values()
        ]
        return float(np.mean(terms))

    def _calc_term(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Returns the mean over ``hits`` of
        ``Au.elle(hit.weight) * (2 * float(score - 1)) ** 2``,
        where ``score`` is ``scores[hit.origin_inchikey]``.
        """
        # NOTE(review): as written, 1 is subtracted from the score *before* doubling,
        # i.e. (2 * (score - 1)) ** 2. If (2 * score - 1) ** 2 was intended, the
        # parenthesis is misplaced -- confirm against the algorithm's definition.
        return float(
            np.mean(
                [
                    Au.elle(hit.weight) * (2 * float(scores[hit.origin_inchikey] - 1)) ** 2
                    for hit in hits
                ]
            )
        )
74
75
76
class SumWeightedCalc(_RegressCalculator):
    """
    The "w-sum" algorithm: the sum of score times hit weight, divided by the number of hits.
    """

    @classmethod
    def alg_name(cls) -> str:
        """
        Returns the identifier of this algorithm, ``"w-sum"``.
        """
        return "w-sum"

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Sums ``scores[hit.origin_inchikey] * hit.weight`` over ``hits`` with ``math.fsum``
        and divides the total by ``len(hits)``.
        """
        weighted_terms = (scores[h.origin_inchikey] * h.weight for h in hits)
        total = math.fsum(weighted_terms)
        return total / len(hits)
83
84
85
class SumUnweightedCalc(_RegressCalculator):
    """
    The "n-sum" algorithm: the sum of scores divided by the number of hits.
    """

    @classmethod
    def alg_name(cls) -> str:
        """
        Returns the identifier of this algorithm, ``"n-sum"``.
        """
        return "n-sum"

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Sums ``scores[hit.origin_inchikey]`` over ``hits`` with ``math.fsum``
        and divides the total by ``len(hits)``.
        """
        per_hit_scores = (scores[h.origin_inchikey] for h in hits)
        total = math.fsum(per_hit_scores)
        return total / len(hits)
92
93
94
class FoldWeightedCalc(_FoldCalculator):
    """
    The "w-ratio" algorithm: total weight of hits with a truthy score over
    total weight of hits with a falsy score.
    """

    @classmethod
    def alg_name(cls) -> str:
        """
        Returns the identifier of this algorithm, ``"w-ratio"``.
        """
        return "w-ratio"

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Splits ``hits`` by the truthiness of ``scores[hit.origin_inchikey]`` and returns
        the ratio of the summed weights (truthy over falsy).

        Returns ``float("inf")`` when the falsy-side weight sum equals 0.
        """
        with_score = [h for h in hits if scores[h.origin_inchikey]]
        without_score = [h for h in hits if not scores[h.origin_inchikey]]
        weight_with = math.fsum(h.weight for h in with_score)
        weight_without = math.fsum(h.weight for h in without_score)
        return weight_with / weight_without if weight_without != 0 else float("inf")
107
108
109
class FoldUnweightedCalc(_FoldCalculator):
    """
    The "n-ratio" algorithm: number of hits with a truthy score over
    number of hits with a falsy score.
    """

    @classmethod
    def alg_name(cls) -> str:
        """
        Returns the identifier of this algorithm, ``"n-ratio"``.
        """
        return "n-ratio"

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Counts hits whose ``scores[hit.origin_inchikey]`` is truthy and those where it is
        falsy, and returns the first count divided by the second.

        Returns ``float("inf")`` when no hit has a falsy score.
        """
        n_with = sum(1 for h in hits if scores[h.origin_inchikey])
        n_without = sum(1 for h in hits if not scores[h.origin_inchikey])
        return n_with / n_without if n_without != 0 else float("inf")
120
121
122
class _Alg(CleverEnum):
    """
    Base enum for algorithm choices. Defines no members itself;
    subclasses provide the members and override ``dtype``.
    """

    @classmethod
    def dtype(cls) -> Type[Any]:
        """
        Returns the Python type of the score values the algorithms operate on.

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError()
128
129
130
class RealAlg(_Alg):
    """
    Algorithm choices whose ``dtype`` is ``float``.
    Each member maps to a calculator class via ``clazz``.
    """

    alpha = enum.auto()
    weighted = enum.auto()
    unweighted = enum.auto()

    @classmethod
    def dtype(cls) -> Type[Any]:
        """
        Returns ``float``.
        """
        return float

    @property
    def clazz(self) -> Type[_RegressCalculator]:
        """
        The calculator class (not an instance) that implements this member.
        """
        if self is RealAlg.alpha:
            return AlphaCalculator
        if self is RealAlg.weighted:
            return SumWeightedCalc
        if self is RealAlg.unweighted:
            return SumUnweightedCalc
        # Mirrors the failed dict lookup for an unmapped member.
        raise KeyError(self)
146
147
148
class BoolAlg(_Alg):
    """
    Algorithm choices whose ``dtype`` is ``bool``.
    Each member maps to a calculator class via ``clazz``.
    """

    weighted = enum.auto()
    unweighted = enum.auto()

    @classmethod
    def dtype(cls) -> Type[Any]:
        """
        Returns ``bool``.
        """
        return bool

    @property
    def clazz(self) -> Type[_FoldCalculator]:
        """
        The calculator class (not an instance) that implements this member.
        """
        if self is BoolAlg.weighted:
            return FoldWeightedCalc
        if self is BoolAlg.unweighted:
            return FoldUnweightedCalc
        # Mirrors the failed dict lookup for an unmapped member.
        raise KeyError(self)
162
163
164
class EnrichmentCalculation:
    """
    Runs the configured algorithms over a set of hits and over bootstrap resamples of them.

    Score columns are split by ``_get_dict`` into those handled by ``bool_alg``
    and those handled by ``real_alg``.
    """

    def __init__(
        self,
        bool_alg: str,
        real_alg: str,
        n_samples: int,
        seed: int,
    ):
        """
        Args:
            bool_alg: Name of a ``BoolAlg`` member, resolved with ``BoolAlg.of``
            real_alg: Name of a ``RealAlg`` member, resolved with ``RealAlg.of``
            n_samples: Number of bootstrap resamples that ``calculate`` will run
            seed: Seed for the ``RandomState`` used to draw the resamples
        """
        self.bool_alg = BoolAlg.of(bool_alg)
        self.real_alg = RealAlg.of(real_alg)
        self.n_samples = n_samples
        self.seed = seed
        self.state = RandomState(seed)

    def calculate(self, hit_df: HitFrame, scores: Optional[ScoreDf]) -> EnrichmentDf:
        """
        Calculates enrichment for the original hits (sample 0) and for
        ``n_samples`` bootstrap resamples (samples 1 through ``n_samples``).

        Args:
            hit_df: The hits; converted with ``hit_df.to_hits()``
            scores: Score table; if None, ``_default_scores(hit_df)`` is used

        Returns:
            ``EnrichmentDf.convert`` applied to the row-wise concatenation of all results
        """
        hits = hit_df.to_hits()
        if scores is None:
            scores = self._default_scores(hit_df)
        score_dict = self._get_dict(scores)
        results = list(self._calc(hits, score_dict, 0))
        n_hits = len(hits)
        # Resamples are numbered from 1 so they cannot be confused with sample 0,
        # which is the calculation on the unresampled hits.
        for b in range(1, self.n_samples + 1):
            if n_hits > 0:
                # Draw indices rather than the hit objects themselves: without ``size``,
                # ``choice`` returns a single element, and index sampling avoids
                # coercing arbitrary hit objects into a NumPy array.
                picked = self.state.choice(n_hits, size=n_hits, replace=True)
                b_hits = [hits[i] for i in picked]
            else:
                b_hits = []
            results.extend(self._calc(b_hits, score_dict, b))
        # NOTE(review): assumes EnrichmentDf.convert expects a single DataFrame
        # rather than a list of frames -- confirm against io_defns.
        combined = pd.concat(results, ignore_index=True) if results else pd.DataFrame()
        return EnrichmentDf.convert(combined)

    def _calc(self, hits: Sequence[AbstractHit], score_dict, sample: int) -> Sequence[pd.DataFrame]:
        """
        Runs the assigned algorithm for every score column, forward and inverted.

        Args:
            hits: The hits to score
            score_dict: Mapping as returned by ``_get_dict``
            sample: Sample number recorded in the output rows

        Returns:
            One DataFrame (from ``_make_df``) per entry of ``score_dict``; empty if there are none
        """
        frames = []
        for score_name, (alg_type, score_vals) in score_dict.items():
            # ``clazz`` is a property returning a class; calling it makes an instance.
            alg_instance = alg_type.clazz()
            forward = alg_instance.calc(hits, score_vals.to_dict())
            # ``dtype`` is a classmethod and must be called; comparing the bound
            # method itself to ``bool`` is always False.
            if alg_type.dtype() is bool:
                # Cast first so ``~`` is a logical NOT even for integer columns
                # (on ints it is a bitwise NOT, e.g. ~1 == -2, which stays truthy).
                inverted = ~score_vals.astype(bool)
            else:
                inverted = -score_vals
            reverse = alg_instance.calc(hits, inverted.to_dict())
            frames.append(self._make_df(forward, reverse, score_name, alg_type.name, sample))
        return frames

    def _default_scores(self, hit_df: HitFrame) -> ScoreDf:
        """
        Builds a score table with one row per unique ``origin_inchikey`` in ``hit_df``,
        with columns ``weight`` and ``count`` both set to 1.
        """
        # ``unique()`` already returns the array of values (it has no ``.values``).
        inchikeys = list(hit_df["origin_inchikey"].unique())
        ones = [1 for _ in inchikeys]
        # One frame with both columns: stacking two single-column frames vertically
        # would leave NaN in half of each column and duplicate every inchikey.
        return ScoreDf(pd.DataFrame(dict(inchikey=inchikeys, weight=ones, count=ones)))

    def _get_dict(self, scores: ScoreDf) -> Mapping[str, Tuple[_Alg, pd.Series]]:
        """
        Maps each recognized score column to ``(algorithm, values indexed by inchikey)``.

        Columns starting with ``is_`` or named ``count`` are assigned ``bool_alg``;
        columns starting with ``score_`` or named ``weight`` are assigned ``real_alg``.
        Other columns are ignored.
        """
        by_inchikey = scores.set_index("inchikey")
        fold_cols = [c for c in scores.columns if c.startswith("is_") or c == "count"]
        score_cols = [c for c in scores.columns if c.startswith("score_") or c == "weight"]
        fold_dct = {c: (self.bool_alg, by_inchikey[c]) for c in fold_cols}
        score_dct = {c: (self.real_alg, by_inchikey[c]) for c in score_cols}
        return {**fold_dct, **score_dct}

    def _make_df(
        self,
        forward: Mapping[KeyPredObj, float],
        backward: Mapping[KeyPredObj, float],
        score: str,
        alg: str,
        sample: int,
    ) -> pd.DataFrame:
        """
        Builds one row per key of ``forward``.

        Each row holds the key's ``key``, ``pred``, and ``obj`` attributes (as columns
        ``key``, ``predicate``, ``object``), the given ``score_name``, ``algorithm``, and
        ``sample``, plus ``value`` from ``forward`` and ``inverse`` from ``backward``.
        """
        return pd.DataFrame(
            [
                pd.Series(
                    dict(
                        key=kpo.key,
                        predicate=kpo.pred,
                        object=kpo.obj,
                        score_name=score,
                        algorithm=alg,
                        sample=sample,
                        value=forward[kpo],
                        inverse=backward[kpo],
                    )
                )
                for kpo in forward
            ]
        )
242
243
244
# Public names of this module, including ScoreDf and EnrichmentDf
# re-exported from mandos.analysis.io_defns.
__all__ = [
    "AlphaCalculator", "EnrichmentCalculator",
    "FoldUnweightedCalc", "FoldWeightedCalc",
    "SumUnweightedCalc", "SumWeightedCalc",
    "EnrichmentCalculation",
    "EnrichmentDf", "ScoreDf",
    "BoolAlg", "RealAlg",
]
257