1
|
|
|
""" |
2
|
|
|
Scoring (regression and enrichment) calculations. |
3
|
|
|
""" |
4
|
|
|
import abc |
5
|
|
|
import enum |
6
|
|
|
import math |
7
|
|
|
from typing import Generic, Mapping, Sequence, Type, TypeVar, Union |
8
|
|
|
|
9
|
|
|
import pandas as pd |
|
|
|
|
10
|
|
|
from typeddfs import TypedDfs |
|
|
|
|
11
|
|
|
|
12
|
|
|
from mandos.analysis import AnalysisUtils as Au |
13
|
|
|
from mandos.model import CleverEnum |
14
|
|
|
from mandos.model.hits import AbstractHit, HitFrame, HitUtils, Pair |
15
|
|
|
|
16
|
|
|
# Type of a single per-compound value: a numeric score or a boolean flag.
S = TypeVar("S", bound=Union[int, float, bool])


# Result frame: requires string columns "predicate" and "object" and an
# integer column "samples".
# extra cols are, e.g., alpha(score_1), alpha(is_hit)
EnrichmentDf = (
    TypedDfs.typed("EnrichmentDf")
    .require("predicate", "object", dtype=str)
    .require("samples", dtype=int)
).build()


# Input frame of scores: requires a string "inchikey" column and reserves "score".
# extra cols are, e.g., score_1, score_hello
ScoreDf = (TypedDfs.typed("ScoreDf").require("inchikey", dtype=str).reserve("score")).build()


# Input frame of boolean flags: requires a string "inchikey" column.
# ``.build()`` is needed so that this is a DataFrame class (like ``ScoreDf``)
# rather than a builder object; ``isinstance(x, IsHitDf)`` requires a class.
# extra cols are is_hit, etc.
IsHitDf = TypedDfs.typed("IsHitDf").require("inchikey", dtype=str).build()
33
|
|
|
|
34
|
|
|
|
35
|
|
|
def _vars(self: ScoreDf) -> Sequence[str]:
    """
    Lists the columns of this frame that hold variables.

    For a ``ScoreDf``, these are ``score`` and any column starting with ``score_``.
    For an ``IsHitDf``, these are the columns starting with ``is_``.

    Returns:
        The matching column names, in the order they appear in ``self.columns``
    """

    def is_variable(name: str) -> bool:
        # The type checks are evaluated lazily, per column, in this order.
        if isinstance(self, ScoreDf) and (name == "score" or name.startswith("score_")):
            return True
        return isinstance(self, IsHitDf) and name.startswith("is_")

    return list(filter(is_variable, self.columns))
42
|
|
|
|
43
|
|
|
|
44
|
|
|
def _var_map(self: ScoreDf) -> Mapping[str, Mapping[str, S]]:
    """
    Builds a dict of dicts from the variable columns of this frame.

    The variable columns are those selected by ``_vars``. (The previous code read
    ``self.score_cols``, an attribute that is not defined anywhere in this module.)

    Returns:
        A mapping from each variable column name to ``self[column].to_dict()``
    """
    # NOTE(review): ``Series.to_dict()`` is keyed by the frame's index; callers look
    # values up by inchikey, so this presumably expects an inchikey index -- confirm.
    return {col: self[col].to_dict() for col in _vars(self)}
49
|
|
|
|
50
|
|
|
|
51
|
|
|
# Attach the column helpers as methods on both input frame types.
for _frame_type in (ScoreDf, IsHitDf):
    _frame_type.vars = _vars
    _frame_type.var_map = _var_map
del _frame_type  # do not leave the loop variable in the module namespace
55
|
|
|
|
56
|
|
|
|
57
|
|
|
class EnrichmentCalculator(Generic[S], metaclass=abc.ABCMeta):
    """
    Base class for algorithms that compute one value per pair from hits and scores.

    Subclasses override :meth:`for_pair`.
    """

    def __init__(self, alg_name: str):
        """
        Constructor.

        Args:
            alg_name: Label used in output column names, as ``alg_name(variable)``
        """
        self.alg_name = alg_name

    def calc_many(self, data: HitFrame, score_df: ScoreDf) -> EnrichmentDf:
        """
        Runs :meth:`calc` once for every variable in ``score_df``.

        Args:
            data: The hits, as a frame; converted with ``HitUtils.df_to_hits``
            score_df: Frame whose ``var_map()`` gives, per variable, a mapping of scores

        Returns:
            An ``EnrichmentDf`` with one row per pair: predicate, object, samples
            (the number of hits for the pair), and one column per variable
        """
        hits = HitUtils.df_to_hits(data)
        # NOTE(review): hits are grouped by "pair" here but by "to_pair" in ``calc``;
        # ``_results_row`` indexes the ``calc`` results with these keys, so the two
        # groupings must produce the same keys -- confirm against AbstractHit.
        samples_per_pair = {k: len(v) for k, v in Au.hit_multidict(hits, "pair").items()}
        # ``var_map()`` returns a mapping, so iterate its items; iterating the mapping
        # itself would yield only the column names.
        results = {col: self.calc(hits, scores) for col, scores in score_df.var_map().items()}
        return EnrichmentDf(
            [
                pd.Series(self._results_row(pair, n_samples, results))
                for pair, n_samples in samples_per_pair.items()
            ]
        )

    def _results_row(
        self, pair: Pair, n_samples: int, results: Mapping[str, Mapping[Pair, float]]
    ) -> Mapping[str, Union[str, int, float]]:
        """
        Builds the output row for a single pair.

        Args:
            pair: The pair; its ``pred`` and ``obj`` fill the first two columns
            n_samples: Value for the ``samples`` column
            results: Per-variable results of :meth:`calc`

        Returns:
            A dict of column name to value

        Raises:
            KeyError: If ``pair`` is missing from any of the per-variable results
        """
        return {
            **dict(
                predicate=pair.pred,
                object=pair.obj,
                samples=n_samples,
            ),
            **{self._col_name(col): vs[pair] for col, vs in results.items()},
        }

    def _col_name(self, col: str) -> str:
        """
        Returns the output column name for a variable, e.g. ``alpha(score_1)``.
        """
        return self.alg_name + "(" + col + ")"

    def calc(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> Mapping[Pair, float]:
        """
        Calculates the value for every pair found among ``hits``.

        Args:
            hits: All hits, for all pairs
            scores: Scores for one variable; subclasses look them up by ``origin_inchikey``

        Returns:
            A dict from each pair to the result of :meth:`for_pair` on that pair's hits
        """
        pair_to_hits = Au.hit_multidict(hits, "to_pair")
        # Pass only the hits belonging to this pair; passing the full list would
        # give every pair the same value.
        return {pair: self.for_pair(the_hits, scores) for pair, the_hits in pair_to_hits.items()}

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the value for a single pair. Must be overridden.

        Args:
            hits: The hits for one pair
            scores: Scores for one variable

        Raises:
            NotImplementedError: Always, in this base class
        """
        raise NotImplementedError()
99
|
|
|
|
100
|
|
|
|
101
|
|
|
class AlphaCalculator(EnrichmentCalculator[S]):
    """
    Averages a per-hit term within each data source, then averages across sources.
    """

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the value for the hits of one pair.

        Args:
            hits: The hits; grouped by their ``data_source``
            scores: Scores, looked up by each hit's ``origin_inchikey``

        Returns:
            The mean, over data sources, of :meth:`_calc_term`

        Raises:
            ZeroDivisionError: If there are no data sources (e.g. no hits)
        """
        by_source = Au.hit_multidict(hits, "data_source")
        per_source = [self._calc_term(group, scores) for group in by_source.values()]
        return math.fsum(per_source) / len(per_source)

    def _calc_term(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the mean per-hit term for the hits of one data source.

        Returns:
            The mean of ``Au.elle(weight) * (2 * (score - 1)) ** 2`` over the hits

        Raises:
            ZeroDivisionError: If ``hits`` is empty
        """
        terms = []
        for hit in hits:
            scaled_weight = Au.elle(hit.weight)
            # NOTE(review): this doubles (score - 1); if the intent was to rescale
            # as (2 * score - 1), the parentheses are misplaced -- confirm with the docs.
            shifted = float(scores[hit.origin_inchikey] - 1)
            terms.append(scaled_weight * (2 * shifted) ** 2)
        return math.fsum(terms) / len(terms)
114
|
|
|
|
115
|
|
|
|
116
|
|
|
class FoldUnweightedCalc(EnrichmentCalculator[bool]):
    """
    Ratio of the number of hits with a truthy score to the number with a falsy score.
    """

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the ratio for the hits of one pair.

        Args:
            hits: The hits to count
            scores: Flags, looked up by each hit's ``origin_inchikey``

        Returns:
            ``n_truthy / n_falsy``, or infinity if no hit has a falsy score
            (including when ``hits`` is empty)
        """
        n_yes = 0
        n_no = 0
        for hit in hits:
            if scores[hit.origin_inchikey]:
                n_yes += 1
            else:
                n_no += 1
        if n_no == 0:
            return float("inf")
        return n_yes / n_no
123
|
|
|
|
124
|
|
|
|
125
|
|
|
class FoldWeightedCalc(EnrichmentCalculator[bool]):
    """
    Ratio of the summed weights of hits with a truthy score to those with a falsy score.
    """

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the weighted ratio for the hits of one pair.

        Args:
            hits: The hits, each with a ``weight``
            scores: Flags, looked up by each hit's ``origin_inchikey``

        Returns:
            ``sum(truthy weights) / sum(falsy weights)``, or infinity if the
            falsy weights sum to zero (including when ``hits`` is empty)
        """
        yes_weights = []
        no_weights = []
        for hit in hits:
            bucket = yes_weights if scores[hit.origin_inchikey] else no_weights
            bucket.append(hit.weight)
        numerator = math.fsum(yes_weights)
        denominator = math.fsum(no_weights)
        if denominator == 0:
            return float("inf")
        return numerator / denominator
134
|
|
|
|
135
|
|
|
|
136
|
|
|
class SumWeightedCalc(EnrichmentCalculator[S]):
    """
    Sum of ``score * weight`` over the hits, divided by the number of hits.
    """

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the value for the hits of one pair.

        Args:
            hits: The hits, each with a ``weight``
            scores: Scores, looked up by each hit's ``origin_inchikey``

        Returns:
            ``fsum(score * weight) / len(hits)``

        Raises:
            ZeroDivisionError: If ``hits`` is empty
        """
        weighted = [scores[hit.origin_inchikey] * hit.weight for hit in hits]
        total = math.fsum(weighted)
        return total / len(hits)
139
|
|
|
|
140
|
|
|
|
141
|
|
|
class SumUnweightedCalc(EnrichmentCalculator[S]):
    """
    Sum of the scores over the hits, divided by the number of hits.
    """

    def for_pair(self, hits: Sequence[AbstractHit], scores: Mapping[str, S]) -> float:
        """
        Calculates the value for the hits of one pair.

        Args:
            hits: The hits
            scores: Scores, looked up by each hit's ``origin_inchikey``

        Returns:
            ``fsum(score) / len(hits)``

        Raises:
            ZeroDivisionError: If ``hits`` is empty
        """
        looked_up = [scores[hit.origin_inchikey] for hit in hits]
        total = math.fsum(looked_up)
        return total / len(hits)
144
|
|
|
|
145
|
|
|
|
146
|
|
|
class EnrichmentAlg(CleverEnum):
    """
    The available enrichment algorithms.

    Each member has a human-readable :attr:`description`, a short :attr:`symbol`,
    and the calculator class that implements it (:attr:`clazz`).
    """

    alpha = enum.auto()
    fold = enum.auto()
    fold_w = enum.auto()
    sum = enum.auto()
    sum_w = enum.auto()

    @property
    def description(self) -> str:
        """
        Returns a one-line description of the algorithm, prefixed with its symbol.
        """
        s = self.symbol
        return {
            EnrichmentAlg.alpha: rf"[float] {s}(p) = Mean product of rescaled weights and scores; see the docs",
            EnrichmentAlg.fold: rf"[bool] {s}(p) = #(c has p and hit) / #(c has p and not hit)",
            EnrichmentAlg.fold_w: rf"[bool] {s}(p) = ∑(w(c, pair) s.t. c is hit) / ∑(w(c, pair) s.t. c not hit)",
            EnrichmentAlg.sum: rf"{s}(p) = ∑(score(c) s.t. c has p)",
            EnrichmentAlg.sum_w: rf"{s}(p) = ∑(score(c) × w(c, p) for all c)",
        }[self]

    @property
    def symbol(self) -> str:
        """
        Returns the short (Greek-letter) symbol for the algorithm.
        """
        return {
            EnrichmentAlg.alpha: "α",
            EnrichmentAlg.fold: r"β",
            EnrichmentAlg.fold_w: "β*",
            EnrichmentAlg.sum: r"γ",
            EnrichmentAlg.sum_w: r"γ*",
        }[self]

    @property
    def clazz(self) -> Type[EnrichmentCalculator]:
        """
        Returns the calculator class (not an instance) implementing the algorithm.
        """
        # The table is keyed by enum member, so look up ``self``; looking up
        # ``self.name`` (a str) never matches a member key and raises KeyError.
        return {
            EnrichmentAlg.alpha: AlphaCalculator,
            EnrichmentAlg.fold: FoldUnweightedCalc,
            EnrichmentAlg.fold_w: FoldWeightedCalc,
            EnrichmentAlg.sum: SumUnweightedCalc,
            EnrichmentAlg.sum_w: SumWeightedCalc,
        }[self]
185
|
|
|
|
186
|
|
|
|
187
|
|
|
class EnrichmentCalculation:
    """
    Factory for enrichment calculators.
    """

    @classmethod
    def create(cls, algorithm: Union[str, EnrichmentAlg]) -> EnrichmentCalculator:
        """
        Creates the calculator for an algorithm.

        Args:
            algorithm: An ``EnrichmentAlg`` member, or a string passed to ``EnrichmentAlg.of``

        Returns:
            A new instance of the algorithm's calculator class, whose ``alg_name``
            is the member's name (so output columns look like ``alpha(score_1)``)
        """
        # The algorithm is an EnrichmentAlg, not a data frame; resolve it on the enum.
        # NOTE(review): assumes ``CleverEnum`` provides ``of`` for string lookup -- confirm.
        if isinstance(algorithm, EnrichmentAlg):
            alg = algorithm
        else:
            alg = EnrichmentAlg.of(algorithm)
        # Calculators require ``alg_name`` in their constructor.
        return alg.clazz(alg.name)
192
|
|
|
|
193
|
|
|
|
194
|
|
|
# Public API of this module.
__all__ = [
    # calculators
    "AlphaCalculator",
    "EnrichmentCalculator",
    "FoldUnweightedCalc",
    "FoldWeightedCalc",
    "SumUnweightedCalc",
    "SumWeightedCalc",
    # entry points
    "EnrichmentCalculation",
    "EnrichmentAlg",
    # data frame types
    "EnrichmentDf",
    "ScoreDf",
]
206
|
|
|
|