| 1 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | Calculations of overlap (similarity) between annotation sets. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import abc | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import enum | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import math | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | import time | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from collections import defaultdict | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from pathlib import Path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | from typing import Collection, Mapping, Optional, Sequence, Type, Union | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | import decorateme | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | import numpy as np | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from pocketutils.core.chars import Chars | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | from pocketutils.core.enums import CleverEnum | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | from pocketutils.tools.unit_tools import UnitTools | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | from typeddfs.df_errors import HashFileMissingError | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | from mandos.analysis import AnalysisUtils as Au | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | from mandos.analysis.io_defns import SimilarityDfLongForm, SimilarityDfShortForm | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  | from mandos.model.hit_dfs import HitDf | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  | from mandos.model.hits import AbstractHit | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | from mandos.model.utils import unlink | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  | # note that most of these math functions are much faster than their numpy counterparts | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  | # if we're not broadcasting, it's almost always better to use them | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  | # some are more accurate, too | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  | # e.g. we're using fsum rather than sum | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  | from mandos.model.utils.setup import logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  | class _Inf: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |     def __init__(self, n: int): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |         self.n = n | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |         self.used, self.t0, self.nonzeros = set(), time.monotonic(), 0 | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |     def is_used(self, c1: str, c2: str) -> bool: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |         return (c1, c2) in self.used or (c2, c1) in self.used | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |     def got(self, c1: str, c2: str, z: float) -> None: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |         self.used.add((c1, c2)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |         self.nonzeros += int(c1 != c2 and not np.isnan(z) and 0 < z < 1) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |         if self.i % 100 == 0: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |             self.log("info") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |     def i(self) -> int: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |         return len(self.used) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |     def log(self, level: str) -> None: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |         delta = UnitTools.delta_time_to_str(time.monotonic() - self.t0, space=Chars.narrownbsp) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |         logger.log( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |             level.upper(), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |             f"Processed {self.i:,}/{self.n:,} pairs in {delta};" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |             + f" {self.nonzeros:,} ({self.nonzeros / self.i * 100:.1f}%) are nonzero", | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |         ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |     def __repr__(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |         return f"{self.__class__.__name__}({self.i}/{self.n})" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |     def __str__(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |         return repr(self) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                @decorateme.auto_repr_str()
                class MatrixCalculator(metaclass=abc.ABCMeta):
                    """
                    Base class for calculators that produce a long-form similarity matrix from hits.

                    Subclasses must override :meth:`calc_all`.
                    """

                    def __init__(
                        self,
                        *,
                        min_compounds: int,
                        min_nonzero: int,
                        min_hits: int,
                        exclude: Optional[Collection[str]] = None,
                    ):
                        """
                        Stores the filtering options; how they are applied is up to the subclass.

                        Args:
                            min_compounds: Threshold on a number of compounds (presumably per key; see subclasses)
                            min_nonzero: Threshold on a number of nonzero values (presumably per key; see subclasses)
                            min_hits: Threshold on a number of hits (presumably per key; see subclasses)
                            exclude: Keys to skip; an empty set is used if None
                        """
                        self.min_compounds = min_compounds
                        self.min_nonzero = min_nonzero
                        self.min_hits = min_hits
                        self.exclude = set() if exclude is None else exclude

                    def calc_all(self, hits: Path, to: Path, *, keep_temp: bool = False) -> SimilarityDfLongForm:
                        """
                        Calculates the similarity matrix. Must be overridden.

                        Args:
                            hits: Path to the input file of hits
                            to: Output path (NOTE(review): exact use is defined by the subclass)
                            keep_temp: NOTE(review): presumably whether to keep temporary files; confirm in subclasses

                        Raises:
                            NotImplementedError: Always, in this base class
                        """
                        # NotImplemented is the binary-operator sentinel and is not callable;
                        # NotImplementedError is the exception for unimplemented methods
                        raise NotImplementedError()
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  | class JPrimeMatrixCalculator(MatrixCalculator): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |     def calc_all(self, path: Path, to: Path, *, keep_temp: bool = False) -> SimilarityDfLongForm: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |         hits = HitDf.read_file(path).to_hits() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         key_to_hit = Au.hit_multidict(hits, "search_key") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |         logger.notice(f"Calculating J on {len(key_to_hit):,} keys from {len(hits):,} hits") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |         good_keys = {} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |         for key, key_hits in key_to_hit.items(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |             if key in self.exclude: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |                 logger.caution(f"Excluding {key}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |                 continue | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |             key_hits: Sequence[AbstractHit] = key_hits | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |             n_compounds_0 = len({k.origin_inchikey for k in key_hits}) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |             part_path = self._path_of(path, key) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |             df = None | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |             if part_path.exists(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |                 df = self._read_part(key, part_path) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |             if df is None and n_compounds_0 >= self.min_compounds: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |                 df = self._calc_partial(key, key_hits) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |                 df.write_file(part_path, attrs=True, file_hash=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |                 logger.debug(f"Wrote results for {key} to {part_path}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |             if df is not None and self._should_include(df): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |                 good_keys[key] = part_path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |             if df is not None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |                 del df | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         big_df = self._concat_parts(good_keys) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |         big_df.write_file(to, attrs=True, file_hash=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |         logger.notice(f"Wrote {len(big_df):,} rows to {to}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |         logger.debug(f"Concatenating {len(big_df):,} files") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |         if not keep_temp: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |             for k in good_keys: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |                 unlink(self._path_of(path, k)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |     def _calc_partial(self, key: str, key_hits: HitDf) -> SimilarityDfLongForm: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |         df = self.calc_one(key, key_hits).to_long_form(kind="psi", key=key) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |         return df.set_attrs( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |             key=key, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |             quartiles=[float(df["value"].quantile(x)) for x in [0, 0.25, 0.5, 0.75, 1]], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |             n_hits=len(key_hits), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |             n_values=len(df["value"].unique()), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |             n_compounds=len(df["inchikey_1"].unique()), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |             n_real=len(df[(df["value"] > 0) & (df["value"] < 1)]), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |         ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |     def _should_include(self, df: SimilarityDfLongForm) -> bool: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |         key = df.attrs["key"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |         reqs = dict(n_compounds=self.min_compounds, n_hits=self.min_hits, n_real=self.min_nonzero) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |         for a, mn in reqs.items(): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |             v = df.attrs[a] | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |             if v < mn: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |                 logger.warning(f"Key {key}: {a} = {v:,} < {mn:,}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |                 return False | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |         return True | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |     def _read_part(self, key: str, part_path: Path) -> Optional[SimilarityDfLongForm]: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |         try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |             df = SimilarityDfLongForm.read_file(part_path, file_hash=True) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |             logger.warning(f"Results for key {key} already exist ({len(df):,} rows)") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |             return df | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |         except HashFileMissingError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |             logger.error(f"Extant results for key {key} appear incomplete; restarting") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |             logger.opt(exception=True).debug(f"Hash error for {key}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |             unlink(part_path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |         return None  #  calculate from scratch | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |     def _concat_parts(self, keys: Mapping[str, Path]): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |         logger.notice(f"Included {len(keys):,} keys: {', '.join(keys)}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |         dfs = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |         for key, pp in keys: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |             df = SimilarityDfLongForm.read_file(pp, attrs=True) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |             n_values = df.attrs["n_values"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 154 |  |  |             n_real = df.attrs["n_real"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 155 |  |  |             quartiles = df.attrs["quartiles"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 156 |  |  |             logger.info(f"Key {key}:") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |             prefix = f"    {key} {Chars.fatright}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  |             logger.info(f"{prefix} unique values = {n_values}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |             logger.info(f"{prefix} values in (0, 1) = {n_real,}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |             logger.info(f"{prefix} quartiles: " + " | ".join(quartiles)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |             dfs.append(df) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |         return SimilarityDfLongForm.of(dfs, keys=keys) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 163 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 164 |  |  |     def calc_one(self, key: str, hits: Sequence[AbstractHit]) -> SimilarityDfShortForm: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 165 |  |  |         ik2hits = Au.hit_multidict(hits, "origin_inchikey") | 
            
                                                                        
                            
            
                                    
            
            
                | 166 |  |  |         logger.info(f"Calculating J on {key} for {len(ik2hits):,} compounds and {len(hits):,} hits") | 
            
                                                                        
                            
            
                                    
            
            
                | 167 |  |  |         data = defaultdict(dict) | 
            
                                                                        
                            
            
                                    
            
            
                | 168 |  |  |         inf = _Inf(n=int(len(ik2hits) * (len(ik2hits) - 1) / 2)) | 
            
                                                                        
                            
            
                                    
            
            
                | 169 |  |  |         for (c1, hits1) in ik2hits.items(): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 170 |  |  |             for (c2, hits2) in ik2hits.items(): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 171 |  |  |                 if inf.is_used(c1, c2): | 
            
                                                                        
                            
            
                                    
            
            
                | 172 |  |  |                     continue | 
            
                                                                        
                            
            
                                    
            
            
                | 173 |  |  |                 z = 1 if c1 == c2 else self._j_prime(key, hits1, hits2) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 174 |  |  |                 data[c1][c2] = z | 
            
                                                                        
                            
            
                                    
            
            
                | 175 |  |  |                 inf.got(c1, c2, z) | 
            
                                                                        
                            
            
                                    
            
            
                | 176 |  |  |         inf.log("success") | 
            
                                                                        
                            
            
                                    
            
            
                | 177 |  |  |         return SimilarityDfShortForm.from_dict(data) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 178 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 179 |  |  |     def _path_of(self, path: Path, key: str): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 180 |  |  |         return path.parent / f".{path.name}-{key}.tmp.feather" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 181 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 182 |  |  |     def _j_prime( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 183 |  |  |         self, key: str, hits1: Collection[AbstractHit], hits2: Collection[AbstractHit] | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 184 |  |  |     ) -> float: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 185 |  |  |         if len(hits1) == 0 or len(hits2) == 0: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 186 |  |  |             return 0 | 
            
                                                                                                            
                            
            
                                    
            
            
                | 187 |  |  |         sources = {h.data_source for h in hits1}.intersection({h.data_source for h in hits2}) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 188 |  |  |         if len(sources) == 0: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 189 |  |  |             return float("NaN") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 190 |  |  |         values = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 191 |  |  |             self._jx( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 192 |  |  |                 key, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 193 |  |  |                 [h for h in hits1 if h.data_source == source], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 194 |  |  |                 [h for h in hits2 if h.data_source == source], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 195 |  |  |             ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 196 |  |  |             for source in sources | 
            
                                                                                                            
                            
            
                                    
            
            
                | 197 |  |  |         ] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 198 |  |  |         return float(math.fsum(values) / len(values)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 199 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _jx(
                    self, key: str, hits1: Collection[AbstractHit], hits2: Collection[AbstractHit]
                ) -> float:
                    """
                    Compute the mean ``_wedge / _vee`` ratio over the weight pairs of two hit sets.

                    ``Au.weights_of_pairs`` is asked for a mapping whose values unpack into
                    two weights; each pair contributes ``self._wedge(a, b) / self._vee(a, b)``,
                    and the arithmetic mean of those ratios is returned.

                    Args:
                        key: The search key; two specific keys trigger a temporary weight transform
                        hits1: Hits of the first item
                        hits2: Hits of the second item

                    Returns:
                        The mean ratio as a float
                    """
                    # TODO -- for testing only
                    # TODO: REMOVE ME!
                    # For these two keys every hit weight w is replaced by 10 ** -w beforehand.
                    transformed_keys = ("core.chemidplus.effects", "extra.chemidplus.specific-effects")
                    if key in transformed_keys:
                        hits1, hits2 = (
                            [hit.copy(weight=math.pow(10, -hit.weight)) for hit in group]
                            for group in (hits1, hits2)
                        )
                    weight_pairs = Au.weights_of_pairs(hits1, hits2)
                    ratios = []
                    for left, right in weight_pairs.values():
                        ratios.append(self._wedge(left, right) / self._vee(left, right))
                    return float(math.fsum(ratios) / len(ratios))
            
                                                                                                            
                            
            
                                    
            
            
                | 211 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _wedge(self, ca: float, cb: float) -> float:
                    """
                    Return the geometric mean of ``Au.elle(ca)`` and ``Au.elle(cb)``.

                    Used as the numerator of the per-pair ratio in ``_jx``.
                    """
                    product = Au.elle(ca) * Au.elle(cb)
                    return math.sqrt(product)
            
                                                                                                            
                            
            
                                    
            
            
                | 214 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _vee(self, ca: float, cb: float) -> float:
                    """
                    Return ``Au.elle(ca) + Au.elle(cb)`` minus their geometric mean.

                    Used as the denominator of the per-pair ratio in ``_jx``.
                    """
                    summed = Au.elle(ca) + Au.elle(cb)
                    geometric_mean = math.sqrt(Au.elle(ca) * Au.elle(cb))
                    return summed - geometric_mean
            
                                                                                                            
                            
            
                                    
            
            
                | 217 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 218 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class MatrixAlg(CleverEnum):
                    """
                    The available similarity-matrix algorithms.

                    Each member maps to the ``MatrixCalculator`` subclass that implements it.
                    """

                    j = ()

                    @property
                    def clazz(self) -> Type[MatrixCalculator]:
                        """
                        Return the calculator class that implements this algorithm.
                        """
                        implementations = {MatrixAlg.j: JPrimeMatrixCalculator}
                        return implementations[self]
            
                                                                                                            
                            
            
                                    
            
            
                | 225 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 226 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                @decorateme.auto_utils()
                class MatrixCalculation:
                    """
                    Factory for ``MatrixCalculator`` instances.
                    """

                    @classmethod
                    def create(
                        cls,
                        algorithm: Union[str, MatrixAlg],
                        *,
                        min_compounds: int,
                        min_nonzero: int,
                        min_hits: int,
                        exclude: Optional[Collection[str]] = None,
                    ) -> MatrixCalculator:
                        """
                        Instantiate the calculator class associated with ``algorithm``.

                        Args:
                            algorithm: A ``MatrixAlg`` member or a string accepted by ``MatrixAlg.of``
                            min_compounds: Forwarded to the calculator's constructor
                            min_nonzero: Forwarded to the calculator's constructor
                            min_hits: Forwarded to the calculator's constructor
                            exclude: Forwarded to the calculator's constructor; may be None

                        Returns:
                            A new instance of the class given by ``MatrixAlg.of(algorithm).clazz``
                        """
                        calculator_type = MatrixAlg.of(algorithm).clazz
                        settings = dict(
                            min_compounds=min_compounds,
                            min_nonzero=min_nonzero,
                            min_hits=min_hits,
                            exclude=exclude,
                        )
                        return calculator_type(**settings)
            
                                                                                                            
                            
            
                                    
            
            
                | 245 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 246 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 247 |  |  | __all__ = ["JPrimeMatrixCalculator", "MatrixAlg", "MatrixCalculation", "MatrixCalculator"] | 
            
                                                        
            
                                    
            
            
                | 248 |  |  |  |