1
|
|
|
""" |
2
|
|
|
Run searches and write files. |
3
|
|
|
""" |
4
|
|
|
|
5
|
|
|
from __future__ import annotations |
6
|
|
|
|
7
|
|
|
import time |
8
|
|
|
from dataclasses import dataclass |
9
|
|
|
from datetime import timedelta |
10
|
|
|
from pathlib import Path |
11
|
|
|
from typing import Sequence |
12
|
|
|
|
13
|
|
|
from pocketutils.core.exceptions import IllegalStateError |
|
|
|
|
14
|
|
|
from typeddfs import Checksums, TypedDfs |
|
|
|
|
15
|
|
|
|
16
|
|
|
from mandos.model import CompoundNotFoundError |
17
|
|
|
from mandos.model.hit_dfs import HitDf |
18
|
|
|
from mandos.model.hits import AbstractHit |
19
|
|
|
from mandos.model.search_caches import SearchCache |
20
|
|
|
from mandos.model.searches import Search, SearchError |
21
|
|
|
from mandos.model.settings import SETTINGS |
22
|
|
|
from mandos.model.utils.setup import logger |
23
|
|
|
|
24
|
|
|
|
25
|
|
|
def _fix_cols(df): |
|
|
|
|
26
|
|
|
return df.rename(columns={s: s.lower() for s in df.columns}) |
27
|
|
|
|
28
|
|
|
|
29
|
|
|
# Typed input frame: one row per compound.
# Only "inchikey" is required; "inchi", "smiles", and "compound_id" are
# reserved optional string columns, and any extra columns are tolerated
# (strict(cols=False)) -- Searcher._save copies them into the output.
# NOTE(review): .secure() presumably enables typeddfs hashing/integrity
# checks -- confirm against the typeddfs builder documentation.
InputCompoundsDf = (
    TypedDfs.typed("InputCompoundsDf")
    .require("inchikey")
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    .post(_fix_cols)  # lower-case all column names after reading
    .strict(cols=False)
    .secure()
).build()
37
|
|
|
|
38
|
|
|
|
39
|
|
|
@dataclass(frozen=True, repr=True, order=True)
class SearchReturnInfo:
    """
    Summary of one :meth:`Searcher.search` run.
    """

    # compounds already done before this run started (cache position),
    # or all compounds when the output was already complete
    n_kept: int
    # compounds actually processed during this run
    n_processed: int
    # compounds that raised CompoundNotFoundError (logged, not fatal)
    n_errored: int
    # wall-clock duration of the run (0 if nothing needed to run)
    time_taken: timedelta
45
|
|
|
|
46
|
|
|
|
47
|
|
|
@dataclass(frozen=True, repr=True)
class Searcher:
    """
    Executes one or more searches and saves the results.
    Create and use once.
    """

    # the search to run (provides .find, .key, .search_class, .get_params)
    what: Search
    # input compounds; only the "inchikey" column is required
    input_df: InputCompoundsDf
    # output file path
    to: Path
    # passed through to SearchCache; controls resuming behavior
    proceed: bool
    # passed through to SearchCache; controls restarting behavior
    restart: bool

    def search(self) -> SearchReturnInfo:
        """
        Performs the search, and writes data.

        Returns:
            A :class:`SearchReturnInfo` with counts and timing.

        Raises:
            SearchError: if the underlying search fails on any compound
                (chained from the original exception).
            IllegalStateError: if the cache position does not match the
                number of input compounds when the loop finishes.
        """
        inchikeys = self.input_df["inchikey"].unique()
        if self.is_complete:
            logger.info(f"{self.to} already complete")
            return SearchReturnInfo(
                n_kept=len(inchikeys), n_processed=0, n_errored=0, time_taken=timedelta(seconds=0)
            )
        logger.info(f"Will save every {SETTINGS.save_every} compounds")
        logger.info(f"Writing {self.what.key} to {self.to}")
        annotes = []
        compounds_run = set()
        cache = SearchCache(self.to, inchikeys, restart=self.restart, proceed=self.proceed)
        # refresh so we know it's (no longer) complete
        # this would only happen if we're forcing this -- which is not currently allowed
        (
            Checksums()
            .load_dirsum_of_file(self.to, missing_ok=True)
            .remove(self.to, missing_ok=True)
            .write(rm_if_empty=True)
        )
        t0, n0, n_proc, n_err = time.monotonic(), cache.at, 0, 0
        while True:
            try:
                compound = cache.next()
            except StopIteration:
                break
            try:
                # tag all log records from the search with the compound
                with logger.contextualize(compound=compound):
                    x = self.what.find(compound)
                annotes.extend(x)
            except CompoundNotFoundError:
                # not-found is expected and non-fatal; count it and move on
                logger.info(f"Compound {compound} not found for {self.what.key}")
                x = []
                n_err += 1
            except Exception as e:
                # chain the cause explicitly so the original traceback survives
                raise SearchError(
                    f"Failed {self.what.key} [{self.what.search_class}] on compound {compound}",
                    compound=compound,
                    search_key=self.what.key,
                    search_class=self.what.search_class,
                ) from e
            compounds_run.add(compound)
            logger.debug(f"Found {len(x)} {self.what.search_name()} annotations for {compound}")
            n_proc += 1
            # logging, caching, and such:
            on_nth = cache.at % SETTINGS.save_every == SETTINGS.save_every - 1
            is_last = cache.at == len(inchikeys) - 1
            if on_nth or is_last:
                logger.log(
                    "NOTICE" if is_last else "INFO",
                    f"Found {len(annotes)} {self.what.search_name()} annotations"
                    + f" for {cache.at} of {len(inchikeys)} compounds",
                )
                self._save(annotes, done=is_last)
                cache.save(*compounds_run)  # CRITICAL -- do this AFTER saving
        # done!
        i1, t1 = cache.at, time.monotonic()
        # explicit check rather than `assert`, which is stripped under -O
        if i1 != len(inchikeys):
            raise IllegalStateError(f"Processed {i1} of {len(inchikeys)} compounds for {self.to}")
        cache.kill()
        logger.success(f"Wrote {self.what.key} to {self.to}")
        return SearchReturnInfo(
            n_kept=n0, n_processed=n_proc, n_errored=n_err, time_taken=timedelta(seconds=t1 - t0)
        )

    @property
    def is_partial(self) -> bool:
        """Whether the output file exists but is not marked complete."""
        return self.to.exists() and not self.is_complete

    @property
    def is_complete(self) -> bool:
        """
        Whether the output file is listed in its directory checksum file.

        Raises:
            IllegalStateError: if marked complete but the file is missing.
        """
        done = self.to in Checksums().load_dirsum_of_file(self.to)
        if done and not self.to.exists():
            raise IllegalStateError(f"{self.to} marked complete but does not exist")
        return done

    def _save(self, hits: Sequence[AbstractHit], *, done: bool):
        """
        Builds a HitDf from *hits*, copies extra input columns, and writes it.

        Args:
            hits: the annotations collected so far
            done: whether this is the final write (writes the dir hash)
        """
        df = HitDf.from_hits(hits)
        # keep all of the original extra columns from the input
        # e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
        # (set_index is loop-invariant, so compute it once)
        by_inchikey = self.input_df.set_index("inchikey")
        for extra_col in [c for c in self.input_df.columns if c != "inchikey"]:
            extra_mp = by_inchikey[extra_col].to_dict()
            df[extra_col] = df["origin_inchikey"].map(extra_mp.get)
        # write the file
        df: HitDf = HitDf.of(df)
        params = self.what.get_params()
        df = df.set_attrs(**params, key=self.what.key)
        df.write_file(self.to, mkdirs=True, attrs=True, dir_hash=done)
        logger.debug(f"Saved {len(df)} rows to {self.to}")
152
|
|
|
|
153
|
|
|
|
154
|
|
|
# Public API of this module.
__all__ = ["Searcher", "InputCompoundsDf", "SearchReturnInfo"]
155
|
|
|
|