1
|
|
|
""" |
2
|
|
|
API for PubChem similarity search. |
3
|
|
|
""" |
4
|
|
|
from __future__ import annotations |
5
|
|
|
|
6
|
|
|
import time |
7
|
|
|
from pathlib import Path |
8
|
|
|
from typing import FrozenSet |
9
|
|
|
|
10
|
|
|
import orjson |
|
|
|
|
11
|
|
|
import pandas as pd |
|
|
|
|
12
|
|
|
from pocketutils.core.dot_dict import NestedDotDict |
|
|
|
|
13
|
|
|
from pocketutils.core.exceptions import DownloadTimeoutError, XValueError |
|
|
|
|
14
|
|
|
from pocketutils.core.query_utils import QueryExecutor |
|
|
|
|
15
|
|
|
from typeddfs import TypedDfs |
|
|
|
|
16
|
|
|
|
17
|
|
|
from mandos.model.apis.similarity_api import SimilarityApi |
18
|
|
|
from mandos.model.settings import QUERY_EXECUTORS, SETTINGS |
19
|
|
|
from mandos.model.utils.setup import logger |
20
|
|
|
|
21
|
|
|
# Typed DataFrame class for similarity hits; it requires an integer ``cid`` column.
SimilarityDf = (
    TypedDfs.typed("SimilarityDf")
    .require("cid", dtype=int)
    .secure()
    .build()
)
22
|
|
|
|
23
|
|
|
|
24
|
|
|
class QueryingPubchemSimilarityApi(SimilarityApi):
    """
    Similarity search that queries the PubChem PUG REST service directly.

    A search is submitted with a POST request whose response carries a list key.
    That list key is then polled until the response contains the matching CIDs
    or the polling timeout elapses.
    """

    _pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"

    def __init__(
        self,
        executor: QueryExecutor = QUERY_EXECUTORS.pubchem,
        *,
        timeout_sec: float = 5.0,
    ):
        """
        Args:
            executor: Callable used to perform every HTTP request.
            timeout_sec: Maximum time, in seconds, to keep polling for the
                results of a submitted search.
        """
        self._executor = executor
        self._timeout_sec = timeout_sec

    def search(self, inchi: str, min_tc: float) -> FrozenSet[int]:
        """
        Find the PubChem CIDs of compounds similar to a query compound.

        Args:
            inchi: Identifier of the query compound. It is placed in the
                ``inchikey`` segment of the request URL.
                NOTE(review): presumably an InChIKey despite the name -- confirm.
            min_tc: Minimum Tanimoto coefficient, given as a fraction
                (e.g. ``0.9`` for 90%).

        Returns:
            The CIDs listed in the response, as a frozenset of ints.

        Raises:
            DownloadTimeoutError: If no CID list was returned within the
                polling timeout.
        """
        # PUG REST takes ``Threshold`` as an integer percentage, so the
        # fraction is converted (rounded to the nearest whole percent).
        threshold = round(min_tc * 100)
        submitted = self._executor(
            f"{self._pug}/compound/similarity/inchikey/{inchi}/JSON?Threshold={threshold}",
            method="post",
        )
        key = orjson.loads(submitted)["Waiting"]["ListKey"]
        t0 = time.monotonic()
        while time.monotonic() - t0 < self._timeout_sec:
            # No explicit sleep: the executor is expected to wait between
            # requests as needed.
            raw = self._executor(f"{self._pug}/compound/listkey/{key}/cids/JSON")
            resp = NestedDotDict(orjson.loads(raw))
            if resp.get("IdentifierList.CID") is not None:
                return frozenset(resp.req_list_as("IdentifierList.CID", int))
        raise DownloadTimeoutError(f"Search for {inchi} using key {key} timed out")
44
|
|
|
|
45
|
|
|
|
46
|
|
|
class CachingPubchemSimilarityApi(SimilarityApi):
    """
    Similarity search that stores each result on disk and reuses it.

    A search whose cache file already exists is answered from that file;
    otherwise the wrapped querying API is called and its result is written
    to the cache file before being returned.
    """

    def __init__(self, query: QueryingPubchemSimilarityApi):
        """
        Args:
            query: API used to run searches that are not cached yet.
        """
        self._query = query

    def path(self, inchi: str, min_tc: float) -> Path:
        """
        Return the cache file path for a search.

        Args:
            inchi: Identifier of the query compound; used in the file name.
            min_tc: Minimum Tanimoto coefficient as a fraction; must be a
                whole number of percent (e.g. ``0.29``).

        Raises:
            XValueError: If ``min_tc`` is not a multiple of 1%.
        """
        message = f"min_tc {min_tc} is not an increment of 1%"
        scaled = min_tc * 100
        try:
            percent = round(scaled)
        except (ValueError, OverflowError):
            # NaN and infinity cannot be rounded to an int.
            raise XValueError(message) from None
        # Compare with a tolerance instead of ``float.is_integer``: products
        # such as ``0.29 * 100`` evaluate to 28.999999999999996 and would
        # otherwise be rejected even though 29% is a valid increment.
        if abs(scaled - percent) > 1e-9:
            raise XValueError(message)
        # NOTE(review): ``_cache_dir`` is not assigned in this class;
        # presumably it is provided by ``SimilarityApi`` -- confirm.
        path = self._cache_dir / "similarity" / f"{inchi}_{percent}"
        return path.with_suffix(SETTINGS.archive_filename_suffix)

    def search(self, inchi: str, min_tc: float) -> FrozenSet[int]:
        """
        Find the PubChem CIDs of compounds similar to a query compound.

        Args:
            inchi: Identifier of the query compound.
            min_tc: Minimum Tanimoto coefficient as a fraction; must be a
                whole number of percent.

        Returns:
            The CIDs read from the cache file if it exists, otherwise the
            CIDs returned by the wrapped API (after writing them to the cache).

        Raises:
            XValueError: If ``min_tc`` is not a multiple of 1%.
        """
        logger.info(f"Searching for {inchi} with min TC {min_tc}")
        path = self.path(inchi, min_tc)
        if path.exists():
            return self._cid_set(SimilarityDf.read_file(path))
        found = self._query.search(inchi, min_tc)
        df: SimilarityDf = SimilarityDf.of([pd.Series(dict(cid=cid)) for cid in found])
        df.write_file(path, mkdirs=True, dir_hash=True)
        logger.info(f"Wrote {len(df)} values for {inchi} with min TC {min_tc}")
        return self._cid_set(df)

    @staticmethod
    def _cid_set(df: SimilarityDf) -> FrozenSet[int]:
        """Collect the ``cid`` column into a frozenset of plain Python values."""
        # ``tolist`` converts numpy scalars to Python scalars, so the result
        # matches the ``FrozenSet[int]`` annotation.
        return frozenset(df["cid"].tolist())
68
|
|
|
|
69
|
|
|
|
70
|
|
|
# Public names exported by this module.
__all__ = [
    "QueryingPubchemSimilarityApi",
    "CachingPubchemSimilarityApi",
]
71
|
|
|
|