| 1 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | PubChem caching API. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | from __future__ import annotations | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import gzip | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | import os | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from pathlib import Path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from typing import FrozenSet, Optional, Union | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | import orjson | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | from pocketutils.core.dot_dict import NestedDotDict | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from pocketutils.core.exceptions import XValueError | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | from mandos.model.apis.pubchem_support.pubchem_data import PubchemData | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | from mandos.model.settings import SETTINGS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | from mandos.model.utils.setup import logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | class CachingPubchemApi(PubchemApi): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |     def __init__( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |         self, | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |         query: Optional[QueryingPubchemApi], | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |         cache_dir: Path = SETTINGS.pubchem_cache_path, | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |     ): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |         self._cache_dir = cache_dir | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |         self._query = query | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |     def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |         path = self.data_path(inchikey_or_cid) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |         if path.exists(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |             logger.debug(f"Found cached PubChem data") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |             data = self._read_json(path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |             if data is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |                 raise PubchemCompoundLookupError( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |                     f"{inchikey_or_cid} previously not found in PubChem" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |                 ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |             self._write_siblings(data)  # TODO: remove | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |             return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |         return self._download(inchikey_or_cid) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 44 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 45 |  |  |     def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData: | 
            
                                                                        
                            
            
                                    
            
            
                | 46 |  |  |         if self._query is None: | 
            
                                                                        
                            
            
                                    
            
            
                | 47 |  |  |             raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached") | 
            
                                                                        
                            
            
                                    
            
            
                | 48 |  |  |         # logger.debug(f"Downloading PubChem data for {inchikey_or_cid}") | 
            
                                                                        
                            
            
                                    
            
            
                | 49 |  |  |         try: | 
            
                                                                        
                            
            
                                    
            
            
                | 50 |  |  |             data: PubchemData = self._query.fetch_data(inchikey_or_cid) | 
            
                                                                        
                            
            
                                    
            
            
                | 51 |  |  |         except PubchemCompoundLookupError: | 
            
                                                                        
                            
            
                                    
            
            
                | 52 |  |  |             data = PubchemData(NestedDotDict({})) | 
            
                                                                        
                            
            
                                    
            
            
                | 53 |  |  |             path = self.data_path(inchikey_or_cid) | 
            
                                                                        
                            
            
                                    
            
            
                | 54 |  |  |             path.parent.mkdir(parents=True, exist_ok=True) | 
            
                                                                        
                            
            
                                    
            
            
                | 55 |  |  |             path.write_bytes(gzip.compress(data.to_json().encode(encoding="utf8"))) | 
            
                                                                        
                            
            
                                    
            
            
                | 56 |  |  |             logger.debug(f"Wrote empty PubChem data to {path}") | 
            
                                                                        
                            
            
                                    
            
            
                | 57 |  |  |             raise | 
            
                                                                        
                            
            
                                    
            
            
                | 58 |  |  |         cid = data.parent_or_self  # if there's ever a parent of a parent, this will NOT work | 
            
                                                                        
                            
            
                                    
            
            
                | 59 |  |  |         path = self.data_path(cid) | 
            
                                                                        
                            
            
                                    
            
            
                | 60 |  |  |         if path.exists(): | 
            
                                                                        
                            
            
                                    
            
            
                | 61 |  |  |             logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} exists") | 
            
                                                                        
                            
            
                                    
            
            
                | 62 |  |  |             logger.caution(f"Writing over {path} for {inchikey_or_cid}") | 
            
                                                                        
                            
            
                                    
            
            
                | 63 |  |  |         else: | 
            
                                                                        
                            
            
                                    
            
            
                | 64 |  |  |             logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} does not exist") | 
            
                                                                        
                            
            
                                    
            
            
                | 65 |  |  |         path.parent.mkdir(parents=True, exist_ok=True) | 
            
                                                                        
                            
            
                                    
            
            
                | 66 |  |  |         path.write_bytes(gzip.compress(data.to_json().encode(encoding="utf8"))) | 
            
                                                                        
                            
            
                                    
            
            
                | 67 |  |  |         self._write_siblings(data) | 
            
                                                                        
                            
            
                                    
            
            
                | 68 |  |  |         logger.debug(f"Wrote PubChem data to {path.absolute()}") | 
            
                                                                        
                            
            
                                    
            
            
                | 69 |  |  |         logger.info(f"Got PubChem data for {inchikey_or_cid}") | 
            
                                                                        
                            
            
                                    
            
            
                | 70 |  |  |         return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |     def _write_siblings(self, data: PubchemData): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |         cid = data.parent_or_self | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         path = self.data_path(cid) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |         aliases = {self.data_path(data.inchikey), *data.siblings} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |         for sibling in aliases: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |             link = self.data_path(sibling) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |             link.unlink(missing_ok=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |             path.link_to(link) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |         logger.debug(f"Added aliases {','.join(aliases)} ⇌ {cid} ({path})") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |     def data_path(self, inchikey_or_cid: Union[int, str]) -> Path: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |         return self._cache_dir / "data" / f"{inchikey_or_cid}.json.gz" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |     def _read_json(self, path: Path) -> Optional[PubchemData]: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |         deflated = gzip.decompress(path.read_bytes()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         read = orjson.loads(deflated) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |         return PubchemData(NestedDotDict(read)) if len(read) > 0 else None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |     def similarity_path(self, inchi: str, min_tc: float) -> Path: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |         if not (min_tc * 100).is_integer(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |             raise XValueError(f"min_tc {min_tc} is not an increment of 1%") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |         percent = int(min_tc * 100) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |         path = self._cache_dir / "similarity" / f"{inchi}_{percent}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |         return path.with_suffix(SETTINGS.archive_filename_suffix) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |     def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |         logger.debug(f"Searching for {inchi} with min TC {min_tc}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |         path = self.similarity_path(inchi, min_tc) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |         if path.exists(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |             df = pd.read_file(path) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |             return frozenset(set(df["cid"].values)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |         found = self._query.find_similar_compounds(inchi, min_tc) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |         df = pd.DataFrame([pd.Series(dict(cid=cid)) for cid in found]) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |         path.parent.mkdir(parents=True, exist_ok=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |         df.write_file(path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |         logger.debug(f"Wrote {len(df)} values for {inchi} with min TC {min_tc}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |         return frozenset(set(df["cid"].values)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 111 |  |  | __all__ = ["CachingPubchemApi"] | 
            
                                                        
            
                                    
            
            
                | 112 |  |  |  |