| 1 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | PubChem caching API. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | from __future__ import annotations | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import gzip | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | import os | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from pathlib import Path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from typing import FrozenSet, Optional, Union | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | import orjson | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | from pocketutils.core.dot_dict import NestedDotDict | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from pocketutils.core.exceptions import XValueError | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | from mandos.model.apis.pubchem_support.pubchem_data import PubchemData | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | from mandos.model.settings import SETTINGS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | from mandos.model.utils.setup import logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class CachingPubchemApi(PubchemApi):
                    """
                    PubChem API backed by an on-disk cache, with an optional fallback query API.

                    Every key (CID or InChIKey) maps to ``<cache_dir>/data/<key>.json.gz``
                    (see ``data_path``). The data is written to the file for
                    ``data.parent_or_self``; the InChIKey, the siblings, and the key that was
                    originally requested are hard links to that file. When the query API
                    reports that a compound does not exist, an empty JSON object is written
                    for that key so that later lookups fail without querying again.
                    """

                    def __init__(
                        self,
                        query: Optional[QueryingPubchemApi],
                        cache_dir: Path = SETTINGS.pubchem_cache_path,
                    ):
                        """
                        Args:
                            query: API used to fetch data that is not cached yet.
                                   If ``None``, only cached data can be returned.
                            cache_dir: Directory under which the ``data`` subdirectory lives.
                        """
                        self._cache_dir = cache_dir
                        self._query = query

                    def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
                        """
                        Returns the data for a compound, from the cache if present, else by downloading.

                        Args:
                            inchikey_or_cid: The InChIKey or CID to look up.

                        Raises:
                            PubchemCompoundLookupError: If the cache records that the compound was
                                previously not found, if it is not cached and no query API was
                                given, or if the query API itself fails to find it.
                        """
                        path = self.data_path(inchikey_or_cid)
                        if not path.exists():
                            logger.debug(f"Did NOT find cached PubChem data for {inchikey_or_cid}")
                            return self._download(inchikey_or_cid)
                        logger.debug(f"Found cached PubChem data for {inchikey_or_cid}")
                        data = self._read_json(path)
                        if data is None:
                            # an empty cached file is the marker left by a failed lookup
                            raise PubchemCompoundLookupError(
                                f"{inchikey_or_cid} previously not found in PubChem"
                            )
                        self._write_siblings(data)  # TODO: remove
                        return data

                    def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
                        """
                        Fetches the data through the query API and stores it in the cache.

                        Raises:
                            PubchemCompoundLookupError: If there is no query API, or if the query
                                API raises it (an empty marker file is written first).
                        """
                        if self._query is None:
                            raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
                        try:
                            data: PubchemData = self._query.fetch_data(inchikey_or_cid)
                        except PubchemCompoundLookupError:
                            # remember the failure so the next fetch_data() does not query again
                            path = self.data_path(inchikey_or_cid)
                            NestedDotDict({}).write_json(path, mkdirs=True)
                            logger.debug(f"Wrote empty PubChem data to {path}")
                            raise
                        cid = data.parent_or_self  # if there's ever a parent of a parent, this will NOT work
                        path = self.data_path(cid)
                        if path.exists():
                            logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} exists")
                            logger.warning(f"Writing over {path} for {inchikey_or_cid}")
                        else:
                            logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} does not exist")
                        data._data.write_json(path, mkdirs=True)
                        self._write_siblings(data, inchikey_or_cid)
                        logger.debug(f"Wrote PubChem data to {path.absolute()}")
                        logger.info(f"Got PubChem data for {inchikey_or_cid}")
                        return data

                    def _write_siblings(self, data: PubchemData, *others: Union[int, str]) -> None:
                        """
                        Hard-links every alias of a compound to its main cache file.

                        The main file is the one for ``data.parent_or_self``. The aliases are
                        ``data.inchikey``, everything in ``data.siblings``, and ``others``.

                        Args:
                            data: The compound data whose aliases should be linked.
                            others: Extra keys (InChIKeys or CIDs) to link as well.
                        """
                        cid = data.parent_or_self
                        path = self.data_path(cid)
                        # Collect raw keys, not paths: data_path() is applied to each one in the
                        # loop, so a Path placed here would end up as '<path>.json.gz.json.gz'.
                        aliases = {data.inchikey, *data.siblings, *others}
                        for sibling in aliases:
                            link = self.data_path(sibling)
                            if link != path and link.resolve() != path.resolve():
                                link.unlink(missing_ok=True)
                                # os.link(src, dst) is what the deprecated Path.link_to() called
                                os.link(path, link)
                        logger.debug(f"Added aliases {','.join([str(s) for s in aliases])} ⇌ {cid} ({path})")

                    def data_path(self, inchikey_or_cid: Union[int, str]) -> Path:
                        """
                        Returns the cache file path for a key: ``<cache_dir>/data/<key>.json.gz``.
                        """
                        return self._cache_dir / "data" / f"{inchikey_or_cid}.json.gz"

                    def _read_json(self, path: Path) -> Optional[PubchemData]:
                        """
                        Reads a cached file; returns ``None`` if the stored JSON object is empty.
                        """
                        dot = NestedDotDict.read_json(path)
                        return PubchemData(dot) if len(dot) > 0 else None
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  | __all__ = ["CachingPubchemApi"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |  |