| 1 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | PubChem querying API. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | from __future__ import annotations | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import abc | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | import logging | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | import time | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from urllib.error import HTTPError | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | from datetime import datetime, timezone | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | from pathlib import Path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | from typing import Optional, Sequence, Union, FrozenSet | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | import io | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | import gzip | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | import orjson | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | from pocketutils.core.dot_dict import NestedDotDict | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | from pocketutils.core.query_utils import QueryExecutor | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  | from mandos import MandosUtils | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  | from mandos.model.pubchem_data import PubchemData | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  | logger = logging.getLogger("mandos") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                class PubchemApi(metaclass=abc.ABCMeta):
                    """
                    Interface for looking up PubChem data about a compound.

                    ``fetch_data`` and ``find_similar_compounds`` only raise
                    ``NotImplementedError`` here; subclasses are expected to override them.
                    """

                    def fetch_data_from_cid(self, cid: int) -> Optional[PubchemData]:
                        """
                        Fetches the data for a compound identified by an integer ``cid``.

                        This exists alongside ``fetch_data`` so that the meaning of an int
                        argument is unambiguous; it simply forwards ``cid`` to ``fetch_data``.

                        Args:
                            cid: The integer identifier to look up

                        Returns:
                            Whatever ``fetch_data`` returns for ``cid``
                        """
                        # fetch_data is annotated for str, but the int is passed through on purpose
                        # noinspection PyTypeChecker
                        found = self.fetch_data(cid)
                        return found

                    def fetch_data(self, inchikey: str) -> Optional[PubchemData]:
                        """
                        Fetches the data for a compound; must be overridden.

                        Args:
                            inchikey: The lookup value (``fetch_data_from_cid`` also passes an int here)

                        Raises:
                            NotImplementedError: Always, in this base class
                        """
                        raise NotImplementedError()

                    def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]:
                        """
                        Finds compounds similar to ``inchi``; must be overridden.

                        Args:
                            inchi: The query compound, given as an int or a str
                            min_tc: A threshold (presumably a minimum Tanimoto coefficient -- TODO confirm)

                        Raises:
                            NotImplementedError: Always, in this base class
                        """
                        raise NotImplementedError()
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  | class QueryingPubchemApi(PubchemApi): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                def __init__(self):
                    """
                    Builds the ``QueryExecutor`` and keeps it as ``self._query``.
                    """
                    # NOTE(review): 0.22 and 0.25 are presumably the min and max delay (in seconds)
                    # between queries -- confirm against pocketutils' QueryExecutor signature
                    executor = QueryExecutor(0.22, 0.25)
                    self._query = executor
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                # Base URLs of the PubChem endpoints used by this class
                # PUG REST endpoint
                _pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
                # PUG View endpoint
                _pug_view = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
                # SDQ agent endpoint
                # NOTE(review): the attribute is spelled "_sdg" although the URL path says "sdq";
                # looks like a typo, but left unchanged because uses elsewhere are not visible here
                _sdg = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
                # Classifications endpoint
                _classifications = "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi"
                # link_db server endpoint
                _link_db = "https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi"
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def fetch_data(self, inchikey: str) -> Optional[PubchemData]:
                    """
                    Downloads the PubChem data for one compound and bundles it into a ``PubchemData``.

                    The lookup value is first resolved with ``_fetch_compound``. If that yields a
                    compound, the following are fetched in this order and stored in one dict:
                    the display record, the external tables, the link sets, the structure
                    (minus its ``props``), and the classification hierarchies.
                    A ``meta`` entry records the lookup value, start and finish timestamps,
                    and the elapsed time in nanoseconds (as a string).

                    Args:
                        inchikey: The value to look up; passed unchanged to ``_fetch_compound``

                    Returns:
                        ``None`` if ``_fetch_compound`` returns ``None``;
                        otherwise a ``PubchemData`` wrapping a ``NestedDotDict`` of the collected data
                    """
                    logger.info(f"Downloading PubChem data for {inchikey}")
                    # the wall-clock start is taken before the lookup, so it includes the lookup
                    started_at = datetime.now(timezone.utc).astimezone().isoformat()
                    meta = {"timestamp_fetch_started": started_at, "from_lookup": inchikey}
                    data = {"meta": meta}
                    tick_start = time.monotonic_ns()
                    cid = self._fetch_compound(inchikey)
                    if cid is None:
                        return None
                    data["record"] = self._fetch_display_data(cid)["Record"]
                    # Only the values are used for fetching; the keys are kept as a description
                    # of each source. Insertion order determines the order of the requests.
                    table_sources = {
                        "related:pubchem:related_compounds_with_annotation": "compound",
                        "drug:clinicaltrials.gov:clinical_trials": "clinicaltrials",
                        "pharm:pubchem:reactions": "pathwayreaction",
                        "uses:cpdat:uses": "cpdat",
                        "tox:chemidplus:acute_effects": "chemidplus",
                        "dis:ctd:associated_disorders_and_diseases": "ctd_chemical_disease",
                        "lit:pubchem:depositor_provided_pubmed_citations": "pubmed",
                        "patent:depositor_provided_patent_identifiers": "patent",
                        "bio:rcsb_pdb:protein_bound_3d_structures": "pdb",
                        "bio:dgidb:drug_gene_interactions": "dgidb",
                        "bio:ctd:chemical_gene_interactions": "ctdchemicalgene",
                        "bio:drugbank:drugbank_interactions": "drugbank",
                        "bio:drugbank:drug_drug_interactions": "drugbankddi",
                        "bio:pubchem:bioassay_results": "bioactivity",
                    }
                    link_set_sources = {
                        "lit:pubchem:chemical_cooccurrences_in_literature": "ChemicalNeighbor",
                        "lit:pubchem:gene_cooccurrences_in_literature": "ChemicalGeneSymbolNeighbor",
                        "lit:pubchem:disease_cooccurrences_in_literature": "ChemicalDiseaseNeighbor",
                    }
                    fetched_tables = {}
                    for table_name in table_sources.values():
                        fetched_tables[table_name] = self._fetch_external_table(cid, table_name)
                    data["external_tables"] = fetched_tables
                    fetched_link_sets = {}
                    for link_set_name in link_set_sources.values():
                        fetched_link_sets[link_set_name] = self._fetch_external_link_set(cid, link_set_name)
                    data["link_sets"] = fetched_link_sets
                    # take the first (and only) entry because just one compound was requested
                    structure = self._fetch_misc_data(cid)["PC_Compounds"][0]
                    # redundant with props section in record
                    del structure["props"]
                    data["structure"] = structure
                    data["classifications"] = self._fetch_hierarchies(cid)["hierarchies"]
                    tick_end = time.monotonic_ns()
                    meta["timestamp_fetch_finished"] = datetime.now(timezone.utc).astimezone().isoformat()
                    meta["fetch_nanos_taken"] = str(tick_end - tick_start)
                    # presumably removes every "DisplayControls" entry from the nested data;
                    # the helper is defined outside this view -- confirm
                    self._strip_by_key_in_place(data, "DisplayControls")
                    return PubchemData(NestedDotDict(data))
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]:
                    """
                    Runs a PubChem similarity search and collects the matching CIDs.

                    The search is submitted with a POST to the PUG ``compound/similarity``
                    endpoint. The list key is read from the ``Waiting`` section of the reply,
                    and the ``listkey`` endpoint is then polled until it contains a CID list
                    or 5 seconds (monotonic clock) have elapsed.

                    Args:
                        inchi: The query compound; passed to ``_query_and_type`` and also
                               interpolated into the search URL
                        min_tc: Sent as the ``Threshold`` URL parameter

                    Returns:
                        The values under ``IdentifierList.CID``, converted to ``int``

                    Raises:
                        TimeoutError: If no CID list was returned within the polling window
                    """
                    query_path = self._query_and_type(inchi)
                    search_url = (
                        f"{self._pug}/compound/similarity/{query_path}/{inchi}/JSON?Threshold={min_tc}"
                    )
                    submitted = self._query(search_url, method="post")
                    list_key = orjson.loads(submitted)["Waiting"]["ListKey"]
                    results_url = f"{self._pug}/compound/listkey/{list_key}/cids/JSON"
                    started = time.monotonic()
                    while True:
                        if time.monotonic() - started >= 5:
                            raise TimeoutError(f"Search for {inchi} using key {list_key} timed out")
                        # it'll wait as needed here
                        polled = NestedDotDict(orjson.loads(self._query(results_url)))
                        if polled.get("IdentifierList.CID") is None:
                            continue
                        return frozenset(polled.req_list_as("IdentifierList.CID", int))
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _fetch_compound(self, inchikey: Union[int, str]) -> Optional[int]:
                    """
                    Resolves a query to a CID and returns ``parent_or_self`` of its display record.

                    Args:
                        inchikey: The query, forwarded unchanged to ``_fetch_cid``

                    Returns:
                        ``None`` if ``_fetch_cid`` finds nothing; otherwise the ``parent_or_self``
                        attribute of a ``PubchemData`` built from only the ``Record`` section
                        of the display data
                    """
                    cid = self._fetch_cid(inchikey)
                    if cid is not None:
                        record = self._fetch_display_data(cid)["Record"]
                        wrapped = PubchemData(NestedDotDict({"record": record}))
                        return wrapped.parent_or_self
                    return None
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _fetch_cid(self, inchikey: str) -> Optional[int]:
                    """
                    Finds the CID of the compound whose standard InChIKey equals ``inchikey``.

                    Queries the PUG ``compound`` endpoint and scans every returned compound's
                    ``props`` for an entry labeled ``InChIKey`` / ``Standard`` whose string
                    value is exactly ``inchikey``. Distinct matches are kept in response order.

                    NOTE(review): ``_fetch_compound`` forwards a ``Union[int, str]`` here; a
                    query that is not an InChIKey can never equal the InChIKey prop compared
                    below, so it would yield ``None`` -- confirm this is intended.

                    Args:
                        inchikey: The InChIKey to look up

                    Returns:
                        The matching CID, or ``None`` if no returned compound matched.
                        If several distinct compounds matched, a warning is logged and the
                        first one is used.
                    """
                    # The PubChem API docs are wrong about this:
                    # Using ?cids_type=parent DOES NOT give the parent
                    # Ex: https://pubchem.ncbi.nlm.nih.gov/compound/656832
                    # This is cocaine HCl, which has cocaine (446220) as a parent
                    # https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/656832/JSON
                    # gives 656832 back again
                    # same thing when querying by inchikey
                    slash = self._query_and_type(inchikey)
                    url = f"{self._pug}/compound/{slash}/JSON"
                    data = self._query_json(url)
                    # Routine request tracing: debug level, so successful lookups
                    # do not show up as errors in the log
                    logger.debug(url)
                    # Each entry is a compound's ``id.id`` mapping (indexed by "cid" below)
                    found = []
                    for match in data["PC_Compounds"]:
                        has_standard_key = any(
                            prop["urn"]["label"] == "InChIKey"
                            and prop["urn"]["name"] == "Standard"
                            and prop["value"]["sval"] == inchikey
                            for prop in match["props"]
                        )
                        if has_standard_key and match["id"]["id"] not in found:
                            found.append(match["id"]["id"])
                    if not found:
                        return None
                    if len(found) > 1:
                        logger.warning(
                            f"Found {len(found)} CIDs for {inchikey}: {found}. Using first ({found[0]})."
                        )
                    cid = found[0]["cid"]
                    assert isinstance(cid, int), f"Type of {cid} is {type(cid)}"
                    return cid
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _fetch_display_data(self, cid: int) -> Optional[NestedDotDict]:
                    """
                    Fetches the PUG View compound data for ``cid`` with ``response_type=display``.

                    Args:
                        cid: The compound ID interpolated into the request URL

                    Returns:
                        The parsed JSON response, as returned by ``_query_json``
                    """
                    return self._query_json(
                        f"{self._pug_view}/data/compound/{cid}/JSON/?response_type=display"
                    )
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _fetch_misc_data(self, cid: int) -> Optional[NestedDotDict]:
                    """
                    Fetches the PUG ``compound/cid`` JSON record for ``cid``.

                    Args:
                        cid: The compound ID interpolated into the request URL

                    Returns:
                        The parsed JSON response, as returned by ``_query_json``
                    """
                    return self._query_json(f"{self._pug}/compound/cid/{cid}/JSON")
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _query_json(self, url: str) -> NestedDotDict:
                    """
                    Queries ``url`` and parses the response body as JSON.

                    Args:
                        url: The URL passed to ``_query``

                    Returns:
                        The parsed response wrapped in a ``NestedDotDict``

                    Raises:
                        ValueError: If the parsed response has a top-level ``Fault`` key
                    """
                    data = NestedDotDict(orjson.loads(self._query(url)))
                    if "Fault" in data:
                        # PubChem fault payloads nest Code and Message under "Fault";
                        # reading them from the top level always reported None
                        code = data.get("Fault.Code")
                        message = data.get("Fault.Message")
                        raise ValueError(f"Request failed ({code}) on {url}: {message}")
                    return data
            
                                                                                                            
                            
            
                                    
            
            
                | 173 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _fetch_external_link_set(self, cid: int, table: str) -> NestedDotDict:
                    """
                    Requests all links of one type for ``cid`` from the link database endpoint.

                    Args:
                        cid: Sent as the ``id_1`` URL parameter
                        table: Sent as the ``type`` URL parameter

                    Returns:
                        The parsed JSON response wrapped in a ``NestedDotDict``
                    """
                    raw = self._query(
                        f"{self._link_db}?format=JSON&type={table}&operation=GetAllLinks&id_1={cid}"
                    )
                    return NestedDotDict(orjson.loads(raw))
            
                                                                                                            
                                                                
            
                                    
            
            
                | 178 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 179 |  |  |     def _fetch_hierarchies(self, cid: int) -> NestedDotDict: | 
            
                                                                        
                            
            
                                    
            
            
                | 180 |  |  |         hids = { | 
            
                                                                        
                            
            
                                    
            
            
                | 181 |  |  |             "MeSH Tree": 1, | 
            
                                                                        
                            
            
                                    
            
            
                | 182 |  |  |             "ChEBI Ontology": 2, | 
            
                                                                        
                            
            
                                    
            
            
                | 183 |  |  |             "KEGG: Phytochemical Compounds": 5, | 
            
                                                                        
                            
            
                                    
            
            
                | 184 |  |  |             "KEGG: Drug": 14, | 
            
                                                                        
                            
            
                                    
            
            
                | 185 |  |  |             "KEGG: USP": 15, | 
            
                                                                        
                            
            
                                    
            
            
                | 186 |  |  |             "KEGG: Major components of natural products": 69, | 
            
                                                                        
                            
            
                                    
            
            
                | 187 |  |  |             "KEGG: Target-based Classification of Drugs": 22, | 
            
                                                                        
                            
            
                                    
            
            
                | 188 |  |  |             "KEGG: OTC drugs": 25, | 
            
                                                                        
                            
            
                                    
            
            
                | 189 |  |  |             "KEGG: Drug Classes": 96, | 
            
                                                                        
                            
            
                                    
            
            
                | 190 |  |  |             "CAMEO Chemicals": 86, | 
            
                                                                        
                            
            
                                    
            
            
                | 191 |  |  |             "WHO ATC Classification System": 79, | 
            
                                                                        
                            
            
                                    
            
            
                | 192 |  |  |             "Guide to PHARMACOLOGY Target Classification": 92, | 
            
                                                                        
                            
            
                                    
            
            
                | 193 |  |  |             "ChEMBL Target Tree": 87, | 
            
                                                                        
                            
            
                                    
            
            
                | 194 |  |  |             "EPA CPDat Classification": 99, | 
            
                                                                        
                            
            
                                    
            
            
                | 195 |  |  |             "FDA Pharm Classes": 78, | 
            
                                                                        
                            
            
                                    
            
            
                | 196 |  |  |             "ChemIDplus": 84, | 
            
                                                                        
                            
            
                                    
            
            
                | 197 |  |  |         } | 
            
                                                                        
                            
            
                                    
            
            
                | 198 |  |  |         build_up = [] | 
            
                                                                        
                            
            
                                    
            
            
                | 199 |  |  |         for hid in hids.values(): | 
            
                                                                        
                            
            
                                    
            
            
                | 200 |  |  |             url = f"{self._classifications}?format=json&hid={hid}&search_uid_type=cid&search_uid={cid}&search_type=list&response_type=display" | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 201 |  |  |             try: | 
            
                                                                        
                            
            
                                    
            
            
                | 202 |  |  |                 data = orjson.loads(self._query(url)) | 
            
                                                                        
                            
            
                                    
            
            
                | 203 |  |  |                 logger.debug(f"Found data for classifier {hid}, compound {cid}") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 204 |  |  |                 data = data["Hierarchies"]["Hierarchy"] | 
            
                                                                        
                            
            
                                    
            
            
                | 205 |  |  |                 if len(data) > 1: | 
            
                                                                        
                            
            
                                    
            
            
                | 206 |  |  |                     logger.warning( | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 207 |  |  |                         f"Multiple hierarchies for classifier {hid}, compound {cid}; using first" | 
            
                                                                        
                            
            
                                    
            
            
                | 208 |  |  |                     ) | 
            
                                                                        
                            
            
                                    
            
            
                | 209 |  |  |                     data = data[0] | 
            
                                                                        
                            
            
                                    
            
            
                | 210 |  |  |                 elif len(data) == 1: | 
            
                                                                        
                            
            
                                    
            
            
                | 211 |  |  |                     data = data[0] | 
            
                                                                        
                            
            
                                    
            
            
                | 212 |  |  |                 else: | 
            
                                                                        
                            
            
                                    
            
            
                | 213 |  |  |                     raise KeyError("Hierarchy") | 
            
                                                                        
                            
            
                                    
            
            
                | 214 |  |  |             except (HTTPError, KeyError, LookupError) as e: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 215 |  |  |                 logger.debug(f"No data for classifier {hid}, compound {cid}: {e}") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                        
                            
            
                                    
            
            
                | 216 |  |  |                 data = {} | 
            
                                                                        
                            
            
                                    
            
            
                | 217 |  |  |             build_up.append(data) | 
            
                                                                        
                            
            
                                    
            
            
                | 218 |  |  |         # These list all of the child nodes for each node | 
            
                                                                        
                            
            
                                    
            
            
                | 219 |  |  |         # Some of them are > 1000 items -- they're HUGE | 
            
                                                                        
                            
            
                                    
            
            
                | 220 |  |  |         # We don't expect to need to navigate to children | 
            
                                                                        
                            
            
                                    
            
            
                | 221 |  |  |         self._strip_by_key_in_place(build_up, "ChildID") | 
            
                                                                        
                            
            
                                    
            
            
                | 222 |  |  |         return NestedDotDict(dict(hierarchies=build_up)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 223 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 224 |  |  |     def _fetch_external_table(self, cid: int, table: str) -> Sequence[dict]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 225 |  |  |         url = self._external_table_url(cid, table) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 226 |  |  |         data = self._query(url) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 227 |  |  |         df: pd.DataFrame = pd.read_csv(io.StringIO(data)) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 228 |  |  |         return list(df.T.to_dict().values()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 229 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 230 |  |  |     def _external_table_url(self, cid: int, collection: str) -> str: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 231 |  |  |         return ( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 232 |  |  |             self._sdg | 
            
                                                                                                            
                            
            
                                    
            
            
                | 233 |  |  |             + "?infmt=json" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 234 |  |  |             + "&outfmt=csv" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 235 |  |  |             + "&query={ download : * , collection : " | 
            
                                                                                                            
                            
            
                                    
            
            
                | 236 |  |  |             + collection | 
            
                                                                                                            
                            
            
                                    
            
            
                | 237 |  |  |             + " , where :{ ands :[{ cid : " | 
            
                                                                                                            
                            
            
                                    
            
            
                | 238 |  |  |             + str(cid) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 239 |  |  |             + " }]}}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 240 |  |  |         ).replace(" ", "%22") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 241 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 242 |  |  |     def _query_and_type(self, inchi: Union[int, str], req_full: bool = False) -> str: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 243 |  |  |         allowed = ["cid", "inchi", "smiles"] if req_full else ["cid", "inchi", "inchikey", "smiles"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 244 |  |  |         if isinstance(inchi, int): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 245 |  |  |             return f"cid/{inchi}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 246 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 247 |  |  |             query_type = MandosUtils.get_query_type(inchi).name.lower() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 248 |  |  |             if query_type not in allowed: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 249 |  |  |                 raise ValueError(f"Can't query {inchi} with type {query_type}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 250 |  |  |             return f"{query_type}/{inchi}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 251 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 252 |  |  |     def _strip_by_key_in_place(self, data: Union[dict, list], bad_key: str) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 253 |  |  |         if isinstance(data, list): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 254 |  |  |             for x in data: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 255 |  |  |                 self._strip_by_key_in_place(x, bad_key) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 256 |  |  |         elif isinstance(data, dict): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 257 |  |  |             for k, v in list(data.items()): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 258 |  |  |                 if k == bad_key: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 259 |  |  |                     del data[k] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 260 |  |  |                 elif isinstance(v, (list, dict)): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 261 |  |  |                     self._strip_by_key_in_place(v, bad_key) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 262 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 263 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 264 |  |  | class CachingPubchemApi(PubchemApi): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 265 |  |  |     def __init__( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 266 |  |  |         self, cache_dir: Path, querier: Optional[QueryingPubchemApi], compress: bool = True | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 267 |  |  |     ): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 268 |  |  |         self._cache_dir = cache_dir | 
            
                                                                                                            
                            
            
                                    
            
            
                | 269 |  |  |         self._querier = querier | 
            
                                                                                                            
                            
            
                                    
            
            
                | 270 |  |  |         self._compress = compress | 
            
                                                                                                            
                            
            
                                    
            
            
                | 271 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 272 |  |  |     def fetch_data(self, inchikey: str) -> Optional[PubchemData]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 273 |  |  |         path = self.data_path(inchikey) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 274 |  |  |         if path.exists(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 275 |  |  |             logger.info(f"Found cached PubChem data at {path.absolute()}") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 276 |  |  |         elif self._querier is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 277 |  |  |             raise LookupError(f"Key {inchikey} not found in cache") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 278 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 279 |  |  |             logger.info(f"Downloading PubChem data for {inchikey} ...") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 280 |  |  |             data = self._querier.fetch_data(inchikey) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 281 |  |  |             path.parent.mkdir(parents=True, exist_ok=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 282 |  |  |             encoded = data.to_json() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 283 |  |  |             self._write_json(encoded, path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 284 |  |  |             logger.info(f"Wrote PubChem data to {path.absolute()}") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 285 |  |  |             return data | 
            
                                                                                                            
                            
            
                                    
            
            
                | 286 |  |  |         read = self._read_json(path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 287 |  |  |         return PubchemData(read) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 288 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 289 |  |  |     def _write_json(self, encoded: str, path: Path) -> None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 290 |  |  |         if self._compress: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 291 |  |  |             path.write_bytes(gzip.compress(encoded.encode(encoding="utf8"))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 292 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 293 |  |  |             path.write_text(encoded, encoding="utf8") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 294 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 295 |  |  |     def _read_json(self, path: Path) -> NestedDotDict: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 296 |  |  |         if self._compress: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 297 |  |  |             deflated = gzip.decompress(path.read_bytes()) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 298 |  |  |             read = orjson.loads(deflated) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 299 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 300 |  |  |             read = orjson.loads(path.read_text(encoding="utf8")) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 301 |  |  |         return NestedDotDict(read) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 302 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 303 |  |  |     def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 304 |  |  |         path = self.similarity_path(inchi) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 305 |  |  |         if not path.exists(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 306 |  |  |             df = None | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 307 |  |  |             existing = set() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 308 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 309 |  |  |             df = pd.read_csv(path, sep="\t") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 310 |  |  |             df = df[df["min_tc"] < min_tc] | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 311 |  |  |             existing = set(df["cid"].values) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 312 |  |  |         if len(existing) == 0: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 313 |  |  |             found = self._querier.find_similar_compounds(inchi, min_tc) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 314 |  |  |             path.parent.mkdir(parents=True, exist_ok=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 315 |  |  |             new_df = pd.DataFrame([pd.Series(dict(cid=cid, min_tc=min_tc)) for cid in found]) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 316 |  |  |             if df is not None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 317 |  |  |                 new_df = pd.concat([df, new_df]) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 318 |  |  |             new_df.to_csv(path, sep="\t") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 319 |  |  |             return frozenset(existing.union(found)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 320 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 321 |  |  |             return frozenset(existing) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 322 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 323 |  |  |     def data_path(self, inchikey: str): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 324 |  |  |         ext = ".json.gz" if self._compress else ".json" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 325 |  |  |         return self._cache_dir / "data" / f"{inchikey}{ext}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 326 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 327 |  |  |     def similarity_path(self, inchikey: str): | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 328 |  |  |         ext = ".tab.gz" if self._compress else ".tab" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 329 |  |  |         return self._cache_dir / "similarity" / f"{inchikey}{ext}" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 330 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 331 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 332 |  |  | __all__ = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 333 |  |  |     "PubchemApi", | 
            
                                                                                                            
                            
            
                                    
            
            
                | 334 |  |  |     "CachingPubchemApi", | 
            
                                                                                                            
                            
            
                                    
            
            
                | 335 |  |  |     "QueryingPubchemApi", | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 336 |  |  | ] | 
            
                                                        
            
                                    
            
            
                | 337 |  |  |  |