| 1 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  | Run searches and write files. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | from __future__ import annotations | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | import gzip | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from pathlib import Path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from typing import Sequence, Optional, Dict | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | import pandas as pd | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | from pocketutils.core.dot_dict import NestedDotDict | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | from pocketutils.tools.common_tools import CommonTools | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | from pocketutils.tools.path_tools import PathTools | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | from typeddfs import TypedDfs | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | from mandos import logger | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | from mandos.model import CompoundNotFoundError | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  | from mandos.model.chembl_support.chembl_utils import ChemblUtils | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  | from mandos.model.searches import Search | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  | from mandos.model.settings import MANDOS_SETTINGS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  | from mandos.search.chembl import ChemblSearch | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  | from mandos.search.pubchem import PubchemSearch | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  | from mandos.entries.api_singletons import Apis | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  | InputFrame = (TypedDfs.typed("InputFrame").require("inchikey")).build() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  | IdMatchFrame = ( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |     TypedDfs.typed("IdMatchFrame") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |     .require("inchikey") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |     .require("chembl_id") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |     .require("pubchem_id") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |     .strict() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  | ).build() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  | class SearcherUtils: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |     @classmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |     def dl( | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |         cls, | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |         inchikeys: Sequence[str], | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |         pubchem: bool = True, | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |         chembl: bool = True, | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |         hmdb: bool = True, | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |     ) -> IdMatchFrame: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |         # we actually cache the results, even though the underlying APIs cache | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |         # the reasons for this are a little obscure -- | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |         # when running a Searcher, we want to run before the FIRST search | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |         # for the typer commands to be replicas of the ``Entry.run`` methods, Searcher fetches before running a search | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |         # but if we have multiple searches (as in ``mandos search --config``), we only want that at the beginning | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |         # the alternative was having ``mandos search`` dynamically subclass each ``Entry`` -- which was really hard | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |         # this is much cleaner, even though it's redundant | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |         # if the cached results under /pubchem and /chembl are deleted, we unfortunately won't cache the results | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |         # when running this command | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |         # to fix that, we need to delete the cached /match dataframes | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |         # now that I'm writing this down, I realize this is pretty bad | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |         # TODO | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |         # noinspection PyPep8Naming | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |         Chembl, Pubchem = Apis.Chembl, Apis.Pubchem | 
                            
                    |  |  |  | 
                                                                                        
                                                                                            
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |         logger.notice(f"Using {Chembl}, {Pubchem}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |         key = hash(",".join(inchikeys)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |         cached_path = (MANDOS_SETTINGS.match_cache_path / str(key)).with_suffix(".feather") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |         if cached_path.exists(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |             logger.info(f"Found ID matching results at {cached_path}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 |  |  |             return IdMatchFrame.read_feather(cached_path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  |         found_chembl: Dict[str, str] = {} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  |         found_pubchem: Dict[str, str] = {} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |         if pubchem: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |             for inchikey in inchikeys: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |                 try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |                     cid = Pubchem.fetch_data(inchikey).cid | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |                     found_pubchem[inchikey] = str(cid) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |                     logger.info(f"Found:      PubChem {inchikey} ({cid})") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |                 except CompoundNotFoundError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |                     logger.error(f"NOT FOUND: PubChem {inchikey}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |                     logger.debug(f"Did not find PubChem {inchikey}", exc_info=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |         if chembl: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |             for inchikey in inchikeys: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |                 try: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |                     chid = ChemblUtils(Chembl).get_compound(inchikey).chid | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |                     found_chembl[inchikey] = chid | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |                     logger.info(f"Found:      ChEMBL {inchikey} ({chid})") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |                 except CompoundNotFoundError: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |                     logger.error(f"NOT FOUND: ChEMBL {inchikey}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |                     logger.debug(f"Did not find ChEMBL {inchikey}", exc_info=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |         df = pd.DataFrame([pd.Series(dict(inchikey=c)) for c in inchikeys]) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |         df["chembl_id"] = df["inchikey"].map(found_chembl.get) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |         df["pubchem_id"] = df["inchikey"].map(found_pubchem.get) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |         df = IdMatchFrame(df) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |         df.to_feather(cached_path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |         logger.info(f"Wrote {cached_path}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |     @classmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |     def read(cls, input_path: Path) -> InputFrame: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |         df = TypedDfs.untyped("Input").read_file(input_path, header=None, comment="#") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |         if "inchikey" in df.columns_names: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |             return InputFrame.convert(df) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |         elif ".lines" in input_path.name or ".txt" in input_path.name: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |             df.columns = ["inchikey"] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |             return InputFrame.convert(df) | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 101 |  |  |         raise ValueError(f"Could not parse {input_path}; no column 'inchikey'") | 
            
                                                                        
                            
            
                                    
            
            
                | 102 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 103 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 104 |  |  | class Searcher: | 
            
                                                                        
                            
            
                                    
            
            
                | 105 |  |  |     """ | 
            
                                                                        
                            
            
                                    
            
            
                | 106 |  |  |     Executes one or more searches and saves the results to CSV files. | 
            
                                                                        
                            
            
                                    
            
            
                | 107 |  |  |     Create and use once. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |     def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |         Constructor. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |         Args: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |             searches: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |             input_path: Path to the input file of one of the formats: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |                 - .txt containing one InChI Key per line | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |                 - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |         self.what = searches | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |         self.input_path: Optional[Path] = input_path | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |         self.input_df: InputFrame = None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |         self.output_paths = { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |             what.key: self._output_path_of(path, path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |             for what, path in CommonTools.zip_list(searches, to) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |         } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |         if str(to).startswith("."): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |             pass | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |     def search(self) -> Searcher: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |         Performs the search, and writes data. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |         """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |         if self.input_df is not None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |             raise ValueError(f"Already ran a search") | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |         self.input_df = SearcherUtils.read(self.input_path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |         inchikeys = self.input_df["inchikey"].unique() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |         has_pubchem = any((isinstance(what, PubchemSearch) for what in self.what)) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |         has_chembl = any((isinstance(what, ChemblSearch) for what in self.what)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |         # find the compounds first so the user knows what's missing before proceeding | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |         SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |         for what in self.what: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |             output_path = self.output_paths[what.key] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |             df = what.find_to_df(inchikeys) | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |             # TODO keep any other columns in input_df | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |             df.to_csv(output_path) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |             params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |             metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |             metadata.write_json(output_path.with_suffix(".json")) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |             logger.notice(f"Wrote {what.key} to {output_path}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |         return self | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |     def _output_path_of(self, what: Search, to: Optional[Path]) -> Path: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 154 |  |  |         if to is None: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 155 |  |  |             return self._default_path_of(what) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 156 |  |  |         elif str(to).startswith("."): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |             return self._default_path_of(what).with_suffix(str(to)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |             return to | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |     def _default_path_of(self, what: Search) -> Path: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |         parent = self.input_path.parent / (self.input_path.stem + "-output") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 163 |  |  |         parent.mkdir(exist_ok=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 164 |  |  |         child = what.key + ".csv" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 165 |  |  |         node = PathTools.sanitize_path_node(child) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |         if (parent / node).resolve() != (parent / child).resolve(): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 167 |  |  |             logger.debug(f"Path {child} sanitized to {node}") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 168 |  |  |         return parent / node | 
            
                                                                                                            
                            
            
                                    
            
            
                | 169 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 170 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 171 |  |  | __all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"] | 
            
                                                        
            
                                    
            
            
                | 172 |  |  |  |