mandos.model.apis.querying_pubchem_api.QueryingPubchemApi.__init__() - Code Metrics - Inspection of "build(deps): bump pyarrow from 3.0.0 to 4.0.1" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — dependabot/pip/pyarrow-4.0.1 ( ca09ce...b2836e )

unknown

created 2021-07-05 18:49 UTC

QueryingPubchemApi.__init__() A

↳ Parent: mandos.model.apis.querying_pubchem_api

Complexity

Conditions

Size

Total Lines	13
Code Lines	12

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	12
nop	6
dl	0
loc	13
rs	9.8
c	0
b	0
f	0

"""
PubChem querying API.
"""
from __future__ import annotations

import io
import re
import time
from datetime import datetime, timezone
from typing import FrozenSet, Mapping, Optional, Sequence, Union
from urllib.error import HTTPError

import orjson

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.core.query_utils import QueryExecutor


from mandos import logger
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData


class QueryingPubchemApi(PubchemApi):

    def __init__(

        self,

        chem_data: bool = False,

        extra_tables: bool = False,

        classifiers: bool = False,

        extra_classifiers: bool = False,

        query: Optional[QueryExecutor] = None,

    ):
        self._use_chem_data = chem_data
        self._use_extra_tables = extra_tables
        self._use_classifiers = classifiers
        self._use_extra_classifiers = extra_classifiers
        self._query = QueryExecutor(0.22, 0.25) if query is None else query

    # Base URLs of the PubChem services queried by this class.
    # PUG REST: used for similarity searches and for the raw compound (structure) record.
    _pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    # PUG View: used for the per-compound "display" record.
    _pug_view = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
    # SDQ agent: used to download the external tables as CSV.
    # NOTE(review): the attribute is spelled `_sdg` although the URL path is `sdq`;
    # possibly a typo, but renaming requires updating `_external_table_url` too.
    _sdg = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
    # Classification service: used to download hierarchies (classifiers) for a compound.
    _classifications = "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi"
    # Link DB: used to download the link sets for a compound.
    _link_db = "https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi"

    def fetch_data(self, inchikey: str) -> Optional[PubchemData]:
        """
        Download the PubChem data for an InChI key, preferring its parent compound.

        The overall procedure (spread over the private helpers) is:

        1. Download HTML for the InChI key and scrape the CID.
        2. Download the "display" JSON data for the CID.
        3. Look for a Parent-type related compound. If it exists, download its
           display data instead.
        4. Download the structural data and append it.
        5. Download the external table CSVs and append them.
        6. Download the link sets and append them.
        7. Download the classifiers (hierarchies) and append them.
        8. Attach metadata about how the record was found.
        9. Return everything as one large JSON-like structure.

        Args:
            inchikey: The InChI key to look up.

        Returns:
            The data of the parent compound if one is listed, otherwise the data
            of the compound found for ``inchikey``.

        Raises:
            PubchemCompoundLookupError: If the CID cannot be scraped or an HTTP
                error occurs while downloading the record.
        """
        logger.info(f"Downloading PubChem data for {inchikey}")
        cid = self._scrape_cid(inchikey)
        try:
            data = self._fetch_data(cid, inchikey)
        except HTTPError as err:
            # chain explicitly so the HTTP failure is reported as the direct cause
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem compound (JSON) from cid {cid}, inchikey {inchikey}"
            ) from err
        return self._get_parent(cid, inchikey, data)

    def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]:
        """
        Run a PubChem similarity search and return the CIDs it finds.

        A search is submitted with a POST request, which yields a list key.
        That key is then polled until the response contains a CID list or
        roughly 5 seconds have elapsed.

        Args:
            inchi: Identifier inserted into the ``similarity/inchikey`` URL.
            min_tc: Value passed as the ``Threshold`` query parameter.

        Returns:
            The CIDs listed under ``IdentifierList.CID``.

        Raises:
            TimeoutError: If no CID list was returned within the polling window.
        """
        submit_url = f"{self._pug}/compound/similarity/inchikey/{inchi}/JSON?Threshold={min_tc}"
        submitted = orjson.loads(self._query(submit_url, method="post"))
        key = submitted["Waiting"]["ListKey"]
        started = time.monotonic()
        while time.monotonic() - started < 5:
            # No explicit sleep: per the original author's note, the query
            # callable is expected to wait between requests as needed.
            raw = self._query(f"{self._pug}/compound/listkey/{key}/cids/JSON")
            parsed = NestedDotDict(orjson.loads(raw))
            if parsed.get("IdentifierList.CID") is None:
                continue
            return frozenset(parsed.req_list_as("IdentifierList.CID", int))
        raise TimeoutError(f"Search for {inchi} using key {key} timed out")

    def _scrape_cid(self, inchikey: str) -> int:
        # This is awful
        # Every attempt to get the actual, correct, unique CID corresponding to the inchikey
        # failed with every proper PubChem API
        # We can't use <pug_view>/data/compound/<inchikey> -- we can only use a CID there
        # I found it with a PUG API
        # https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/GJSURZIOUXUGAL-UHFFFAOYSA-N/record/JSON
        # But that returns multiple results!!
        # There's no apparent way to find out which one is real
        # I tried then querying each found CID, getting the display data, and looking at their parents

        # Unfortunately, we end up with multiple contradictory parents
        # Plus, that's insanely slow -- we have to get the full JSON data for each parent
        # Every worse -- the PubChem API docs LIE!!
        # Using ?cids_type=parent DOES NOT GIVE THE PARENT compound
        # Ex: https://pubchem.ncbi.nlm.nih.gov/compound/656832
        # This is cocaine HCl, which has cocaine (446220) as a parent
        # https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/656832/JSON
        # gives 656832 back again
        # same thing when querying by inchikey
        # Ultimately, I found that I can get HTML containing the CID from an inchikey
        # From there, we'll just have to download its "display" data and get the parent, then download that data

        url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{inchikey}"
        pat = re.compile(
            r'<meta property="og:url" content="https://pubchem\.ncbi\.nlm\.nih\.gov/compound/(\d+)">'

        )
        try:
            html = self._query(url)
        except HTTPError:
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem compound (HTML) from inchikey {inchikey} [url: {url}]"
            )
        match = pat.search(html)
        if match is None:
            raise PubchemCompoundLookupError(
                f"Something is wrong with the HTML from {url}; og:url not found"
            )
        return int(match.group(1))

    def _get_parent(self, cid: int, inchikey: str, data: PubchemData) -> PubchemData:
        # guard with is not None: we're not caching, so don't do it twice
        if data.parent_or_none is None:
            return data
        try:
            return self._fetch_data(data.parent_or_none, inchikey)
        except HTTPError:
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem parent compound (JSON)"
                f"for cid {data.parent_or_none}, child cid {cid}, inchikey {inchikey}"
            )

    def _fetch_data(self, cid: int, inchikey: str) -> PubchemData:
        """
        Download everything for one CID and wrap it as ``PubchemData``.

        Wall-clock timestamps and a monotonic nanosecond timer are taken around
        the download and stored under the ``"meta"`` key. Every
        ``"DisplayControls"`` key is then removed from the whole structure.

        Args:
            cid: The PubChem compound ID to download.
            inchikey: The InChI key of the original lookup (recorded in the metadata).

        Returns:
            The assembled data wrapped in ``PubchemData``.
        """
        wall_start = datetime.now(timezone.utc).astimezone()
        nanos_start = time.monotonic_ns()
        payload = self._fetch_core_data(cid)
        nanos_end = time.monotonic_ns()
        wall_end = datetime.now(timezone.utc).astimezone()
        meta = self._get_metadata(inchikey, wall_start, wall_end, nanos_start, nanos_end)
        payload["meta"] = meta
        self._strip_by_key_in_place(payload, "DisplayControls")
        return PubchemData(NestedDotDict(payload))

    def _fetch_core_data(self, cid: int) -> dict:
        """
        Download every section for a CID and collect them in one dict.

        The sections are fetched in this order: display record, structure,
        external tables, link sets, classifications.

        Args:
            cid: The PubChem compound ID to download.

        Returns:
            A dict with the keys ``record``, ``structure``, ``external_tables``,
            ``link_sets`` and ``classifications``.
        """
        record = self._fetch_display_data(cid)
        structure = self._fetch_structure_data(cid)
        external_tables = self._fetch_external_tables(cid)
        link_sets = self._fetch_external_linksets(cid)
        classifications = self._fetch_hierarchies(cid)
        return {
            "record": record,
            "structure": structure,
            "external_tables": external_tables,
            "link_sets": link_sets,
            "classifications": classifications,
        }

    def _get_metadata(self, inchikey: str, started: datetime, finished: datetime, t0: int, t1: int):
class Foo:
    def some_method(self, x, y):
        return x + y;
        return dict(
            timestamp_fetch_started=started.isoformat(),
            timestamp_fetch_finished=finished.isoformat(),
            from_lookup=inchikey,
            fetch_nanos_taken=str(t1 - t0),
        )

    def _fetch_display_data(self, cid: int) -> Optional[NestedDotDict]:
        """
        Download the PUG View "display" record for a CID.

        Args:
            cid: The PubChem compound ID.

        Returns:
            The value under the ``"Record"`` key of the JSON response.
        """
        response = self._query_json(
            f"{self._pug_view}/data/compound/{cid}/JSON/?response_type=display"
        )
        return response["Record"]

    def _fetch_structure_data(self, cid: int) -> NestedDotDict:
        """
        Download the PUG REST compound record for a CID, if enabled.

        Args:
            cid: The PubChem compound ID.

        Returns:
            An empty ``NestedDotDict`` when chemical data is disabled; otherwise
            the first entry of ``"PC_Compounds"`` with its
            ``["structure"]["props"]`` entry deleted.
        """
        if self._use_chem_data:
            response = self._query_json(f"{self._pug}/compound/cid/{cid}/JSON")
            compound = response["PC_Compounds"][0]
            # redundant with props section in record
            # NOTE(review): confirm the compound record really nests `props`
            # under a `structure` key; if not, this raises.
            del compound["structure"]["props"]
            return compound
        return NestedDotDict({})

    def _fetch_external_tables(self, cid: int) -> Mapping[str, str]:
        """
        Download every enabled external table for a CID.

        Args:
            cid: The PubChem compound ID.

        Returns:
            A dict keyed by collection name (the *values* of ``_tables_to_use``)
            whose values are whatever ``_fetch_external_table`` returns.
        """
        tables = {}
        for collection in self._tables_to_use.values():
            tables[collection] = self._fetch_external_table(cid, collection)
        return tables

    def _fetch_external_linksets(self, cid: int) -> Mapping[str, str]:
        """
        Download every link set for a CID.

        Args:
            cid: The PubChem compound ID.

        Returns:
            A dict keyed by link-set type (the *values* of ``_linksets_to_use``)
            whose values are whatever ``_fetch_external_linkset`` returns.
        """
        linksets = {}
        for link_type in self._linksets_to_use.values():
            linksets[link_type] = self._fetch_external_linkset(cid, link_type)
        return linksets

    def _fetch_hierarchies(self, cid: int) -> NestedDotDict:
        """
        Download every enabled classification hierarchy for a CID.

        A hierarchy that fails with ``HTTPError``, ``KeyError`` or
        ``LookupError`` is logged at debug level and left out of the result.

        Args:
            cid: The PubChem compound ID.

        Returns:
            The fetched hierarchies keyed by hierarchy name, with every
            ``"ChildID"`` key removed.
        """
        found = {}
        for hname, hid in self._hierarchies_to_use.items():
            try:
                hierarchy = self._fetch_hierarchy(cid, hid)
            except (HTTPError, KeyError, LookupError) as e:
                logger.debug(f"No data for classifier {hid}, compound {cid}: {e}")
            else:
                found[hname] = hierarchy
        # "ChildID" lists all of the child nodes for each node.
        # Some of them are > 1000 items -- they're HUGE --
        # and we don't expect to need to navigate to children.
        self._strip_by_key_in_place(found, "ChildID")
        return NestedDotDict(found)

    def _fetch_external_table(self, cid: int, table: str) -> Sequence[dict]:
        url = self._external_table_url(cid, table)
        data = self._query(url)
        df: pd.DataFrame = pd.read_csv(io.StringIO(data))

        return list(df.T.to_dict().values())

    def _fetch_external_linkset(self, cid: int, table: str) -> NestedDotDict:
        """
        Download one link set for a CID from the link DB.

        Args:
            cid: The PubChem compound ID (sent as ``id_1``).
            table: The link-set type (sent as ``type``).

        Returns:
            The parsed JSON response.
        """
        raw = self._query(
            f"{self._link_db}?format=JSON&type={table}&operation=GetAllLinks&id_1={cid}"
        )
        parsed = orjson.loads(raw)
        return NestedDotDict(parsed)

    def _fetch_hierarchy(self, cid: int, hid: int) -> Sequence[dict]:
        """
        Download one classification hierarchy for a CID.

        Args:
            cid: The PubChem compound ID.
            hid: The hierarchy ID.

        Returns:
            The list found under ``"Hierarchies"`` in the response.

        Raises:
            LookupError: If that list is empty.
        """
        url = (
            f"{self._classifications}?format=json&hid={hid}"
            f"&search_uid_type=cid&search_uid={cid}"
            "&search_type=list&response_type=display"
        )
        response = orjson.loads(self._query(url))
        # underneath Hierarchies is a list of Hierarchy
        data: Sequence[dict] = response["Hierarchies"]
        logger.debug(f"Found data for classifier {hid}, compound {cid}")
        if len(data) == 0:
            raise LookupError(f"Failed getting hierarchy {hid}")
        return data

    @property
    def _tables_to_use(self) -> Mapping[str, str]:
        dct = {
            "drug:clinicaltrials.gov:clinical_trials": "clinicaltrials",
            "pharm:pubchem:reactions": "pathwayreaction",
            "uses:cpdat:uses": "cpdat",
            "tox:chemidplus:acute_effects": "chemidplus",
            "dis:ctd:associated_disorders_and_diseases": "ctd_chemical_disease",
            "lit:pubchem:depositor_provided_pubmed_citations": "pubmed",
            "bio:dgidb:drug_gene_interactions": "dgidb",
            "bio:ctd:chemical_gene_interactions": "ctdchemicalgene",
            "bio:drugbank:drugbank_interactions": "drugbank",
            "bio:drugbank:drug_drug_interactions": "drugbankddi",
            "bio:pubchem:bioassay_results": "bioactivity",
        }
        if self._use_extra_tables:
            dct.update(
                {
                    "patent:depositor_provided_patent_identifiers": "patent",
                    "bio:rcsb_pdb:protein_bound_3d_structures": "pdb",
                    "related:pubchem:related_compounds_with_annotation": "compound",
                }
            )
        return dct

    @property
    def _linksets_to_use(self) -> Mapping[str, str]:
        return {
            "lit:pubchem:chemical_cooccurrences_in_literature": "ChemicalNeighbor",
            "lit:pubchem:gene_cooccurrences_in_literature": "ChemicalGeneSymbolNeighbor",
            "lit:pubchem:disease_cooccurrences_in_literature": "ChemicalDiseaseNeighbor",
        }

    @property
    def _hierarchies_to_use(self) -> Mapping[str, int]:
        if not self._use_classifiers:
            return {}
        dct = {
            "MeSH Tree": 1,
            "ChEBI Ontology": 2,
            "WHO ATC Classification System": 79,
            "Guide to PHARMACOLOGY Target Classification": 92,
            "ChEMBL Target Tree": 87,
        }
        if self._use_extra_classifiers:
            dct.update(
                {
                    "KEGG: Phytochemical Compounds": 5,
                    "KEGG: Drug": 14,
                    "KEGG: USP": 15,
                    "KEGG: Major components of natural products": 69,
                    "KEGG: Target-based Classification of Drugs": 22,
                    "KEGG: OTC drugs": 25,
                    "KEGG: Drug Classes": 96,
                    "CAMEO Chemicals": 86,
                    "EPA CPDat Classification": 99,
                    "FDA Pharm Classes": 78,
                    "ChemIDplus": 84,
                }
            )
        return dct

    def _external_table_url(self, cid: int, collection: str) -> str:
        return (
            self._sdg
            + "?infmt=json"
            + "&outfmt=csv"
            + "&query={ download : * , collection : "
            + collection
            + " , where :{ ands :[{ cid : "
            + str(cid)
            + " }]}}"
        ).replace(" ", "%22")

    def _query_json(self, url: str) -> NestedDotDict:
        """
        Request a URL and parse the response as JSON.

        Args:
            url: The URL to request.

        Returns:
            The parsed response wrapped in ``NestedDotDict``.

        Raises:
            ValueError: If the parsed response contains a ``"Fault"`` key.
        """
        raw = self._query(url)
        data = NestedDotDict(orjson.loads(raw))
        if "Fault" not in data:
            return data
        # NOTE(review): `Code` and `Message` are looked up at the top level;
        # confirm they are not nested under `Fault` in the actual response.
        raise ValueError(f"Request failed ({data.get('Code')}) on {url}: {data.get('Message')}")

    def _strip_by_key_in_place(self, data: Union[dict, list], bad_key: str) -> None:
        if isinstance(data, list):
            for x in data:

                self._strip_by_key_in_place(x, bad_key)
        elif isinstance(data, dict):
            for k, v in list(data.items()):

                if k == bad_key:
                    del data[k]
                elif isinstance(v, (list, dict)):
                    self._strip_by_key_in_place(v, bad_key)


__all__ = ["QueryingPubchemApi"]


1			"""
2			PubChem querying API.
3			"""
4			from __future__ import annotations
5
6			import io
7			import re
8			import time
9			from datetime import datetime, timezone
10			from typing import FrozenSet, Mapping, Optional, Sequence, Union
11			from urllib.error import HTTPError
12
13			import orjson
			0 ignored issues – show introduced 2021-07-05 18:51 UTC by Report Bug Copy Issue Report Unable to import 'orjson' Loading history...
14			import pandas as pd
			0 ignored issues – show introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
15			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
16			from pocketutils.core.query_utils import QueryExecutor
			0 ignored issues – show introduced 2021-06-30 04:51 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.query_utils' Loading history...
17
18			from mandos import logger
19			from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
20			from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
21
22
23			class QueryingPubchemApi(PubchemApi):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
24			def __init__(
			0 ignored issues – show best-practice introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Too many arguments (6/5) Loading history...
25			self,
			0 ignored issues – show Coding Style introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
26			chem_data: bool = False,
			0 ignored issues – show Coding Style introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
27			extra_tables: bool = False,
			0 ignored issues – show Coding Style introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
28			classifiers: bool = False,
			0 ignored issues – show Coding Style introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
29			extra_classifiers: bool = False,
			0 ignored issues – show Coding Style introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...

dmyersturnbull / mandos

Push — dependabot/pip/pyarrow-4.0.1 ( ca09ce...b2836e )

QueryingPubchemApi.__init__() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

QueryingPubchemApi.__init__() A