Passed
Push — main ( 2e1b6b...3a0c28 )
by Douglas
02:06
created

QueryingPubchemApi._scrape_cid()   A

Complexity

Conditions 5

Size

Total Lines 39
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 17
nop 2
dl 0
loc 39
rs 9.0833
c 0
b 0
f 0
1
"""
2
PubChem querying API.
3
"""
4
from __future__ import annotations
5
6
import io
7
import time
8
from datetime import datetime, timezone
9
from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union
10
from urllib.error import HTTPError
11
12
import orjson
0 ignored issues
show
introduced by
Unable to import 'orjson'
Loading history...
13
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
14
import regex
0 ignored issues
show
introduced by
Unable to import 'regex'
Loading history...
15
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
16
from pocketutils.core.exceptions import (
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.exceptions'
Loading history...
17
    DataIntegrityError,
18
    DownloadError,
19
    LookupFailedError,
20
)
21
from pocketutils.core.query_utils import QueryExecutor
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.query_utils'
Loading history...
22
23
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
24
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
25
from mandos.model.settings import QUERY_EXECUTORS, SETTINGS
26
from mandos.model.utils.setup import logger
27
28
# Captures the numeric CID from the og:url <meta> tag of a PubChem compound HTML page.
_html_cid_pattern = regex.compile(
    r'<meta property="og:url" content="https://pubchem\.ncbi\.nlm\.nih\.gov/compound/(\d+)">',
    flags=regex.V1,
)
32
33
34
class QueryingPubchemApi(PubchemApi):
0 ignored issues
show
introduced by
Missing class docstring
Loading history...
35
    def __init__(
        self,
        chem_data: bool = True,
        extra_tables: bool = False,
        classifiers: bool = False,
        extra_classifiers: bool = False,
        executor: QueryExecutor = QUERY_EXECUTORS.pubchem,
    ):
        """
        Stores the download options and the query executor.

        Args:
            chem_data: Whether to download the PUG structure record
            extra_tables: Whether to add the extra entries to the external tables
            classifiers: Whether to download classification hierarchies
            extra_classifiers: Whether to add the extra entries to the hierarchies;
                               has no effect unless ``classifiers`` is also set
            executor: Callable that performs each HTTP query
        """
        self._executor = executor
        self._use_chem_data, self._use_extra_tables = chem_data, extra_tables
        self._use_classifiers, self._use_extra_classifiers = classifiers, extra_classifiers
48
49
    # Base URLs of the PubChem services queried by this class.
    # PUG REST: compound records, properties, and linked CIDs
    _pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    # PUG View: the "display" record of a compound
    _pug_view = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
    # SDQ agent: external tables, requested as CSV
    _sdg = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
    # Classification browser: hierarchies
    _classifications = "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi"
    # Link DB: link sets
    _link_db = "https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi"
54
55
    def find_inchikey(self, cid: int) -> str:
        """
        Returns the InChI key for a PubChem CID.

        Uses the property query rather than downloading the full compound data.
        """
        return self.fetch_properties(cid)["InChIKey"]
59
60
    def find_id(self, inchikey: str) -> Optional[int]:
        """
        Returns the CID for an InChI key, or None if the lookup failed.

        The full data is downloaded, and the CID is read from the result.
        """
        # We have to scrape to get the parent anyway, so just download everything.
        # TODO: there's a faster way
        try:
            found = self.fetch_data(inchikey)
            return found.cid
        except PubchemCompoundLookupError:
            logger.debug(f"Could not find pubchem ID for {inchikey}", exc_info=True)
            return None
69
70
    def fetch_properties(self, cid: int) -> Mapping[str, Any]:
        """
        Downloads the properties of a compound from PUG REST.

        Args:
            cid: The PubChem compound ID

        Returns:
            A dict from each property label (``urn.label``) to its value:
            the first of ``ival``, ``fval``, and ``sval`` that is present, else None.
            Properties with no label or no value are dropped.
            If a label occurs more than once, the last occurrence wins.

        Raises:
            PubchemCompoundLookupError: If the query failed with an HTTP error
        """
        url = f"{self._pug}/compound/cid/{cid}/JSON"
        try:
            matches: NestedDotDict = self._query_json(url)
        except HTTPError as err:
            # chain the cause so the HTTP status is not lost
            raise PubchemCompoundLookupError(f"Failed finding pubchem compound {cid}") from err
        labeled = {
            NestedDotDict(prop).get("urn.label"): prop.get("value")
            for prop in matches["PC_Compounds"][0]["props"]
        }

        def _get_val(value):
            # a value is stored under exactly one type-specific key; try each in turn
            value = NestedDotDict(value)
            for type_key in ("ival", "fval", "sval"):
                if type_key in value.keys():
                    return value[type_key]
            return None

        props = {
            label: _get_val(value)
            for label, value in labeled.items()
            if label is not None and value is not None
        }
        logger.debug(f"DLed properties for {cid}")
        return props
89
90
    def fetch_data(self, inchikey: Union[str, int]) -> PubchemData:
        """
        Downloads all of the data for a compound, resolving to its parent if it has one.

        The steps are:
            1. Download HTML for the InChI key and scrape the CID
               (or, given a CID, look up its InChI key)
            2. Download the "display" JSON data from the CID
            3. Look for a Parent-type related compound. If it exists, download its data instead
            4. Download the structural data and append it
            5. Download the external table CSVs and append them
            6. Download the link sets and append them
            7. Download the classifiers (hierarchies) and append them
            8. Attach metadata about how we found this
            9. Return the result as one large JSON structure

        Args:
            inchikey: Either an InChI key (str) or a PubChem CID (int)

        Returns:
            The data for the parent compound if one was found, else for the compound itself
        """
        logger.info(f"Downloading PubChem data for {inchikey}")
        if isinstance(inchikey, int):
            cid = inchikey
            # note: this might not be the parent
            # that's ok -- we're about to fix that
            inchikey = self.find_inchikey(cid)
            logger.debug(f"Matched CID {cid} to {inchikey}")
        else:
            cid = self._scrape_cid(inchikey)
            logger.debug(f"Matched inchikey {inchikey} to CID {cid} (scraped)")
        # records every (cid, inchikey) downloaded so far; filled in by _fetch_data
        stack: List[Tuple[int, str]] = []
        data = self._fetch_data(cid, inchikey, stack)
        logger.debug(f"DLed raw data for {cid}/{inchikey}")
        data = self._get_parent(cid, inchikey, data, stack)
        logger.debug(f"DLed PubChem compound {cid}")
        return data
118
119
    def _scrape_cid(self, inchikey: str) -> int:
        """
        Finds the CID for an InChI key by scraping the compound's HTML page.

        The download is retried up to ``SETTINGS.pubchem_n_tries`` times
        when the connection is aborted, and stops at the first success.

        Args:
            inchikey: The InChI key to look up

        Returns:
            The CID found in the page's og:url meta tag

        Raises:
            PubchemCompoundLookupError: If the page request failed with an HTTP error
            DownloadError: If no try succeeded in downloading the page
            DataIntegrityError: If the page does not contain the expected og:url meta tag
        """
        # Why we scrape HTML:
        # Every attempt to get the actual, correct, unique CID corresponding to the inchikey
        # failed with every proper PubChem API.
        # We can't use <pug_view>/data/compound/<inchikey> -- we can only use a CID there.
        # A PUG query does accept an inchikey:
        # https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/GJSURZIOUXUGAL-UHFFFAOYSA-N/record/JSON
        # But that returns multiple results, with no apparent way to find out which one is real.
        # Querying each found CID, getting the display data, and looking at their parents
        # gives multiple contradictory parents.
        # Plus, that's very slow -- we have to get the full JSON data for each parent.
        # Even worse, using ?cids_type=parent DOES NOT GIVE THE PARENT compound.
        # Ex: https://pubchem.ncbi.nlm.nih.gov/compound/656832
        # This is cocaine HCl, which has cocaine (446220) as a parent, but
        # https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/656832/JSON
        # gives 656832 back again; same thing when querying by inchikey.
        # Ultimately, the HTML page for an inchikey does contain the CID.
        # From there, we download its "display" data, get the parent, then download that data.
        url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{inchikey}"
        html = None
        last_abort = None
        for _ in range(SETTINGS.pubchem_n_tries):
            try:
                html = self._query(url)
            except ConnectionAbortedError as err:
                last_abort = err
                logger.warning(f"Connection aborted for {inchikey} [url: {url}]", exc_info=True)
                continue
            except HTTPError as err:
                raise PubchemCompoundLookupError(
                    f"Failed finding pubchem compound (HTML) from {inchikey} [url: {url}]"
                ) from err
            # success: do not query again
            break
        if html is None:
            # every try was aborted, or pubchem_n_tries is < 1
            raise DownloadError(
                f"Failed downloading HTML for {inchikey}"
                f" after {SETTINGS.pubchem_n_tries} tries [url: {url}]"
            ) from last_abort
        match = _html_cid_pattern.search(html)
        if match is None:
            raise DataIntegrityError(
                f"Something is wrong with the HTML from {url}; og:url not found"
            )
        return int(match.group(1))
158
159
    def _get_parent(
        self, cid: int, inchikey: str, data: PubchemData, stack: List[Tuple[int, str]]
    ) -> PubchemData:
        """
        Downloads the data for the parent compound, if ``data`` lists one.

        Only one hop is followed: the parent's own parent is not checked.
        The parent's data is fetched with the child's ``inchikey``.

        Args:
            cid: The CID of the compound that ``data`` describes
            inchikey: The InChI key that was originally looked up
            data: The already-downloaded data for ``cid``
            stack: The (cid, inchikey) pairs downloaded so far

        Returns:
            ``data`` itself if it has no parent, else the newly downloaded parent data

        Raises:
            PubchemCompoundLookupError: If downloading the parent failed
        """
        parent_cid = data.parent_or_none
        if parent_cid is None:
            logger.info(f"{cid}/{inchikey} is its own parent")
            return data
        logger.info(f"{cid}/{inchikey} has parent {parent_cid}")
        # drops only this local reference to the child's data
        del data
        try:
            return self._fetch_data(parent_cid, inchikey, stack)
        except HTTPError as err:
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem parent compound (JSON) "
                f"for cid {parent_cid}, child cid {cid}, inchikey {inchikey}"
            ) from err
176
177
    def _fetch_data(self, cid: int, inchikey: str, stack: List[Tuple[int, str]]) -> PubchemData:
        """
        Downloads every section for one CID and wraps the result as ``PubchemData``.

        Also attaches a ``meta`` section with timing information,
        removes every ``DisplayControls`` key, and appends ``(cid, inchikey)`` to ``stack``.

        Args:
            cid: The CID to download
            inchikey: The InChI key to record in the metadata and on the stack
            stack: The (cid, inchikey) pairs downloaded so far; modified in place

        Raises:
            PubchemCompoundLookupError: If a query failed with an HTTP error
        """
        when_started = datetime.now(timezone.utc).astimezone()
        started_at = time.monotonic()
        try:
            data = self._fetch_core_data(cid, stack)
        except HTTPError as err:
            # chain the cause so the HTTP status is not lost
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem compound (JSON) from cid {cid}, inchikey {inchikey}"
            ) from err
        finished_at = time.monotonic()
        when_finished = datetime.now(timezone.utc).astimezone()
        logger.trace(f"Downloaded {cid} in {finished_at-started_at} s")
        data["meta"] = self._get_metadata(
            inchikey, when_started, when_finished, started_at, finished_at
        )
        self._strip_by_key_in_place(data, "DisplayControls")
        stack.append((cid, inchikey))
        logger.trace(f"Stack: {stack}")
        return PubchemData(NestedDotDict(data))
194
195
    def _fetch_core_data(self, cid: int, stack: List[Tuple[int, str]]) -> dict:
        """
        Downloads each section for ``cid`` and collects them in a dict keyed by section name.

        The sections are fetched in the order they are listed.
        """
        return {
            "record": self._fetch_display_data(cid),
            "linked_records": self._get_linked_records(cid, stack),
            "structure": self._fetch_structure_data(cid),
            "external_tables": self._fetch_external_tables(cid),
            "link_sets": self._fetch_external_linksets(cid),
            "classifications": self._fetch_hierarchies(cid),
            "properties": NestedDotDict(self.fetch_properties(cid)),
        }
205
206
    def _get_metadata(self, inchikey: str, started: datetime, finished: datetime, t0: int, t1: int):
0 ignored issues
show
Coding Style Naming introduced by
Argument name "t1" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
Coding Style Naming introduced by
Argument name "t0" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
best-practice introduced by
Too many arguments (6/5)
Loading history...
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
207
        return dict(
208
            timestamp_fetch_started=started.isoformat(),
209
            timestamp_fetch_finished=finished.isoformat(),
210
            from_lookup=inchikey,
211
            fetch_nanos_taken=str(t1 - t0),
212
        )
213
214
    def _get_linked_records(self, cid: int, stack: List[Tuple[int, str]]) -> NestedDotDict:
        """
        Lists the CIDs that PUG reports for ``cids_type=same_parent_stereo``,
        followed by the CIDs already on ``stack``, along with the InChI keys on ``stack``.
        """
        url = f"{self._pug}/compound/cid/{cid}/cids/JSON?cids_type=same_parent_stereo"
        identifiers = self._query_json(url).sub("IdentifierList")
        logger.debug(f"DLed {len(identifiers.get('CID', []))} linked records for {cid}")
        seen_cids = [seen_cid for seen_cid, _ in stack]
        seen_keys = [seen_key for _, seen_key in stack]
        results = {
            "CID": [*identifiers.get("CID", []), *seen_cids],
            "inchikey": seen_keys,
        }
        logger.debug(f"Linked records are: {results}")
        return NestedDotDict(results)
224
225
    def _fetch_display_data(self, cid: int) -> Optional[NestedDotDict]:
        """
        Downloads the PUG View "display" JSON for ``cid`` and returns its ``Record`` section.
        """
        response = self._query_json(
            f"{self._pug_view}/data/compound/{cid}/JSON/?response_type=display"
        )
        record = response["Record"]
        logger.debug(f"DLed display data for {cid}")
        return record
230
231
    def _fetch_structure_data(self, cid: int) -> NestedDotDict:
        """
        Downloads the first PUG compound record for ``cid``, minus its ``props`` section.

        Returns an empty ``NestedDotDict`` without querying if chem data is disabled.
        """
        if self._use_chem_data:
            response = self._query_json(f"{self._pug}/compound/cid/{cid}/JSON")
            compound = response["PC_Compounds"][0]
            # redundant with props section in record
            del compound["props"]
            logger.debug(f"DLed structure for {cid}")
            return compound
        return NestedDotDict({})
239
240
    def _fetch_external_tables(self, cid: int) -> Mapping[str, str]:
        """
        Downloads every external table that is enabled.

        The result is keyed by collection name (the values of ``_tables_to_use``).
        """
        tables = {}
        for collection in self._tables_to_use.values():
            tables[collection] = self._fetch_external_table(cid, collection)
        logger.debug(f"DLed {len(self._tables_to_use)} external tables for {cid}")
        return tables
247
248
    def _fetch_external_linksets(self, cid: int) -> Mapping[str, str]:
        """
        Downloads every link set.

        The result is keyed by link type (the values of ``_linksets_to_use``).
        """
        linksets = {}
        for link_type in self._linksets_to_use.values():
            linksets[link_type] = self._fetch_external_linkset(cid, link_type)
        logger.debug(f"DLed {len(self._linksets_to_use)} external linksets for {cid}")
        return linksets
255
256
    def _fetch_hierarchies(self, cid: int) -> NestedDotDict:
        """
        Downloads every enabled classification hierarchy, keyed by hierarchy name.

        A hierarchy that fails with an HTTP or lookup error is logged and left out.
        """
        found = {}
        for name, hierarchy_id in self._hierarchies_to_use.items():
            try:
                hierarchy = self._fetch_hierarchy(cid, name, hierarchy_id)
            except (HTTPError, KeyError, LookupError) as err:
                logger.debug(f"No data for classifier {hierarchy_id}, compound {cid}: {err}")
            else:
                found[name] = hierarchy
        # ChildID lists all of the child nodes for each node.
        # Some of them are > 1000 items, and we don't expect to need to navigate to children.
        self._strip_by_key_in_place(found, "ChildID")
        logger.debug(f"DLed {len(self._hierarchies_to_use)} hierarchies for {cid}")
        return NestedDotDict(found)
269
270
    def _fetch_external_table(self, cid: int, table: str) -> Sequence[dict]:
        """
        Downloads one external table as CSV and returns its rows as a list of dicts.
        """
        csv_text = self._query(self._external_table_url(cid, table))
        frame: pd.DataFrame = pd.read_csv(io.StringIO(csv_text))
        frame = frame.reset_index()
        logger.debug(f"DLed table {table} with {len(frame)} rows for {cid}")
        rows = frame.to_dict(orient="records")
        return list(rows)
276
277
    def _fetch_external_linkset(self, cid: int, table: str) -> NestedDotDict:
        """
        Downloads all links of type ``table`` for ``cid`` from the link DB and parses the JSON.
        """
        raw = self._query(
            f"{self._link_db}?format=JSON&type={table}&operation=GetAllLinks&id_1={cid}"
        )
        logger.debug(f"DLed linkset {table} rows for {cid}")
        parsed = orjson.loads(raw)
        return NestedDotDict(parsed)
282
283
    def _fetch_hierarchy(self, cid: int, hname: str, hid: int) -> Sequence[dict]:
        """
        Downloads the entries of hierarchy ``hid`` for ``cid``.

        Raises:
            LookupFailedError: If the returned ``Hierarchies`` list is empty
        """
        url = (
            f"{self._classifications}?format=json&hid={hid}"
            f"&search_uid_type=cid&search_uid={cid}"
            "&search_type=list&response_type=display"
        )
        response = orjson.loads(self._query(url))
        # underneath Hierarchies is a list of Hierarchy
        hierarchies: Sequence[dict] = response["Hierarchies"]
        logger.debug(f"Found data for classifier {hid}, compound {cid}")
        if len(hierarchies) == 0:
            raise LookupFailedError(f"Failed getting hierarchy {hid}")
        logger.debug(f"DLed hierarchy {hname} ({hid}) for {cid}")
        return hierarchies
292
293
    @property
    def _tables_to_use(self) -> Mapping[str, str]:
        """
        Maps a descriptive name to the collection name of each external table to download.

        Three more tables are appended when extra tables are enabled.
        """
        tables = {
            "drug:clinicaltrials.gov:clinical_trials": "clinicaltrials",
            "pharm:pubchem:reactions": "pathwayreaction",
            "uses:cpdat:uses": "cpdat",
            "tox:chemidplus:acute_effects": "chemidplus",
            "dis:ctd:associated_disorders_and_diseases": "ctd_chemical_disease",
            "lit:pubchem:depositor_provided_pubmed_citations": "pubmed",
            "bio:dgidb:drug_gene_interactions": "dgidb",
            "bio:ctd:chemical_gene_interactions": "ctdchemicalgene",
            "bio:drugbank:drugbank_interactions": "drugbank",
            "bio:drugbank:drug_drug_interactions": "drugbankddi",
            "bio:pubchem:bioassay_results": "bioactivity",
        }
        if not self._use_extra_tables:
            return tables
        return {
            **tables,
            "patent:depositor_provided_patent_identifiers": "patent",
            "bio:rcsb_pdb:protein_bound_3d_structures": "pdb",
            "related:pubchem:related_compounds_with_annotation": "compound",
        }
317
318
    @property
    def _linksets_to_use(self) -> Mapping[str, str]:
        """
        Maps a descriptive name to the link DB type of each link set to download.
        """
        linksets = {
            "lit:pubchem:chemical_cooccurrences_in_literature": "ChemicalNeighbor",
            "lit:pubchem:gene_cooccurrences_in_literature": "ChemicalGeneSymbolNeighbor",
            "lit:pubchem:disease_cooccurrences_in_literature": "ChemicalDiseaseNeighbor",
        }
        return linksets
325
326
    @property
    def _hierarchies_to_use(self) -> Mapping[str, int]:
        """
        Maps the name of each classification hierarchy to download to its hierarchy ID (hid).

        Empty when classifiers are disabled; extended when extra classifiers are enabled.
        """
        if not self._use_classifiers:
            return {}
        hierarchies = {
            "MeSH Tree": 1,
            "ChEBI Ontology": 2,
            "WHO ATC Classification System": 79,
            "Guide to PHARMACOLOGY Target Classification": 92,
            "ChEMBL Target Tree": 87,
        }
        if not self._use_extra_classifiers:
            return hierarchies
        return {
            **hierarchies,
            "KEGG: Phytochemical Compounds": 5,
            "KEGG: Drug": 14,
            "KEGG: USP": 15,
            "KEGG: Major components of natural products": 69,
            "KEGG: Target-based Classification of Drugs": 22,
            "KEGG: OTC drugs": 25,
            "KEGG: Drug Classes": 96,
            "CAMEO Chemicals": 86,
            "EPA CPDat Classification": 99,
            "FDA Pharm Classes": 78,
            "ChemIDplus": 84,
        }
354
355
    def _external_table_url(self, cid: int, collection: str) -> str:
        """
        Builds the SDQ URL that downloads ``collection`` for ``cid`` as CSV.

        The query is written with spaces where the double quotes belong;
        every space in the finished URL is then replaced with ``%22``.
        """
        pieces = (
            self._sdg,
            "?infmt=json",
            "&outfmt=csv",
            "&query={ download : * , collection : ",
            collection,
            " , where :{ ands :[{ cid : ",
            str(cid),
            " }]}}",
        )
        return "".join(pieces).replace(" ", "%22")
366
367
    def _query_json(self, url: str) -> NestedDotDict:
        """
        Queries ``url`` and parses the response as JSON.

        Raises:
            DownloadError: If the parsed response has a top-level ``Fault`` key
        """
        parsed = NestedDotDict(orjson.loads(self._query(url)))
        if "Fault" in parsed:
            code, message = parsed.get("Code"), parsed.get("Message")
            raise DownloadError(f"PubChem query failed ({code}) on {url}: {message}")
        n_bytes, n_elements = parsed.n_bytes_total(), parsed.n_elements_total()
        logger.trace(f"Fetched JSON has {n_bytes} bytes and {n_elements} elements")
        return parsed
378
379
    def _query(self, url: str):
        """
        Runs the executor on ``url``, logs the size and timing, and returns the response.

        Args:
            url: The URL to query

        Returns:
            Whatever the executor returned for ``url``
        """
        data = self._executor(url)
        taken = self._executor.last_time_taken
        wait_secs, query_secs = taken.wait.total_seconds(), taken.query.total_seconds()
        # len * 8 / 1024, logged as "kb"
        size = int(len(data) * 8 / 1024)
        # ".1f" gives one decimal place; a bare ".1" means one significant digit,
        # which renders e.g. 12.3 as "1e+01"
        logger.trace(
            f"Queried {size} kb from {url} in {query_secs:.1f} s with {wait_secs:.1f} s of wait"
        )
        return data
386
387
    def _strip_by_key_in_place(self, data: Union[dict, list], bad_key: str) -> None:
        """
        Recursively deletes every dict entry whose key equals ``bad_key``.

        Descends through nested lists and dicts, and modifies ``data`` in place.
        Anything that is neither a list nor a dict is left alone.
        """
        if isinstance(data, list):
            for item in data:
                self._strip_by_key_in_place(item, bad_key)
            return
        if not isinstance(data, dict):
            return
        # iterate over a snapshot, because entries are deleted along the way
        for key, value in list(data.items()):
            if key == bad_key:
                del data[key]
                continue
            if isinstance(value, (list, dict)):
                self._strip_by_key_in_place(value, bad_key)
397
398
399
__all__ = ["QueryingPubchemApi"]
400