Passed
Push — dependabot/pip/selenium-4.0.0 ( 707494...3d45c5 )
by
unknown
02:02
created

QueryingPubchemApi._query()   A

Complexity

Conditions 1

Size

Total Lines 7
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 7
nop 2
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
"""
2
PubChem querying API.
3
"""
4
from __future__ import annotations
5
6
import io
7
import time
8
from datetime import datetime, timezone
9
from typing import Any, List, Mapping, Optional, Sequence, Tuple, Union
10
from urllib.error import HTTPError
11
12
import orjson
0 ignored issues
show
introduced by
Unable to import 'orjson'
Loading history...
13
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
14
import regex
0 ignored issues
show
introduced by
Unable to import 'regex'
Loading history...
15
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
16
from pocketutils.core.exceptions import (
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.exceptions'
Loading history...
17
    DataIntegrityError,
18
    DownloadError,
19
    LookupFailedError,
20
)
21
from pocketutils.core.query_utils import QueryExecutor
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.query_utils'
Loading history...
22
23
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
24
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
25
from mandos.model.settings import QUERY_EXECUTORS, SETTINGS
26
from mandos.model.utils.setup import logger
27
28
29
class QueryingPubchemApi(PubchemApi):
0 ignored issues
show
introduced by
Missing class docstring
Loading history...
30
    def __init__(
        self,
        chem_data: bool = True,
        extra_tables: bool = False,
        classifiers: bool = False,
        extra_classifiers: bool = False,
        executor: QueryExecutor = QUERY_EXECUTORS.pubchem,
    ):
        """
        Stores the download options and the executor used for every request.

        Args:
            chem_data: If False, ``_fetch_structure_data`` returns an empty record
            extra_tables: Adds three more entries to ``_tables_to_use``
            classifiers: If False, ``_hierarchies_to_use`` is empty
            extra_classifiers: Adds more entries to ``_hierarchies_to_use``
                               (only consulted when ``classifiers`` is set)
            executor: Callable that performs the request for a URL
        """
        self._executor = executor
        # flags consulted lazily by the ``_*_to_use`` properties and fetchers
        self._use_chem_data = chem_data
        self._use_extra_tables = extra_tables
        self._use_classifiers = classifiers
        self._use_extra_classifiers = extra_classifiers
43
44
    _pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
45
    _pug_view = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
46
    _sdg = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
47
    _classifications = "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi"
48
    _link_db = "https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi"
49
50
    def find_inchikey(self, cid: int) -> str:
        """
        Returns the ``InChIKey`` property of the compound with ID ``cid``.

        Goes through :meth:`fetch_properties` rather than the full data download.
        """
        return self.fetch_properties(cid)["InChIKey"]
54
55
    def find_id(self, inchikey: str) -> Optional[int]:
        """
        Returns the compound ID for ``inchikey``, or ``None`` if the lookup failed.

        This downloads the complete data via :meth:`fetch_data` and reads its ``cid``.
        """
        # We have to scrape to get the parent anyway, so just download everything.
        # TODO: there's a faster way
        try:
            found = self.fetch_data(inchikey)
            return found.cid
        except PubchemCompoundLookupError:
            logger.debug(f"Could not find pubchem ID for {inchikey}", exc_info=True)
            return None
64
65
    def fetch_properties(self, cid: int) -> Mapping[str, Any]:
        """
        Downloads the ``props`` section of a compound's PUG record.

        Args:
            cid: The PubChem compound ID

        Returns:
            A dict mapping each property's ``urn.label`` to the first of its
            ``ival``, ``fval``, or ``sval`` values that is present
            (``None`` if the value has none of those).
            Properties without a label or without a value are dropped.

        Raises:
            PubchemCompoundLookupError: If the request raised an ``HTTPError``
        """
        url = f"{self._pug}/compound/cid/{cid}/JSON"
        try:
            matches: NestedDotDict = self._query_json(url)
        except HTTPError as err:
            # chain explicitly so the HTTP failure is kept as the cause
            raise PubchemCompoundLookupError(f"Failed finding pubchem compound {cid}") from err
        labeled = {
            NestedDotDict(p).get("urn.label"): p.get("value")
            for p in matches["PC_Compounds"][0]["props"]
        }

        def _get_val(value) -> Any:
            # a value holds its payload under one of these type-specific keys
            value = NestedDotDict(value)
            for kind in ("ival", "fval", "sval"):
                if kind in value.keys():
                    return value[kind]
            return None

        props = {
            label: _get_val(value)
            for label, value in labeled.items()
            if label is not None and value is not None
        }
        logger.debug(f"DLed properties for {cid}")
        return props
84
85
    def fetch_data(self, inchikey: Union[str, int]) -> PubchemData:
        """
        Downloads the data for a compound, replaced by its parent's data if it has one.

        Args:
            inchikey: Either an InChIKey (str) or a PubChem compound ID (int)

        Returns:
            The data from :meth:`_fetch_data` for the compound, or for its parent
            when :meth:`_get_parent` finds one
        """
        # The steps, as originally laid out:
        # 1. Download HTML for the InChI key and scrape the CID
        # 2. Download the "display" JSON data from the CID
        # 3. Look for a Parent-type related compound. If it exists, download its display data
        # 4. Download the structural data and append it
        # 5. Download the external table CSVs and append them
        # 6. Download the link sets and append them
        # 7. Download the classifiers (hierarchies) and append them
        # 8. Attach metadata about how we found this.
        # 9. Return the result as one massive JSON struct.
        logger.info(f"Downloading PubChem data for {inchikey}")
        if isinstance(inchikey, int):
            cid = inchikey
            # note: this might not be the parent
            # that's ok -- we're about to fix that
            inchikey = self.find_inchikey(cid)
            logger.debug(f"Matched CID {cid} to {inchikey}")
        else:
            cid = self._scrape_cid(inchikey)
            logger.debug(f"Matched inchikey {inchikey} to CID {cid} (scraped)")
        # records every (cid, inchikey) fetched so far; filled by _fetch_data
        stack = []
        data = self._fetch_data(cid, inchikey, stack)
        logger.debug(f"DLed raw data for {cid}/{inchikey}")
        data = self._get_parent(cid, inchikey, data, stack)
        logger.debug(f"DLed PubChem compound {cid}")
        return data
113
114
    def _scrape_cid(self, inchikey: str) -> int:
        """
        Scrapes the compound ID from the PubChem HTML page for an InChIKey.

        The request is retried up to ``SETTINGS.pubchem_n_tries`` times when the
        connection is aborted, stopping at the first successful download.

        Args:
            inchikey: The InChIKey to look up

        Returns:
            The CID found in the page's ``og:url`` meta tag

        Raises:
            PubchemCompoundLookupError: If the request raised an ``HTTPError``
            DownloadError: If no attempt produced a page
            DataIntegrityError: If the page has no matching ``og:url`` meta tag
        """
        # Why scrape? Every attempt to get the actual, unique CID for an inchikey
        # failed with the proper PubChem APIs:
        # - <pug_view>/data/compound/<inchikey> does not work -- it only accepts a CID
        # - <pug>/compound/CID/<inchikey>/record/JSON returns multiple results,
        #   with no apparent way to tell which one is real
        # - Querying each found CID for its parent gives contradictory parents,
        #   and is very slow (full JSON per candidate)
        # - Even worse, ?cids_type=parent does NOT give the parent compound.
        #   Ex: https://pubchem.ncbi.nlm.nih.gov/compound/656832 (cocaine HCl) has
        #   cocaine (446220) as a parent, but
        #   https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/656832/JSON
        #   gives 656832 back again (same when querying by inchikey)
        # The HTML page does contain the CID, so we take it from there; the caller
        # then downloads the "display" data, finds the parent, and downloads that.
        url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{inchikey}"
        pat = regex.compile(
            r'<meta property="og:url" content="https://pubchem\.ncbi\.nlm\.nih\.gov/compound/(\d+)">',
            flags=regex.V1,
        )
        html = None
        for _ in range(SETTINGS.pubchem_n_tries):
            try:
                html = self._query(url)
            except ConnectionAbortedError:
                logger.warning(f"Connection aborted for {inchikey} [url: {url}]", exc_info=True)
                continue
            except HTTPError as err:
                raise PubchemCompoundLookupError(
                    f"Failed finding pubchem compound (HTML) from {inchikey} [url: {url}]"
                ) from err
            # success: do not download the same page again
            break
        if html is None:
            # previously this fell through to an unbound ``html`` variable
            raise DownloadError(
                f"No response for {inchikey} after {SETTINGS.pubchem_n_tries} tries [url: {url}]"
            )
        match = pat.search(html)
        if match is None:
            raise DataIntegrityError(
                f"Something is wrong with the HTML from {url}; og:url not found"
            )
        return int(match.group(1))
157
158
    def _get_parent(
        self, cid: int, inchikey: str, data: PubchemData, stack: List[Tuple[int, str]]
    ) -> PubchemData:
        """
        Returns the parent compound's data, or ``data`` itself if there is no parent.

        Args:
            cid: The ID of the compound that ``data`` describes
            inchikey: The InChIKey used for the lookup; passed on unchanged
            data: The already-downloaded data for ``cid``
            stack: The (cid, inchikey) pairs fetched so far; passed to ``_fetch_data``

        Raises:
            PubchemCompoundLookupError: If downloading the parent failed
        """
        parent = data.parent_or_none
        # guard with is None: we're not caching, so don't download the same record twice
        if parent is None:
            logger.info(f"{cid}/{inchikey} is its own parent")
            return data
        try:
            logger.info(f"{cid}/{inchikey} has parent {parent}")
            # drops only this frame's reference to the child's data
            del data
            return self._fetch_data(parent, inchikey, stack)
        except HTTPError as err:
            # the trailing space was missing, giving "(JSON)for cid ..."
            raise PubchemCompoundLookupError(
                "Failed finding pubchem parent compound (JSON) "
                f"for cid {parent}, child cid {cid}, inchikey {inchikey}"
            ) from err
175
176
    def _fetch_data(self, cid: int, inchikey: str, stack: List[Tuple[int, str]]) -> PubchemData:
        """
        Downloads all sections for one CID and wraps them with fetch metadata.

        Args:
            cid: The compound ID to download
            inchikey: The InChIKey the lookup started from; stored in the metadata
            stack: The (cid, inchikey) pairs fetched so far; this pair is appended
                   after a successful download

        Raises:
            PubchemCompoundLookupError: If the download raised an ``HTTPError``
        """
        when_started = datetime.now(timezone.utc).astimezone()
        t0 = time.monotonic()
        try:
            data = self._fetch_core_data(cid, stack)
        except HTTPError as err:
            # chain explicitly so the HTTP failure is kept as the cause
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem compound (JSON) from cid {cid}, inchikey {inchikey}"
            ) from err
        t1 = time.monotonic()
        when_finished = datetime.now(timezone.utc).astimezone()
        logger.trace(f"Downloaded {cid} in {t1-t0} s")
        data["meta"] = self._get_metadata(inchikey, when_started, when_finished, t0, t1)
        # remove every "DisplayControls" entry, at any depth
        self._strip_by_key_in_place(data, "DisplayControls")
        stack.append((cid, inchikey))
        logger.trace(f"Stack: {stack}")
        return PubchemData(NestedDotDict(data))
193
194
    def _fetch_core_data(self, cid: int, stack: List[Tuple[int, str]]) -> dict:
        """
        Downloads every section for ``cid`` and returns them keyed by section name.

        The sections are fetched in the order they are listed below.
        """
        return {
            "record": self._fetch_display_data(cid),
            "linked_records": self._get_linked_records(cid, stack),
            "structure": self._fetch_structure_data(cid),
            "external_tables": self._fetch_external_tables(cid),
            "link_sets": self._fetch_external_linksets(cid),
            "classifications": self._fetch_hierarchies(cid),
            "properties": NestedDotDict(self.fetch_properties(cid)),
        }
204
205
    def _get_metadata(self, inchikey: str, started: datetime, finished: datetime, t0: int, t1: int):
0 ignored issues
show
Coding Style Naming introduced by
Argument name "t0" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
Coding Style Naming introduced by
Argument name "t1" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
best-practice introduced by
Too many arguments (6/5)
Loading history...
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
206
        return dict(
207
            timestamp_fetch_started=started.isoformat(),
208
            timestamp_fetch_finished=finished.isoformat(),
209
            from_lookup=inchikey,
210
            fetch_nanos_taken=str(t1 - t0),
211
        )
212
213
    def _get_linked_records(self, cid: int, stack: List[Tuple[int, str]]) -> NestedDotDict:
        """
        Lists the CIDs PubChem reports under ``same_parent_stereo`` for ``cid``,
        followed by the CIDs already on ``stack``, plus the inchikeys on ``stack``.
        """
        url = f"{self._pug}/compound/cid/{cid}/cids/JSON?cids_type=same_parent_stereo"
        response = self._query_json(url)
        data = response.sub("IdentifierList")
        logger.debug(f"DLed {len(data.get('CID', []))} linked records for {cid}")
        stack_cids = [seen_cid for seen_cid, _ in stack]
        stack_keys = [seen_key for _, seen_key in stack]
        results = {
            "CID": [*data.get("CID", []), *stack_cids],
            "inchikey": stack_keys,
        }
        logger.debug(f"Linked records are: {results}")
        return NestedDotDict(results)
223
224
    def _fetch_display_data(self, cid: int) -> Optional[NestedDotDict]:
        """
        Downloads the PUG View "display" JSON for ``cid`` and returns its ``Record``.
        """
        url = f"{self._pug_view}/data/compound/{cid}/JSON/?response_type=display"
        response = self._query_json(url)
        record = response["Record"]
        logger.debug(f"DLed display data for {cid}")
        return record
229
230
    def _fetch_structure_data(self, cid: int) -> NestedDotDict:
        """
        Downloads the first ``PC_Compounds`` entry for ``cid``, minus its ``props``.

        Returns an empty ``NestedDotDict`` without querying when chem data is disabled.
        """
        if self._use_chem_data:
            url = f"{self._pug}/compound/cid/{cid}/JSON"
            compound = self._query_json(url)["PC_Compounds"][0]
            # redundant with props section in record
            del compound["props"]
            logger.debug(f"DLed structure for {cid}")
            return compound
        return NestedDotDict({})
238
239
    def _fetch_external_tables(self, cid: int) -> Mapping[str, str]:
        """
        Downloads each external table in ``_tables_to_use`` for ``cid``.

        The result is keyed by the collection names (the values of
        ``_tables_to_use``), not by its keys.
        """
        tables = {}
        for collection in self._tables_to_use.values():
            tables[collection] = self._fetch_external_table(cid, collection)
        logger.debug(f"DLed {len(self._tables_to_use)} external tables for {cid}")
        return tables
246
247
    def _fetch_external_linksets(self, cid: int) -> Mapping[str, str]:
        """
        Downloads each link set in ``_linksets_to_use`` for ``cid``.

        The result is keyed by the link-set type names (the values of
        ``_linksets_to_use``), not by its keys.
        """
        linksets = {}
        for link_type in self._linksets_to_use.values():
            linksets[link_type] = self._fetch_external_linkset(cid, link_type)
        logger.debug(f"DLed {len(self._linksets_to_use)} external linksets for {cid}")
        return linksets
254
255
    def _fetch_hierarchies(self, cid: int) -> NestedDotDict:
        """
        Downloads every classification hierarchy in ``_hierarchies_to_use`` for ``cid``.

        A hierarchy whose download or parsing fails is logged and left out.
        """
        wanted = self._hierarchies_to_use
        build_up = {}
        for hname, hid in wanted.items():
            try:
                hierarchy = self._fetch_hierarchy(cid, hname, hid)
            except (HTTPError, KeyError, LookupError) as e:
                logger.debug(f"No data for classifier {hid}, compound {cid}: {e}")
            else:
                build_up[hname] = hierarchy
        # "ChildID" lists all of the child nodes for each node.
        # Some of them are > 1000 items -- they're HUGE,
        # and we don't expect to need to navigate to children.
        self._strip_by_key_in_place(build_up, "ChildID")
        logger.debug(f"DLed {len(wanted)} hierarchies for {cid}")
        return NestedDotDict(build_up)
268
269
    def _fetch_external_table(self, cid: int, table: str) -> Sequence[dict]:
        """
        Downloads one external table as CSV and returns its rows as dicts.

        The frame's index is reset first, so each row also carries an ``index`` column.
        """
        csv_text = self._query(self._external_table_url(cid, table))
        buffer = io.StringIO(csv_text)
        frame: pd.DataFrame = pd.read_csv(buffer).reset_index()
        logger.debug(f"DLed table {table} with {len(frame)} rows for {cid}")
        rows = frame.to_dict(orient="records")
        return list(rows)
275
276
    def _fetch_external_linkset(self, cid: int, table: str) -> NestedDotDict:
        """
        Downloads all links of type ``table`` for ``cid`` from the link DB as JSON.
        """
        url = f"{self._link_db}?format=JSON&type={table}&operation=GetAllLinks&id_1={cid}"
        raw = self._query(url)
        logger.debug(f"DLed linkset {table} rows for {cid}")
        parsed = orjson.loads(raw)
        return NestedDotDict(parsed)
281
282
    def _fetch_hierarchy(self, cid: int, hname: str, hid: int) -> Sequence[dict]:
        """
        Downloads the entries of classification ``hid`` that contain ``cid``.

        Raises:
            LookupFailedError: If the ``Hierarchies`` list in the response is empty
        """
        params = (
            f"format=json&hid={hid}&search_uid_type=cid&search_uid={cid}"
            "&search_type=list&response_type=display"
        )
        url = f"{self._classifications}?{params}"
        response = orjson.loads(self._query(url))
        # underneath Hierarchies is a list of Hierarchy
        data: Sequence[dict] = response["Hierarchies"]
        logger.debug(f"Found data for classifier {hid}, compound {cid}")
        if len(data) == 0:
            raise LookupFailedError(f"Failed getting hierarchy {hid}")
        logger.debug(f"DLed hierarchy {hname} ({hid}) for {cid}")
        return data
291
292
    @property
293
    def _tables_to_use(self) -> Mapping[str, str]:
294
        dct = {
295
            "drug:clinicaltrials.gov:clinical_trials": "clinicaltrials",
296
            "pharm:pubchem:reactions": "pathwayreaction",
297
            "uses:cpdat:uses": "cpdat",
298
            "tox:chemidplus:acute_effects": "chemidplus",
299
            "dis:ctd:associated_disorders_and_diseases": "ctd_chemical_disease",
300
            "lit:pubchem:depositor_provided_pubmed_citations": "pubmed",
301
            "bio:dgidb:drug_gene_interactions": "dgidb",
302
            "bio:ctd:chemical_gene_interactions": "ctdchemicalgene",
303
            "bio:drugbank:drugbank_interactions": "drugbank",
304
            "bio:drugbank:drug_drug_interactions": "drugbankddi",
305
            "bio:pubchem:bioassay_results": "bioactivity",
306
        }
307
        if self._use_extra_tables:
308
            dct.update(
309
                {
310
                    "patent:depositor_provided_patent_identifiers": "patent",
311
                    "bio:rcsb_pdb:protein_bound_3d_structures": "pdb",
312
                    "related:pubchem:related_compounds_with_annotation": "compound",
313
                }
314
            )
315
        return dct
316
317
    @property
318
    def _linksets_to_use(self) -> Mapping[str, str]:
319
        return {
320
            "lit:pubchem:chemical_cooccurrences_in_literature": "ChemicalNeighbor",
321
            "lit:pubchem:gene_cooccurrences_in_literature": "ChemicalGeneSymbolNeighbor",
322
            "lit:pubchem:disease_cooccurrences_in_literature": "ChemicalDiseaseNeighbor",
323
        }
324
325
    @property
326
    def _hierarchies_to_use(self) -> Mapping[str, int]:
327
        if not self._use_classifiers:
328
            return {}
329
        dct = {
330
            "MeSH Tree": 1,
331
            "ChEBI Ontology": 2,
332
            "WHO ATC Classification System": 79,
333
            "Guide to PHARMACOLOGY Target Classification": 92,
334
            "ChEMBL Target Tree": 87,
335
        }
336
        if self._use_extra_classifiers:
337
            dct.update(
338
                {
339
                    "KEGG: Phytochemical Compounds": 5,
340
                    "KEGG: Drug": 14,
341
                    "KEGG: USP": 15,
342
                    "KEGG: Major components of natural products": 69,
343
                    "KEGG: Target-based Classification of Drugs": 22,
344
                    "KEGG: OTC drugs": 25,
345
                    "KEGG: Drug Classes": 96,
346
                    "CAMEO Chemicals": 86,
347
                    "EPA CPDat Classification": 99,
348
                    "FDA Pharm Classes": 78,
349
                    "ChemIDplus": 84,
350
                }
351
            )
352
        return dct
353
354
    def _external_table_url(self, cid: int, collection: str) -> str:
355
        return (
356
            self._sdg
357
            + "?infmt=json"
358
            + "&outfmt=csv"
359
            + "&query={ download : * , collection : "
360
            + collection
361
            + " , where :{ ands :[{ cid : "
362
            + str(cid)
363
            + " }]}}"
364
        ).replace(" ", "%22")
365
366
    def _query_json(self, url: str) -> NestedDotDict:
        """
        Queries ``url`` and parses the response as JSON.

        Raises:
            DownloadError: If the parsed response has a top-level ``Fault`` key
        """
        parsed = NestedDotDict(orjson.loads(self._query(url)))
        if "Fault" in parsed:
            code, message = parsed.get("Code"), parsed.get("Message")
            raise DownloadError(f"PubChem query failed ({code}) on {url}: {message}")
        logger.trace(
            f"Fetched JSON has {parsed.n_bytes_total()} bytes and {parsed.n_elements_total()} elements"
        )
        return parsed
377
378
    def _query(self, url: str):
        """
        Runs the executor on ``url``, logs size and timing, and returns the response.

        Args:
            url: The URL to request

        Returns:
            Whatever the executor returned for ``url``
        """
        data = self._executor(url)
        taken = self._executor.last_time_taken
        wait_s, query_s = taken.wait.total_seconds(), taken.query.total_seconds()
        # NOTE(review): len * 8 / 1024 is kilobits only if len(data) counts bytes -- confirm
        bts = int(len(data) * 8 / 1024)
        # ".1f" gives one decimal place; a bare ".1" means one significant digit
        # and renders e.g. 12.3 as "1e+01"
        logger.trace(
            f"Queried {bts} kb from {url} in {query_s:.1f} s with {wait_s:.1f} s of wait"
        )
        return data
385
386
    def _strip_by_key_in_place(self, data: Union[dict, list], bad_key: str) -> None:
387
        if isinstance(data, list):
388
            for x in data:
0 ignored issues
show
Coding Style Naming introduced by
Variable name "x" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
389
                self._strip_by_key_in_place(x, bad_key)
390
        elif isinstance(data, dict):
391
            for k, v in list(data.items()):
0 ignored issues
show
Coding Style Naming introduced by
Variable name "v" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
392
                if k == bad_key:
393
                    del data[k]
394
                elif isinstance(v, (list, dict)):
395
                    self._strip_by_key_in_place(v, bad_key)
396
397
398
__all__ = ["QueryingPubchemApi"]
399