Passed
Push — main ( 4b9dc0...1b55d1 )
by Douglas
06:16 queued 02:32
created

  A

Complexity

Conditions 1

Size

Total Lines 7
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 7
nop 2
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
"""
2
PubChem querying API.
3
"""
4
from __future__ import annotations
5
6
import io
7
import time
8
from datetime import datetime, timezone
9
from typing import Any, List, Mapping, NamedTuple, Optional, Sequence, Union
10
from urllib.error import HTTPError
11
12
import orjson
0 ignored issues
show
introduced by
Unable to import 'orjson'
Loading history...
13
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
14
import regex
0 ignored issues
show
introduced by
Unable to import 'regex'
Loading history...
15
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
16
from pocketutils.core.exceptions import (
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.exceptions'
Loading history...
17
    DataIntegrityError,
18
    DownloadError,
19
    LookupFailedError,
20
)
21
from pocketutils.core.query_utils import QueryExecutor, QueryMixin
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.query_utils'
Loading history...
22
23
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
24
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
25
from mandos.model.settings import QUERY_EXECUTORS, SETTINGS
26
from mandos.model.utils.setup import logger
27
28
# Matches the Open Graph ``og:url`` <meta> tag of a PubChem compound HTML page.
# Group 1 captures the numeric CID at the end of the URL.
_html_cid_pattern = regex.compile(
29
    r'<meta property="og:url" content="https://pubchem\.ncbi\.nlm\.nih\.gov/compound/(\d+)">',
30
    flags=regex.V1,
31
)
32
33
34
# A (CID, InChIKey) pair; instances are appended to the ``stack`` list while fetching records.
class _CidInchikey(NamedTuple):
35
    cid: int
36
    inchikey: str
37
38
39
class QueryingPubchemApi(PubchemApi, QueryMixin):
0 ignored issues
show
introduced by
Missing class docstring
Loading history...
40
    def __init__(
        self,
        chem_data: bool = True,
        extra_tables: bool = False,
        classifiers: bool = False,
        extra_classifiers: bool = False,
        executor: QueryExecutor = QUERY_EXECUTORS.pubchem,
    ):
        """
        Store the download options.

        Args:
            chem_data: Whether to download the structure record (see ``_fetch_structure_data``)
            extra_tables: Whether to add the extra external tables (see ``_tables_to_use``)
            classifiers: Whether to download classification hierarchies
                         (see ``_hierarchies_to_use``)
            extra_classifiers: Whether to add the extra hierarchies; only consulted
                               when ``classifiers`` is true
            executor: The query executor to store on this instance
        """
        self._executor = executor
        # flags read by the _fetch_* methods and the *_to_use properties
        self._use_chem_data = chem_data
        self._use_extra_tables = extra_tables
        self._use_classifiers = classifiers
        self._use_extra_classifiers = extra_classifiers
53
54
    # Base URLs of the PubChem endpoints queried by this class.
    # _pug: PUG REST (properties, structure, and linked-CID queries)
    _pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
55
    # _pug_view: PUG View (the "display" record)
    _pug_view = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view"
56
    # _sdg: the agent used to download external tables as CSV
    # NOTE(review): the name looks like a typo for "sdq" (cf. the URL) -- confirm before renaming
    _sdg = "https://pubchem.ncbi.nlm.nih.gov/sdq/sdqagent.cgi"
57
    # _classifications: classification hierarchies
    _classifications = "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi"
58
    # _link_db: link sets
    _link_db = "https://pubchem.ncbi.nlm.nih.gov/link_db/link_db_server.cgi"
59
60
    def find_inchikey(self, cid: int) -> str:
0 ignored issues
show
introduced by
Missing function or method docstring
Loading history...
61
        # return self.fetch_data(cid).names_and_identifiers.inchikey
62
        props = self.fetch_properties(cid)
63
        return props["InChIKey"]
64
65
    def find_id(self, inchikey: str) -> Optional[int]:
        """
        Return the compound ID for an InChIKey, or ``None`` if the lookup fails.

        A failed lookup (``PubchemCompoundLookupError``) is logged at debug level
        and is not propagated.
        """
        # The page has to be scraped to resolve the parent compound anyway,
        # so the whole record is downloaded here.
        # TODO: there's a faster way
        try:
            found = self.fetch_data(inchikey).cid
        except PubchemCompoundLookupError:
            logger.opt(exception=True).debug(f"Could not find pubchem ID for {inchikey}")
            found = None
        return found
74
75
    def fetch_properties(self, cid: int) -> Mapping[str, Any]:
        """
        Download the ``props`` section of a compound from PUG REST and flatten it.

        Args:
            cid: The PubChem compound ID

        Returns:
            A dict mapping each property's ``urn.label`` to its value, which is the first of
            ``ival``, ``fval``, or ``sval`` present (``None`` if none of them is).
            Properties lacking a label or a value are dropped.
            If several properties share a label, the last one wins.

        Raises:
            PubchemCompoundLookupError: If the query raises an ``HTTPError``
        """
        url = f"{self._pug}/compound/cid/{cid}/JSON"
        try:
            matches: NestedDotDict = self._query_json(url)
        except HTTPError as err:
            # chain explicitly so the HTTP failure is reported as the direct cause
            raise PubchemCompoundLookupError(f"Failed finding pubchem compound {cid}") from err
        raw_props = matches["PC_Compounds"][0]["props"]
        by_label = {NestedDotDict(p).get("urn.label"): p.get("value") for p in raw_props}

        def _get_val(value) -> Any:
            # a value holds one typed entry: integer, float, or string
            value = NestedDotDict(value)
            for type_key in ("ival", "fval", "sval"):
                if type_key in value.keys():
                    return value[type_key]
            return None

        props = {
            label: _get_val(value)
            for label, value in by_label.items()
            if label is not None and value is not None
        }
        logger.debug(f"DLed properties for {cid}")
        return props
94
95
    def fetch_data(self, inchikey: Union[str, int]) -> PubchemData:
        """
        Download the data for a compound, replacing it with its parent's data if it has a parent.

        The steps, spread over the helper methods, are:
            1. Download HTML for the InChI key and scrape the CID
               (or, given a CID, look up its InChI key)
            2. Download the "display" JSON data from the CID
            3. Look for a Parent-type related compound. If it exists, download its display data
            4. Download the structural data and append it
            5. Download the external table CSVs and append them
            6. Download the link sets and append them
            7. Download the classifiers (hierarchies) and append them
            8. Attach metadata about how we found this
            9. Return the result as one large JSON-like struct

        Args:
            inchikey: An InChIKey (``str``) or a PubChem compound ID (``int``)

        Returns:
            The data for the parent compound if one is listed, else for the compound itself
        """
        logger.info(f"Downloading PubChem data for {inchikey}")
        if isinstance(inchikey, int):
            cid = inchikey
            # note: this might not be the parent
            # that's ok -- we're about to fix that
            inchikey = self.find_inchikey(cid)
            logger.debug(f"Matched CID {cid} to {inchikey}")
        else:
            cid = self._scrape_cid(inchikey)
            logger.debug(f"Matched inchikey {inchikey} to CID {cid} (scraped)")
        # records every (cid, inchikey) fetched so far; filled in by _fetch_data
        stack = []
        data = self._fetch_data(cid, inchikey, stack)
        logger.debug(f"DLed raw data for {cid}/{inchikey}")
        data = self._get_parent(cid, inchikey, data, stack)
        logger.debug(f"DLed PubChem compound {cid}")
        return data
123
124
    def _scrape_cid(self, inchikey: str) -> int:
        """
        Find the CID for an InChIKey by scraping the compound's HTML page.

        The query is attempted up to ``SETTINGS.pubchem_n_tries`` times; only
        ``ConnectionAbortedError`` triggers a retry, and the first success ends the loop.

        Args:
            inchikey: The InChIKey to look up

        Returns:
            The CID captured from the page's ``og:url`` meta tag

        Raises:
            PubchemCompoundLookupError: If the query raises an ``HTTPError``
            DownloadError: If no attempt produced a response
            DataIntegrityError: If the ``og:url`` meta tag is not found in the HTML
        """
        # Why scrape? Every attempt to get the actual, correct, unique CID for an inchikey
        # through the proper PubChem APIs failed:
        # - <pug_view>/data/compound/<inchikey> does not work -- only a CID is accepted there
        # - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/GJSURZIOUXUGAL-UHFFFAOYSA-N/record/JSON
        #   returns multiple results, with no apparent way to find out which one is real
        # - Querying each found CID for its display data and comparing parents yields
        #   multiple contradictory parents, and is insanely slow (full JSON per candidate)
        # - Even worse, ?cids_type=parent DOES NOT GIVE THE PARENT compound, despite the docs
        #   Ex: https://pubchem.ncbi.nlm.nih.gov/compound/656832
        #   This is cocaine HCl, which has cocaine (446220) as a parent, but
        #   https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/656832/JSON
        #   gives 656832 back again; same thing when querying by inchikey
        # The HTML page for an inchikey does contain the CID. From there, we download
        # its "display" data, get the parent, and then download the parent's data.
        url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{inchikey}"
        html = None
        for _ in range(SETTINGS.pubchem_n_tries):
            try:
                html = self._query(url)
            except ConnectionAbortedError:
                logger.opt(exception=True).warning(
                    f"Connection aborted for {inchikey} [url: {url}]"
                )
                continue
            except HTTPError as err:
                raise PubchemCompoundLookupError(
                    f"Failed finding pubchem compound (HTML) from {inchikey} [url: {url}]"
                ) from err
            # success: stop here rather than re-downloading the page on the remaining tries
            break
        if html is None:
            # reachable when every attempt was aborted (or when the number of tries is 0)
            raise DownloadError(
                f"No response after {SETTINGS.pubchem_n_tries} tries for {inchikey} [url: {url}]"
            )
        match = _html_cid_pattern.search(html)
        if match is None:
            raise DataIntegrityError(
                f"Something is wrong with the HTML from {url}; og:url not found"
            )
        return int(match.group(1))
168
169
    def _get_parent(
        self, cid: int, inchikey: str, data: PubchemData, stack: List[_CidInchikey]
    ) -> PubchemData:
        """
        Return the parent compound's data, or ``data`` itself if no parent is listed.

        Args:
            cid: The CID that ``data`` was downloaded for
            inchikey: The InChIKey used for the lookup; also passed on for the parent's fetch
            data: The already-downloaded data for ``cid``
            stack: The list of (cid, inchikey) pairs fetched so far; passed to ``_fetch_data``

        Raises:
            PubchemCompoundLookupError: If downloading the parent fails
        """
        parent_cid = data.parent_or_none
        if parent_cid is None:
            # nothing is cached, so avoid downloading the same compound a second time
            logger.info(f"{cid}/{inchikey} is its own parent")
            return data
        logger.info(f"{cid}/{inchikey} has parent {parent_cid}")
        try:
            return self._fetch_data(parent_cid, inchikey, stack)
        except HTTPError as err:
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem parent compound (JSON) "
                f"for cid {parent_cid}, child cid {cid}, inchikey {inchikey}"
            ) from err
186
187
    def _fetch_data(self, cid: int, inchikey: str, stack: List[_CidInchikey]) -> PubchemData:
        """
        Download all sections for one CID, attach fetch metadata, and wrap the result.

        Args:
            cid: The compound ID to download
            inchikey: The InChIKey used for the lookup; stored in the metadata as ``from_lookup``
            stack: The list of (cid, inchikey) pairs fetched so far; this pair is appended to it

        Returns:
            The downloaded data with every ``DisplayControls`` key stripped out

        Raises:
            PubchemCompoundLookupError: If any of the downloads raises an ``HTTPError``
        """
        when_started = datetime.now(timezone.utc).astimezone()
        t0 = time.monotonic()
        try:
            data = self._fetch_core_data(cid, stack)
        except HTTPError as err:
            raise PubchemCompoundLookupError(
                f"Failed finding pubchem compound (JSON) from cid {cid}, inchikey {inchikey}"
            ) from err
        t1 = time.monotonic()
        when_finished = datetime.now(timezone.utc).astimezone()
        logger.trace(f"Downloaded {cid} in {t1-t0} s")
        data["meta"] = self._get_metadata(inchikey, when_started, when_finished, t0, t1)
        self._strip_by_key_in_place(data, "DisplayControls")
        # appended only after the download, so the linked records fetched above
        # were built from the pairs fetched before this one
        stack.append(_CidInchikey(cid, inchikey))
        logger.trace(f"Stack: {stack}")
        return PubchemData(NestedDotDict(data))
204
205
    def _fetch_core_data(self, cid: int, stack: List[_CidInchikey]) -> dict:
        """
        Download every section for ``cid`` and return them in a plain dict.

        The sections are fetched in the order in which the keys are listed.
        """
        return {
            "record": self._fetch_display_data(cid),
            "linked_records": self._get_linked_records(cid, stack),
            "structure": self._fetch_structure_data(cid),
            "external_tables": self._fetch_external_tables(cid),
            "link_sets": self._fetch_external_linksets(cid),
            "classifications": self._fetch_hierarchies(cid),
            "properties": NestedDotDict(self.fetch_properties(cid)),
        }
215
216
    def _get_metadata(
0 ignored issues
show
Coding Style introduced by
This method could be written as a function/class method.

If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example

class Foo:
    def some_method(self, x, y):
        return x + y;

could be written as

class Foo:
    @classmethod
    def some_method(cls, x, y):
        return x + y;
Loading history...
Coding Style Naming introduced by
Argument name "t1" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
Coding Style Naming introduced by
Argument name "t0" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
best-practice introduced by
Too many arguments (6/5)
Loading history...
217
        self, inchikey: str, started: datetime, finished: datetime, t0: float, t1: float
0 ignored issues
show
Coding Style introduced by
Wrong hanging indentation before block (add 4 spaces).
Loading history...
218
    ):
219
        return dict(
220
            timestamp_fetch_started=started.isoformat(),
221
            timestamp_fetch_finished=finished.isoformat(),
222
            from_lookup=inchikey,
223
            fetch_secs_taken=str(t1 - t0),
224
        )
225
226
    def _get_linked_records(self, cid: int, stack: List[_CidInchikey]) -> NestedDotDict:
        """
        Collect the CIDs linked to ``cid`` plus the compounds already fetched.

        Returns:
            ``CID``: the CIDs from the ``same_parent_stereo`` query followed by those in ``stack``;
            ``inchikey``: the InChIKeys in ``stack``
        """
        url = f"{self._pug}/compound/cid/{cid}/cids/JSON?cids_type=same_parent_stereo"
        id_list = self._query_json(url).sub("IdentifierList")
        logger.debug(f"DLed {len(id_list.get('CID', []))} linked records for {cid}")
        stack_cids = [c for c, _ in stack]
        stack_inchikeys = [k for _, k in stack]
        results = {
            "CID": [*id_list.get("CID", []), *stack_cids],
            "inchikey": stack_inchikeys,
        }
        logger.debug(f"Linked records are: {results}")
        return NestedDotDict(results)
236
237
    def _fetch_display_data(self, cid: int) -> Optional[NestedDotDict]:
        """
        Download the "display" data for ``cid`` from PUG View and return its ``Record`` entry.
        """
        url = f"{self._pug_view}/data/compound/{cid}/JSON/?response_type=display"
        response = self._query_json(url)
        record = response["Record"]
        logger.debug(f"DLed display data for {cid}")
        return record
242
243
    def _fetch_structure_data(self, cid: int) -> NestedDotDict:
        """
        Download the first ``PC_Compounds`` entry for ``cid``, minus its ``props`` section.

        Returns an empty ``NestedDotDict`` without querying if chem data is disabled.
        """
        if self._use_chem_data:
            url = f"{self._pug}/compound/cid/{cid}/JSON"
            compound = self._query_json(url)["PC_Compounds"][0]
            # redundant with props section in record
            del compound["props"]
            logger.debug(f"DLed structure for {cid}")
            return compound
        return NestedDotDict({})
251
252
    def _fetch_external_tables(self, cid: int) -> Mapping[str, str]:
        """
        Download every external table in ``_tables_to_use`` for ``cid``.

        The result is keyed by the collection name (the *values* of ``_tables_to_use``).
        """
        tables = {}
        for collection in self._tables_to_use.values():
            tables[collection] = self._fetch_external_table(cid, collection)
        logger.debug(f"DLed {len(self._tables_to_use)} external tables for {cid}")
        return tables
259
260
    def _fetch_external_linksets(self, cid: int) -> Mapping[str, str]:
        """
        Download every link set in ``_linksets_to_use`` for ``cid``.

        The result is keyed by the link type (the *values* of ``_linksets_to_use``).
        """
        linksets = {}
        for link_type in self._linksets_to_use.values():
            linksets[link_type] = self._fetch_external_linkset(cid, link_type)
        logger.debug(f"DLed {len(self._linksets_to_use)} external linksets for {cid}")
        return linksets
267
268
    def _fetch_hierarchies(self, cid: int) -> NestedDotDict:
        """
        Download each hierarchy in ``_hierarchies_to_use`` for ``cid``, skipping failures.

        A hierarchy whose download raises ``HTTPError``, ``KeyError``, or ``LookupError``
        is logged at debug level and left out of the result.
        """
        found = {}
        for hname, hid in self._hierarchies_to_use.items():
            try:
                hierarchy = self._fetch_hierarchy(cid, hname, hid)
            except (HTTPError, KeyError, LookupError) as err:
                logger.debug(f"No data for classifier {hid}, compound {cid}: {err}")
            else:
                found[hname] = hierarchy
        # ChildID lists all of the child nodes for each node
        # Some of them are > 1000 items -- they're HUGE
        # We don't expect to need to navigate to children
        self._strip_by_key_in_place(found, "ChildID")
        logger.debug(f"DLed {len(self._hierarchies_to_use)} hierarchies for {cid}")
        return NestedDotDict(found)
281
282
    def _fetch_external_table(self, cid: int, table: str) -> Sequence[dict]:
        """
        Download one external table as CSV and return its rows as a list of dicts.

        The index is reset before conversion, so each row also carries an ``index`` entry.
        """
        csv_text = self._query(self._external_table_url(cid, table))
        frame: pd.DataFrame = pd.read_csv(io.StringIO(csv_text))
        frame = frame.reset_index()
        logger.debug(f"DLed table {table} with {len(frame)} rows for {cid}")
        rows = frame.to_dict(orient="records")
        return list(rows)
288
289
    def _fetch_external_linkset(self, cid: int, table: str) -> NestedDotDict:
        """
        Download all links of type ``table`` for ``cid`` and parse the JSON response.
        """
        url = f"{self._link_db}?format=JSON&type={table}&operation=GetAllLinks&id_1={cid}"
        raw = self._query(url)
        logger.debug(f"DLed linkset {table} rows for {cid}")
        parsed = orjson.loads(raw)
        return NestedDotDict(parsed)
294
295
    def _fetch_hierarchy(self, cid: int, hname: str, hid: int) -> Sequence[dict]:
        """
        Download the classification hierarchy ``hid`` for ``cid``.

        Args:
            cid: The compound ID
            hname: The hierarchy's name; used only for logging
            hid: The hierarchy's numeric ID

        Returns:
            The list under ``Hierarchies`` in the response

        Raises:
            LookupFailedError: If that list is empty
        """
        url = f"{self._classifications}?format=json&hid={hid}&search_uid_type=cid&search_uid={cid}&search_type=list&response_type=display"
        response = orjson.loads(self._query(url))
        # underneath Hierarchies is a list of Hierarchy
        hierarchies: Sequence[dict] = response["Hierarchies"]
        logger.debug(f"Found data for classifier {hid}, compound {cid}")
        if len(hierarchies) > 0:
            logger.debug(f"DLed hierarchy {hname} ({hid}) for {cid}")
            return hierarchies
        raise LookupFailedError(f"Failed getting hierarchy {hid}")
304
305
    @property
306
    def _tables_to_use(self) -> Mapping[str, str]:
307
        dct = {
308
            "drug:clinicaltrials.gov:clinical_trials": "clinicaltrials",
309
            "pharm:pubchem:reactions": "pathwayreaction",
310
            "uses:cpdat:uses": "cpdat",
311
            "tox:chemidplus:acute_effects": "chemidplus",
312
            "dis:ctd:associated_disorders_and_diseases": "ctd_chemical_disease",
313
            "lit:pubchem:depositor_provided_pubmed_citations": "pubmed",
314
            "bio:dgidb:drug_gene_interactions": "dgidb",
315
            "bio:ctd:chemical_gene_interactions": "ctdchemicalgene",
316
            "bio:drugbank:drugbank_interactions": "drugbank",
317
            "bio:drugbank:drug_drug_interactions": "drugbankddi",
318
            "bio:pubchem:bioassay_results": "bioactivity",
319
        }
320
        if self._use_extra_tables:
321
            dct.update(
322
                {
323
                    "patent:depositor_provided_patent_identifiers": "patent",
324
                    "bio:rcsb_pdb:protein_bound_3d_structures": "pdb",
325
                    "related:pubchem:related_compounds_with_annotation": "compound",
326
                }
327
            )
328
        return dct
329
330
    @property
331
    def _linksets_to_use(self) -> Mapping[str, str]:
332
        return {
333
            "lit:pubchem:chemical_cooccurrences_in_literature": "ChemicalNeighbor",
334
            "lit:pubchem:gene_cooccurrences_in_literature": "ChemicalGeneSymbolNeighbor",
335
            "lit:pubchem:disease_cooccurrences_in_literature": "ChemicalDiseaseNeighbor",
336
        }
337
338
    @property
339
    def _hierarchies_to_use(self) -> Mapping[str, int]:
340
        if not self._use_classifiers:
341
            return {}
342
        dct = {
343
            "MeSH Tree": 1,
344
            "ChEBI Ontology": 2,
345
            "WHO ATC Classification System": 79,
346
            "Guide to PHARMACOLOGY Target Classification": 92,
347
            "ChEMBL Target Tree": 87,
348
        }
349
        if self._use_extra_classifiers:
350
            dct.update(
351
                {
352
                    "KEGG: Phytochemical Compounds": 5,
353
                    "KEGG: Drug": 14,
354
                    "KEGG: USP": 15,
355
                    "KEGG: Major components of natural products": 69,
356
                    "KEGG: Target-based Classification of Drugs": 22,
357
                    "KEGG: OTC drugs": 25,
358
                    "KEGG: Drug Classes": 96,
359
                    "CAMEO Chemicals": 86,
360
                    "EPA CPDat Classification": 99,
361
                    "FDA Pharm Classes": 78,
362
                    "ChemIDplus": 84,
363
                }
364
            )
365
        return dct
366
367
    def _external_table_url(self, cid: int, collection: str) -> str:
368
        return (
369
            self._sdg
370
            + "?infmt=json"
371
            + "&outfmt=csv"
372
            + "&query={ download : * , collection : "
373
            + collection
374
            + " , where :{ ands :[{ cid : "
375
            + str(cid)
376
            + " }]}}"
377
        ).replace(" ", "%22")
378
379
    def _query_json(self, url: str) -> NestedDotDict:
        """
        Query ``url`` and parse the response as JSON.

        Raises:
            DownloadError: If the parsed response contains a ``Fault`` key
        """
        parsed = NestedDotDict(orjson.loads(self._query(url)))
        if "Fault" not in parsed:
            logger.trace(
                f"Fetched JSON has {parsed.n_bytes_total()} bytes and {parsed.n_elements_total()} elements"
            )
            return parsed
        raise DownloadError(
            f"PubChem query failed ({parsed.get('Code')}) on {url}: {parsed.get('Message')}"
        )
390
391
    @property
392
    def executor(self) -> QueryExecutor:
0 ignored issues
show
introduced by
Missing function or method docstring
Loading history...
393
        raise NotImplementedError()
394
395
    def _strip_by_key_in_place(self, data: Union[dict, list], bad_key: str) -> None:
396
        if isinstance(data, list):
397
            for x in data:
0 ignored issues
show
Coding Style Naming introduced by
Variable name "x" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
398
                self._strip_by_key_in_place(x, bad_key)
399
        elif isinstance(data, dict):
400
            for k, v in list(data.items()):
0 ignored issues
show
Coding Style Naming introduced by
Variable name "v" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern)

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
401
                if k == bad_key:
402
                    del data[k]
403
                elif isinstance(v, (list, dict)):
404
                    self._strip_by_key_in_place(v, bad_key)
405
406
407
# Only the API class is exported by ``from ... import *``.
__all__ = ["QueryingPubchemApi"]
408