Passed
Push — main ( 65730f...fad324 )
by Douglas
06:54 queued 02:27
created

CachingPubchemApi.get_links()   A

Complexity

Conditions 3

Size

Total Lines 10
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 10
nop 2
dl 0
loc 10
rs 9.9
c 0
b 0
f 0
1
"""
2
PubChem caching API.
3
"""
4
from __future__ import annotations
5
6
import gzip
7
import os
0 ignored issues
show
Unused Code introduced by
The import os seems to be unused.
Loading history...
8
from pathlib import Path
9
from typing import FrozenSet, Optional, Union
10
11
import orjson
0 ignored issues
show
introduced by
Unable to import 'orjson'
Loading history...
12
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
13
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
14
from pocketutils.core.exceptions import XValueError
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.exceptions'
Loading history...
15
16
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
17
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
18
from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi
19
from mandos.model.settings import SETTINGS
20
from mandos.model.utils.setup import logger
21
22
23
class CachingPubchemApi(PubchemApi):
    """
    PubChem API that serves compound data from an on-disk cache.

    Compound data is stored as gzip-compressed JSON under ``<cache_dir>/data``
    and similarity-search results under ``<cache_dir>/similarity``.
    On a cache miss the request is delegated to the wrapped querying API
    (when one was supplied) and the result is written to the cache.
    """

    def __init__(
        self,
        query: Optional[QueryingPubchemApi],
        cache_dir: Path = SETTINGS.pubchem_cache_path,
    ):
        """
        Args:
            query: API used on cache misses; ``None`` restricts lookups to the cache.
            cache_dir: Root directory of the cache.
        """
        self._cache_dir = cache_dir
        self._query = query

    def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
        """
        Return the data for a compound, reading the cache before querying.

        Args:
            inchikey_or_cid: Identifier used to build the cache file name.

        Raises:
            PubchemCompoundLookupError: If the cached entry is an empty placeholder
                (written when an earlier download failed), or if the entry is not
                cached and the download fails or no querying API is configured.
        """
        path = self.data_path(inchikey_or_cid)
        if not path.exists():
            return self._download(inchikey_or_cid)
        logger.debug("Found cached PubChem data")
        data = self._read_json(path)
        if data is None:
            raise PubchemCompoundLookupError(
                f"{inchikey_or_cid} previously not found in PubChem"
            )
        self._write_siblings(data)  # TODO: remove
        return data

    def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
        """
        Fetch data through the querying API and store it in the cache.

        A failed lookup is cached as an empty record so that later calls to
        ``fetch_data`` fail without querying again.

        Raises:
            PubchemCompoundLookupError: If no querying API is configured or the
                query itself raises this error.
        """
        if self._query is None:
            raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
        try:
            data: PubchemData = self._query.fetch_data(inchikey_or_cid)
        except PubchemCompoundLookupError:
            path = self.data_path(inchikey_or_cid)
            self._write_json(path, PubchemData(NestedDotDict({})))
            logger.debug(f"Wrote empty PubChem data to {path}")
            raise
        cid = data.parent_or_self  # if there's ever a parent of a parent, this will NOT work
        path = self.data_path(cid)
        if path.exists():
            logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} exists")
            logger.caution(f"Writing over {path} for {inchikey_or_cid}")
        else:
            logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} does not exist")
        self._write_json(path, data)
        self._write_siblings(data)
        logger.debug(f"Wrote PubChem data to {path.absolute()}")
        logger.info(f"Got PubChem data for {inchikey_or_cid}")
        return data

    def _write_json(self, path: Path, data: PubchemData) -> None:
        """Write ``data`` to ``path`` as gzip-compressed UTF-8 JSON, creating parent dirs."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(gzip.compress(data.to_json().encode(encoding="utf8")))

    def _write_siblings(self, data: PubchemData) -> None:
        """
        Hard-link every alias of a compound to its canonical cache file.

        The canonical file is the one named after ``data.parent_or_self``; the
        aliases are ``data.inchikey`` and each entry of ``data.siblings``.
        """
        cid = data.parent_or_self
        path = self.data_path(cid)
        # Collect raw identifiers (not paths): data_path() is applied exactly once below.
        aliases = {data.inchikey, *data.siblings}
        for alias in aliases:
            link = self.data_path(alias)
            if link == path:
                # Unlinking the canonical file would delete the data itself.
                continue
            link.unlink(missing_ok=True)
            # Same operation as Path.link_to(), which was removed in Python 3.12.
            os.link(path, link)
        joined = ",".join(sorted(str(alias) for alias in aliases))
        logger.debug(f"Added aliases {joined} ⇌ {cid} ({path})")

    def data_path(self, inchikey_or_cid: Union[int, str]) -> Path:
        """Return the cache file path for the given InChIKey or CID."""
        return self._cache_dir / "data" / f"{inchikey_or_cid}.json.gz"

    def _read_json(self, path: Path) -> Optional[PubchemData]:
        """
        Load gzip-compressed JSON from ``path``.

        Returns:
            The parsed data, or ``None`` if the stored JSON document is empty.
        """
        inflated = gzip.decompress(path.read_bytes())
        read = orjson.loads(inflated)
        return PubchemData(NestedDotDict(read)) if read else None

    def similarity_path(self, inchi: str, min_tc: float) -> Path:
        """
        Return the cache file path for a similarity search.

        Args:
            inchi: Query structure; used verbatim in the file name.
            min_tc: Minimum Tanimoto coefficient; must be a whole number of percent.

        Raises:
            XValueError: If ``min_tc`` is not an increment of 1%.
        """
        scaled = float(min_tc) * 100
        try:
            # round() rather than int(): 0.29 * 100 == 28.999999999999996.
            percent = round(scaled)
        except (ValueError, OverflowError):
            # NaN and infinity cannot be rounded to an int.
            percent = None
        if percent is None or abs(scaled - percent) > 1e-9:
            raise XValueError(f"min_tc {min_tc} is not an increment of 1%")
        path = self._cache_dir / "similarity" / f"{inchi}_{percent}"
        return path.with_suffix(SETTINGS.archive_filename_suffix)

    def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]:
        """
        Return the CIDs of compounds similar to ``inchi``, caching the result.

        Args:
            inchi: Query structure.
            min_tc: Minimum Tanimoto coefficient; must be an increment of 1%.

        Raises:
            XValueError: If ``min_tc`` is not an increment of 1%.
            PubchemCompoundLookupError: If the result is not cached and no
                querying API is configured.
        """
        logger.debug(f"Searching for {inchi} with min TC {min_tc}")
        path = self.similarity_path(inchi, min_tc)
        if path.exists():
            # NOTE(review): ``read_file`` is not part of pandas' documented API;
            # presumably supplied by a DataFrame extension elsewhere -- confirm.
            df = pd.read_file(path)
            return frozenset(set(df["cid"].values))
        if self._query is None:
            raise PubchemCompoundLookupError(f"Similar compounds for {inchi} not cached")
        found = frozenset(self._query.find_similar_compounds(inchi, min_tc))
        # Build the column explicitly so that "cid" exists even when nothing was found.
        df = pd.DataFrame({"cid": sorted(found)})
        path.parent.mkdir(parents=True, exist_ok=True)
        # NOTE(review): ``write_file`` is not a documented DataFrame method either -- confirm.
        df.write_file(path)
        logger.debug(f"Wrote {len(df)} values for {inchi} with min TC {min_tc}")
        return found
109
110
111
# Names exported by a wildcard import of this module.
__all__ = ["CachingPubchemApi"]
112