Passed
Push — main ( 5006f2...cee75c )
by Douglas
04:00
created

mandos.model.apis.caching_pubchem_api.CachingPubchemApi._read_inchikey_from_cid()   A

Complexity

Conditions 3

Size

Total Lines 10
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 9
nop 2
dl 0
loc 10
rs 9.95
c 0
b 0
f 0
1
"""
2
Caching layer for the PubChem querying API.
3
"""
4
from __future__ import annotations
5
6
import gzip
7
from pathlib import Path
8
from typing import FrozenSet, Optional, Union, Set
9
10
import orjson
0 ignored issues
show
introduced by
Unable to import 'orjson'
Loading history...
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
13
14
from mandos import logger
15
from mandos.model.settings import MANDOS_SETTINGS
16
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
17
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
18
from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi
19
20
21
class CachingPubchemApi(PubchemApi):
    """
    PubChem API that keeps fetched results in an on-disk cache.

    Files under ``cache_dir`` (see the ``*_path`` methods):

    - ``data/<cid>.json.gz``: gzip-compressed JSON for one compound.
    - ``links/<inchikey_or_cid>.txt``: text file holding the CID whose data
      file the identifier resolves to.
    - ``similarity/<inchi>_<percent><suffix>``: cached similarity-search hits.

    Cache misses are delegated to the wrapped ``QueryingPubchemApi`` when one
    was supplied.
    """

    def __init__(
        self,
        query: Optional[QueryingPubchemApi],
        cache_dir: Path = MANDOS_SETTINGS.pubchem_cache_path,
    ):
        """
        Args:
            query: API used on a cache miss; with ``None`` only cached data can be served.
            cache_dir: Root directory of the cache.
        """
        self._cache_dir = cache_dir
        self._query = query

    def follow_link(self, inchikey_or_cid: Union[int, str]) -> Optional[Path]:
        """
        Resolve an identifier to the path of its cached data file.

        Returns:
            The data path for the CID stored in the link file,
            or ``None`` if the link file is missing or empty.
        """
        link = self.link_path(inchikey_or_cid)
        try:
            cid = link.read_text(encoding="utf8").strip()
        except FileNotFoundError:
            # No link file means nothing is cached under this identifier;
            # callers (fetch_data, get_links) treat None as a cache miss.
            return None
        if not cid:
            return None
        return self.data_path(int(cid))

    def get_links(self, cid: int) -> Set[Path]:
        """
        Collect the link-file paths that should point at ``cid``.

        The identifiers are the compound's inchikey, its CID, its siblings,
        and the siblings listed in any sibling's cached data.

        Args:
            cid: CID whose data file is already in the cache.

        Returns:
            The link path of every collected identifier.
        """
        data = self._read_json(self.data_path(cid))
        siblings = set(data.siblings)
        # Loop over a snapshot: growing a set while iterating over it raises RuntimeError.
        for sibling in tuple(siblings):
            sibling_path = self.follow_link(sibling)
            if sibling_path is None:
                continue
            sibling_data = self._read_json(sibling_path)
            if sibling_data is not None:
                siblings.update(sibling_data.siblings)
        links = {data.inchikey, data.cid, *siblings}
        return {self.link_path(link) for link in links}

    # NOTE(review): static analysis reports that these parameters differ from the
    # overridden ``PubchemApi.fetch_data`` -- confirm against the base class.
    def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
        """
        Return the data for a compound, preferring the cache.

        Args:
            inchikey_or_cid: Identifier to look up.

        Returns:
            The cached data when a link for the identifier exists (``None`` if the
            cached file is empty); otherwise the freshly downloaded data.

        Raises:
            PubchemCompoundLookupError: If the compound is not cached and no
                querying API was supplied.
        """
        followed = self.follow_link(inchikey_or_cid)
        if followed is not None:
            logger.debug("Found cached PubChem data")
            return self._read_json(followed)
        return self._download(inchikey_or_cid)

    def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
        """
        Fetch a compound through the querying API and store it in the cache.

        Raises:
            PubchemCompoundLookupError: If no querying API was supplied.
        """
        if self._query is None:
            raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
        data: PubchemData = self._query.fetch_data(inchikey_or_cid)
        cid = data.parent_or_self
        path = self.data_path(cid)
        self._write_json(data.to_json(), path)
        # The requested identifier must be turned into its link path too,
        # so that every element of the set is a Path.
        links = {self.link_path(inchikey_or_cid), *self.get_links(cid)}
        for link in links:
            if not link.exists():
                # The links directory is not created anywhere else.
                link.parent.mkdir(parents=True, exist_ok=True)
                link.write_text(str(cid), encoding="utf8")
        logger.debug(f"Wrote PubChem data to {path.absolute()}")
        return data

    def link_path(self, inchikey_or_cid: Union[int, str]) -> Path:
        """Return the path of the link file for an inchikey or CID."""
        return self._cache_dir / "links" / f"{inchikey_or_cid}.txt"

    def data_path(self, cid: int) -> Path:
        """Return the path of the gzip-compressed JSON data file for a CID."""
        return self._cache_dir / "data" / f"{cid}.json.gz"

    def similarity_path(self, inchi: str, min_tc: float) -> Path:
        """
        Return the cache path for a similarity search.

        Args:
            inchi: Query InChI string.
            min_tc: Similarity threshold as a fraction; must be a whole number of percent.

        Raises:
            ValueError: If ``min_tc`` is not an increment of 1%.
        """
        scaled = min_tc * 100
        percent = round(scaled)
        # Compare with a tolerance: 0.07 * 100 evaluates to 7.000000000000001,
        # so an exact integer test would reject valid 1% steps.
        if abs(scaled - percent) > 1e-9:
            raise ValueError(f"min_tc {min_tc} is not an increment of 1%")
        # Append the suffix instead of using with_suffix(), which would replace
        # everything after the last "." in the InChI-derived file name.
        suffix = MANDOS_SETTINGS.archive_filename_suffix
        return self._cache_dir / "similarity" / f"{inchi}_{percent}{suffix}"

    def _write_json(self, encoded: str, path: Path) -> None:
        """Write ``encoded`` to ``path`` as gzip-compressed UTF-8, creating parent directories."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(gzip.compress(encoded.encode(encoding="utf8")))

    def _read_json(self, path: Path) -> Optional[PubchemData]:
        """Read a gzip-compressed JSON file; return ``None`` if the decoded JSON is empty."""
        deflated = gzip.decompress(path.read_bytes())
        read = orjson.loads(deflated)
        if not read:
            return None
        return PubchemData(NestedDotDict(read))

    def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]:
        """
        Return the CIDs of compounds similar to ``inchi``, preferring the cache.

        Args:
            inchi: Query InChI string.
            min_tc: Similarity threshold as a fraction; must be a whole number of percent.

        Returns:
            The matching CIDs.

        Raises:
            ValueError: If ``min_tc`` is not an increment of 1%.
            PubchemCompoundLookupError: If the result is not cached and no
                querying API was supplied.
        """
        path = self.similarity_path(inchi, min_tc)
        if path.exists():
            # NOTE(review): results are stored as CSV (pandas has no top-level
            # read_file / DataFrame.write_file); pandas infers compression from
            # the configured suffix -- confirm the intended archive format.
            cached = pd.read_csv(path)
            return frozenset(int(cid) for cid in cached["cid"])
        if self._query is None:
            raise PubchemCompoundLookupError(f"Similar compounds for {inchi} not cached")
        found = frozenset(self._query.find_similar_compounds(inchi, min_tc))
        path.parent.mkdir(parents=True, exist_ok=True)
        # Building the frame from a dict keeps the "cid" column even when nothing was found.
        pd.DataFrame({"cid": sorted(found)}).to_csv(path, index=False)
        return found
101
102
103
# Names exported by a star-import of this module.
__all__ = ["CachingPubchemApi"]
104