Passed
Push — main ( d08a4e...a07aa0 )
by Douglas
01:59
created

CachingPubchemApi.fetch_data()   A

Complexity

Conditions 3

Size

Total Lines 16
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 15
nop 2
dl 0
loc 16
rs 9.65
c 0
b 0
f 0
1
"""
2
PubChem caching API: serves results from an on-disk cache, falling back to a querying API.
3
"""
4
from __future__ import annotations
5
6
import abc
0 ignored issues
show
Unused Code introduced by
The import abc seems to be unused.
Loading history...
7
import logging
8
from pathlib import Path
9
from typing import Optional, Sequence, Union, FrozenSet, Mapping
0 ignored issues
show
Unused Code introduced by
Unused Mapping imported from typing
Loading history...
Unused Code introduced by
Unused Sequence imported from typing
Loading history...
10
11
import io
0 ignored issues
show
Unused Code introduced by
The import io seems to be unused.
Loading history...
12
import gzip
13
import orjson
0 ignored issues
show
introduced by
Unable to import 'orjson'
Loading history...
14
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
15
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
16
17
from mandos.model.pubchem_api import PubchemCompoundLookupError, PubchemApi
18
from mandos.model.pubchem_support.pubchem_data import PubchemData
19
from mandos.model.querying_pubchem_api import QueryingPubchemApi
20
21
logger = logging.getLogger("mandos")
22
23
24
class CachingPubchemApi(PubchemApi):
    """
    PubChem API backed by an on-disk cache.

    Compound data is stored as one JSON file per InChIKey under ``<cache_dir>/data``,
    and similarity results as one tab-separated file per query compound under
    ``<cache_dir>/similarity``. When ``compress`` is true the files carry a ``.gz``
    suffix: JSON is gzipped explicitly, and pandas infers gzip from the extension
    for the tab-separated files.

    On a cache miss the request is forwarded to the wrapped querier (if one was
    given) and the result is written to the cache before being returned.
    """

    def __init__(
        self, cache_dir: Path, querier: Optional[QueryingPubchemApi], compress: bool = True
    ):
        """
        Args:
            cache_dir: Root directory of the cache.
            querier: API used to fetch anything not cached; ``None`` makes the cache read-only.
            compress: Whether cache files are gzip-compressed (also selects the file extension).
        """
        self._cache_dir = cache_dir
        self._querier = querier
        self._compress = compress

    def fetch_data(self, inchikey: str) -> Optional[PubchemData]:
        """
        Returns the PubChem data for a compound, from the cache if present.

        Args:
            inchikey: Key of the compound; also used as the cache file name.

        Returns:
            The cached data, or freshly downloaded data (which is then cached).

        Raises:
            PubchemCompoundLookupError: If the data is not cached and no querier is available.
        """
        path = self.data_path(inchikey)
        if path.exists():
            logger.info("Found cached PubChem data at %s", path.absolute())
            return PubchemData(self._read_json(path))
        if self._querier is None:
            raise PubchemCompoundLookupError(f"Key {inchikey} not found cached at {path}")
        logger.info("Downloading PubChem data for %s ...", inchikey)
        data = self._querier.fetch_data(inchikey)
        path.parent.mkdir(parents=True, exist_ok=True)
        self._write_json(data.to_json(), path)
        logger.info("Wrote PubChem data to %s", path.absolute())
        # Return the object we just fetched rather than re-reading the file.
        return data

    def data_path(self, inchikey: str) -> Path:
        """
        Returns the cache file path for the compound data of ``inchikey``.
        """
        ext = ".json.gz" if self._compress else ".json"
        return self._cache_dir / "data" / f"{inchikey}{ext}"

    def similarity_path(self, inchikey: Union[int, str]) -> Path:
        """
        Returns the cache file path for the similarity results of ``inchikey``.
        """
        ext = ".tab.gz" if self._compress else ".tab"
        return self._cache_dir / "similarity" / f"{inchikey}{ext}"

    def _write_json(self, encoded: str, path: Path) -> None:
        """
        Writes an encoded JSON string to ``path`` as UTF-8, gzipped if compression is on.
        """
        if self._compress:
            path.write_bytes(gzip.compress(encoded.encode(encoding="utf8")))
        else:
            path.write_text(encoded, encoding="utf8")

    def _read_json(self, path: Path) -> NestedDotDict:
        """
        Reads a JSON file written by ``_write_json`` and wraps it in a ``NestedDotDict``.
        """
        if self._compress:
            raw = gzip.decompress(path.read_bytes())
        else:
            raw = path.read_text(encoding="utf8")
        return NestedDotDict(orjson.loads(raw))

    def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]:
        """
        Returns the IDs of compounds similar to ``inchi``, from the cache if possible.

        Each cache row holds a ``cid`` and the ``min_tc`` threshold of the query that
        produced it (not a measured similarity). Rows whose threshold is less than or
        equal to the requested ``min_tc`` are reused; if there are none, the querier
        is called and its results are appended to the cache file.

        NOTE(review): rows cached at a *lower* threshold are reused as-is, so the
        result may include compounds below the requested ``min_tc``. The cache does
        not store the information needed to trim them -- confirm this is intended.

        Args:
            inchi: Identifier of the query compound; also used as the cache file name.
            min_tc: Similarity threshold passed to the querier and recorded in the cache.

        Returns:
            The matching compound IDs as plain ``int`` values.

        Raises:
            PubchemCompoundLookupError: If nothing usable is cached and no querier is available.
        """
        columns = ["cid", "min_tc"]
        path = self.similarity_path(inchi)
        cached = None
        hits = frozenset()
        if path.exists():
            # round_trip parsing guarantees a threshold written earlier compares
            # equal to the same float now; older cache files may carry extra
            # index columns, so keep only the two we use.
            cached = pd.read_csv(path, sep="\t", float_precision="round_trip")[columns]
            usable = cached[cached["min_tc"] <= min_tc]
            # Older cache files stored cids as floats; normalize to int.
            hits = frozenset(int(cid) for cid in usable["cid"])
        if hits:
            return hits
        if self._querier is None:
            raise PubchemCompoundLookupError(
                f"Similar compounds for {inchi} not found cached at {path}"
            )
        found = frozenset(self._querier.find_similar_compounds(inchi, min_tc))
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit columns keep the header present even when nothing was found.
        updated = pd.DataFrame([(cid, min_tc) for cid in found], columns=columns)
        if cached is not None and not cached.empty:
            # Keep every previously cached row, including those for higher thresholds.
            updated = pd.concat([cached, updated], ignore_index=True)
        updated.to_csv(path, sep="\t", index=False)
        return found
90
91
92
__all__ = ["CachingPubchemApi"]
93