mandos.model.apis.caching_pubchem_api.CachingPubchemApi._add_cid() - Code Metrics - Inspection of "feat: misc" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( 5006f2...cee75c )

by Douglas

created 2021-08-16 02:56 UTC

mandos.model.apis.caching_pubchem_api.CachingPubchemApi._add_cid() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines	10
Code Lines	9

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	9
nop	3
dl	0
loc	10
rs	9.95
c	0
b	0
f	0

"""
PubChem caching API: serves compound data from an on-disk cache, querying PubChem on a miss.
"""
from __future__ import annotations

import gzip
from pathlib import Path
from typing import FrozenSet, Optional, Union, Set

import orjson

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict


from mandos import logger
from mandos.model.settings import MANDOS_SETTINGS
from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi


class CachingPubchemApi(PubchemApi):
    """
    PubChem API that stores fetched data on disk and reuses it on later calls.

    Layout under ``cache_dir``, as built by the ``*_path`` methods:

    - ``data/<cid>.json.gz``: gzip-compressed JSON for one compound
    - ``links/<inchikey_or_cid>.txt``: text file containing the CID whose data file to read
    - ``similarity/<inchi>_<percent><suffix>``: table of CIDs found by a similarity search
    """

    def __init__(
        self,
        query: Optional[QueryingPubchemApi],
        cache_dir: Path = MANDOS_SETTINGS.pubchem_cache_path,
    ):
        """
        Args:
            query: API used for anything that is not cached yet;
                   if ``None``, only already-cached data can be served
            cache_dir: Root directory of the cache
        """
        self._cache_dir = cache_dir
        self._query = query

    def follow_link(self, inchikey_or_cid: Union[int, str]) -> Optional[Path]:
        """
        Resolves an identifier to the path of the cached data file it links to.

        Returns:
            The data path for the CID stored in the link file,
            or ``None`` if the link file is missing or empty
        """
        link = self.link_path(inchikey_or_cid)
        try:
            cid = link.read_text(encoding="utf8").strip()
        except FileNotFoundError:
            # A missing link is an ordinary cache miss; fetch_data() relies on None to download
            return None
        if not cid:
            return None
        return self.data_path(int(cid))

    def get_links(self, cid: int) -> Set[Path]:
        """
        Collects the link paths that should point to the cached data for ``cid``.

        The result covers the compound's own inchikey and CID, its siblings,
        and the siblings of every sibling that is itself already cached.

        Raises:
            PubchemCompoundLookupError: If the cached data file for ``cid`` is empty
        """
        data = self._read_json(self.data_path(cid))
        if data is None:
            raise PubchemCompoundLookupError(f"No cached data for CID {cid}")
        # NOTE(review): assumes data.siblings is an iterable of identifiers accepted by link_path
        direct_siblings = set(data.siblings)
        # Accumulate into a second set: growing a set while iterating over it raises RuntimeError
        siblings = set(direct_siblings)
        for sibling in direct_siblings:
            sibling_path = self.follow_link(sibling)
            if sibling_path is None or not sibling_path.exists():
                continue
            sibling_data = self._read_json(sibling_path)
            if sibling_data is not None:
                siblings.update(sibling_data.siblings)
        links = {data.inchikey, data.cid, *siblings}
        return {self.link_path(link) for link in links}

    def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
        """
        Returns the data for a compound, from the cache if possible, else by downloading.

        Raises:
            PubchemCompoundLookupError: If the compound is not cached and no query API was given
        """
        followed = self.follow_link(inchikey_or_cid)
        if followed is not None:
            logger.debug("Found cached PubChem data")
            return self._read_json(followed)
        return self._download(inchikey_or_cid)

    def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
        """
        Fetches a compound through the query API, then writes its data file and link files.

        Raises:
            PubchemCompoundLookupError: If no query API was given
        """
        if self._query is None:
            raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
        data: PubchemData = self._query.fetch_data(inchikey_or_cid)
        cid = data.parent_or_self
        path = self.data_path(cid)
        self._write_json(data.to_json(), path)
        # The queried identifier must become a link *path* like the others, not stay a raw str/int
        links = {self.link_path(inchikey_or_cid), *self.get_links(cid)}
        for link in links:
            if not link.exists():
                link.parent.mkdir(parents=True, exist_ok=True)
                link.write_text(str(cid), encoding="utf8")
        logger.debug(f"Wrote PubChem data to {path.absolute()}")
        return data

    def link_path(self, inchikey_or_cid: Union[int, str]) -> Path:
        """
        Returns the path of the link file for an inchikey or CID.
        """
        return self._cache_dir / "links" / f"{inchikey_or_cid}.txt"

    def data_path(self, cid: int) -> Path:
        """
        Returns the path of the gzipped JSON data file for a CID.
        """
        return self._cache_dir / "data" / f"{cid}.json.gz"

    def similarity_path(self, inchi: str, min_tc: float) -> Path:
        """
        Returns the path of the cached similarity-search result for ``inchi`` at ``min_tc``.

        Raises:
            ValueError: If ``min_tc`` is not a whole number of percent
        """
        scaled = min_tc * 100
        percent = round(scaled)
        # Compare with a tolerance: e.g. 0.29 * 100 == 28.999999999999996 in floating point
        if abs(scaled - percent) > 1e-9:
            raise ValueError(f"min_tc {min_tc} is not an increment of 1%")
        path = self._cache_dir / "similarity" / f"{inchi}_{percent}"
        return path.with_suffix(MANDOS_SETTINGS.archive_filename_suffix)

    def _write_json(self, encoded: str, path: Path) -> None:
        """
        Writes already-encoded JSON text to ``path`` as gzip-compressed UTF-8.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(gzip.compress(encoded.encode(encoding="utf8")))

    def _read_json(self, path: Path) -> Optional[PubchemData]:
        """
        Reads a gzip-compressed JSON file written by ``_write_json``.

        Returns:
            The parsed data, or ``None`` if the decoded JSON is empty
        """
        deflated = gzip.decompress(path.read_bytes())
        read = orjson.loads(deflated)
        return PubchemData(NestedDotDict(read)) if len(read) > 0 else None

    def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]:
        """
        Returns the CIDs of compounds similar to ``inchi``, using a cached result if present.

        Raises:
            ValueError: If ``min_tc`` is not a whole number of percent
            PubchemCompoundLookupError: If the result is not cached and no query API was given
        """
        path = self.similarity_path(inchi, min_tc)
        if path.exists():
            df = pd.read_csv(path)
            return frozenset(int(cid) for cid in df["cid"])
        if self._query is None:
            raise PubchemCompoundLookupError(f"Similar compounds for {inchi} not cached")
        found = frozenset(int(cid) for cid in self._query.find_similar_compounds(inchi, min_tc))
        # Build the column explicitly so an empty result still has a "cid" column
        df = pd.DataFrame({"cid": sorted(found)})
        path.parent.mkdir(parents=True, exist_ok=True)
        # NOTE(review): stored as CSV regardless of the archive suffix -- TODO confirm the intended format
        df.to_csv(path, index=False)
        return found


# Names exported by ``from <this module> import *``.
__all__ = ["CachingPubchemApi"]


1			"""
2			PubChem querying API.
3			"""
4			from __future__ import annotations
5
6			import gzip
7			from pathlib import Path
8			from typing import FrozenSet, Optional, Union, Set
9
10			import orjson
			0 ignored issues – show introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report Unable to import 'orjson' Loading history...
11			import pandas as pd
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
12			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
13
14			from mandos import logger
15			from mandos.model.settings import MANDOS_SETTINGS
16			from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
17			from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
18			from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi
19
20
21			class CachingPubchemApi(PubchemApi):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
22			def __init__(
23			self,
			0 ignored issues – show Coding Style introduced 2021-08-07 00:15 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
24			query: Optional[QueryingPubchemApi],
			0 ignored issues – show Coding Style introduced 2021-08-07 00:15 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
25			cache_dir: Path = MANDOS_SETTINGS.pubchem_cache_path,
			0 ignored issues – show Coding Style introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
26			):
27			self._cache_dir = cache_dir
28			self._query = query
29
30			def follow_link(self, inchikey_or_cid: Union[int, str]) -> Optional[Path]:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
31			link = self.link_path(inchikey_or_cid)
32			cid = link.read_text(encoding="utf8").strip()
33			if len(cid) == 0:
34			return None
35			return self.data_path(int(cid))
36
37			def get_links(self, cid: int) -> Set[Path]:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
38			data = self._read_json(self.data_path(cid))
39			siblings = set(data.siblings)
40			for sibling in siblings:
41			sibling_path = self.follow_link(sibling)
42			if sibling_path is not None:
43			inchikey_siblings = self._read_json(sibling_path).siblings
44			siblings.update(inchikey_siblings)
45			links = {data.inchikey, data.cid, *siblings}
46			return {self.link_path(link) for link in links}
47
48			def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
			0 ignored issues – show Bug introduced 2021-08-16 03:00 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'fetch_data' method Loading history...
49			followed = self.follow_link(inchikey_or_cid)
50			if followed is not None:
51			logger.debug(f"Found cached PubChem data")
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
52			return self._read_json(followed)
53			return self._download(inchikey_or_cid)
54
55			def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
56			if self._query is None:
57			raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
58			data: PubchemData = self._query.fetch_data(inchikey_or_cid)
59			cid = data.parent_or_self
60			path = self.data_path(cid)
61			self._write_json(data.to_json(), path)
62			links = {inchikey_or_cid, *self.get_links(cid)}
63			for link in links:
64			if not link.exists():
65			link.write_text(str(cid), encoding="utf8")
66			logger.debug(f"Wrote PubChem data to {path.absolute()}")
67			return data
68
69			def link_path(self, inchikey_or_cid: Union[int, str]) -> Path:
			0 ignored issues – show introduced 2021-08-07 00:15 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
70			return self._cache_dir / "links" / f"{inchikey_or_cid}.txt"
71
72			def data_path(self, cid: int) -> Path:
			0 ignored issues – show introduced 2021-08-07 00:15 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
73			return self._cache_dir / "data" / f"{cid}.json.gz"
74
75			def similarity_path(self, inchi: str, min_tc: float) -> Path:
			0 ignored issues – show introduced 2021-08-16 03:00 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
76			if not (min_tc * 100).is_integer():
77			raise ValueError(f"min_tc {min_tc} is not an increment of 1%")
78			percent = int(min_tc * 100)
79			path = self._cache_dir / "similarity" / f"{inchi}_{percent}"
80			return path.with_suffix(MANDOS_SETTINGS.archive_filename_suffix)
81
82			def _write_json(self, encoded: str, path: Path) -> None:
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
83			path.parent.mkdir(parents=True, exist_ok=True)
84			path.write_bytes(gzip.compress(encoded.encode(encoding="utf8")))
85
86			def _read_json(self, path: Path) -> Optional[PubchemData]:
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
87			deflated = gzip.decompress(path.read_bytes())
88			read = orjson.loads(deflated)
89			return PubchemData(NestedDotDict(read)) if len(read) > 0 else None
90
91			def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]:
92			path = self.similarity_path(inchi, min_tc)
93			if path.exists():
94			df = pd.read_file(path)
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
95			return frozenset(set(df["cid"].values))
96			found = self._query.find_similar_compounds(inchi, min_tc)
97			df = pd.DataFrame([pd.Series(dict(cid=cid)) for cid in found])
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
98			path.parent.mkdir(parents=True, exist_ok=True)
99			df.write_file(path)
100			return frozenset(set(df["cid"].values))
101
102
103			__all__ = ["CachingPubchemApi"]
104

dmyersturnbull / mandos

Push — main ( 5006f2...cee75c )

mandos.model.apis.caching_pubchem_api.CachingPubchemApi._add_cid() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like