mandos.model.apis.caching_pubchem_api - Code Metrics - Inspection of "fix: various" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( 65730f...fad324 )

by Douglas

created 2021-10-28 02:08 UTC

mandos.model.apis.caching_pubchem_api A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	112
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	91
dl	0
loc	112
rs	10
c	0
b	0
f	0
wmc	17

8 Methods

Rating	Name	Size	Complexity
A	CachingPubchemApi._download()	26	4
A	CachingPubchemApi.find_similar_compounds()	12	2
A	CachingPubchemApi.data_path()	2	1
A	CachingPubchemApi.fetch_data()	12	3
A	CachingPubchemApi.similarity_path()	6	2
A	CachingPubchemApi.__init__()	7	1
A	CachingPubchemApi._read_json()	4	2
A	CachingPubchemApi._write_siblings()	9	2

"""
PubChem caching API.
"""
from __future__ import annotations

import gzip
import os

from pathlib import Path
from typing import FrozenSet, Optional, Union

import orjson

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.core.exceptions import XValueError


from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi
from mandos.model.settings import SETTINGS
from mandos.model.utils.setup import logger


class CachingPubchemApi(PubchemApi):
    """
    PubChem API that keeps results on disk and reuses them on later calls.

    Compound data is stored as gzip-compressed JSON under ``<cache_dir>/data``
    and similarity-search results under ``<cache_dir>/similarity``.
    Anything not found in the cache is delegated to the wrapped querying API, if one was given.
    """

    def __init__(
        self,
        query: Optional[QueryingPubchemApi],
        cache_dir: Path = SETTINGS.pubchem_cache_path,
    ):
        """
        Args:
            query: API used on cache misses; if ``None``, only cached entries can be served
            cache_dir: Root directory of the on-disk cache
        """
        self._cache_dir = cache_dir
        self._query = query

    def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
        """
        Returns the PubChem data for a compound, reading the cache before querying.

        Args:
            inchikey_or_cid: InChIKey or PubChem CID used as the cache key

        Raises:
            PubchemCompoundLookupError: If an empty record is cached for this key
                                        (written after an earlier failed lookup),
                                        or if the download fails
        """
        path = self.data_path(inchikey_or_cid)
        if not path.exists():
            return self._download(inchikey_or_cid)
        logger.debug("Found cached PubChem data")
        data = self._read_json(path)
        if data is None:
            # an empty record is how _download marks a failed lookup
            raise PubchemCompoundLookupError(
                f"{inchikey_or_cid} previously not found in PubChem"
            )
        self._write_siblings(data)  # TODO: remove
        return data

    def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
        """
        Fetches data through the querying API and writes it to the cache.

        On a failed lookup, an empty record is written under the requested key
        before the error is re-raised, so later calls fail from the cache.

        Raises:
            PubchemCompoundLookupError: If there is no querying API, or the lookup fails
        """
        if self._query is None:
            raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
        try:
            data: PubchemData = self._query.fetch_data(inchikey_or_cid)
        except PubchemCompoundLookupError:
            path = self.data_path(inchikey_or_cid)
            self._write_json(path, PubchemData(NestedDotDict({})))
            logger.debug(f"Wrote empty PubChem data to {path}")
            raise
        cid = data.parent_or_self  # if there's ever a parent of a parent, this will NOT work
        path = self.data_path(cid)
        if path.exists():
            logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} exists")
            logger.caution(f"Writing over {path} for {inchikey_or_cid}")
        else:
            logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} does not exist")
        self._write_json(path, data)
        self._write_siblings(data)
        logger.debug(f"Wrote PubChem data to {path.absolute()}")
        logger.info(f"Got PubChem data for {inchikey_or_cid}")
        return data

    def _write_json(self, path: Path, data: PubchemData) -> None:
        """Writes ``data`` to ``path`` as gzip-compressed UTF-8 JSON, creating parent dirs."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(gzip.compress(data.to_json().encode(encoding="utf8")))

    def _write_siblings(self, data: PubchemData):
        """
        Hard-links every alias of a compound to its canonical cache file.

        The canonical file is the one for ``data.parent_or_self``; the aliases are
        ``data.inchikey`` plus everything in ``data.siblings``
        (presumably sibling CIDs -- confirm against ``PubchemData``).
        """
        cid = data.parent_or_self
        path = self.data_path(cid)
        # Collect the raw keys: data_path() is applied once, in the loop below.
        aliases = {data.inchikey, *data.siblings}
        for alias in aliases:
            link = self.data_path(alias)
            if link == path:
                # never unlink the canonical file itself
                continue
            link.unlink(missing_ok=True)
            # Same as the removed-in-3.12 ``path.link_to(link)``: ``link`` becomes a hard link
            os.link(path, link)
        joined = ",".join(str(alias) for alias in aliases)
        logger.debug(f"Added aliases {joined} ⇌ {cid} ({path})")

    def data_path(self, inchikey_or_cid: Union[int, str]) -> Path:
        """Returns the cache file path for the compound with this InChIKey or CID."""
        return self._cache_dir / "data" / f"{inchikey_or_cid}.json.gz"

    def _read_json(self, path: Path) -> Optional[PubchemData]:
        """Reads a gzip-compressed JSON cache file; returns ``None`` if the record is empty."""
        deflated = gzip.decompress(path.read_bytes())
        read = orjson.loads(deflated)
        if not read:
            return None
        return PubchemData(NestedDotDict(read))

    def similarity_path(self, inchi: str, min_tc: float) -> Path:
        """
        Returns the cache file path for a similarity search.

        Args:
            inchi: Query structure; used verbatim in the filename
            min_tc: Minimum Tanimoto coefficient, as a fraction in increments of 0.01

        Raises:
            XValueError: If ``min_tc`` is not a whole number of percent
        """
        scaled = min_tc * 100
        try:
            percent = round(scaled)
        except (ValueError, OverflowError):
            # NaN or infinity
            percent = None
        # Compare with a tolerance: e.g. 0.29 * 100 == 28.999999999999996 in floating point
        if percent is None or abs(scaled - percent) > 1e-9:
            raise XValueError(f"min_tc {min_tc} is not an increment of 1%")
        path = self._cache_dir / "similarity" / f"{inchi}_{percent}"
        return path.with_suffix(SETTINGS.archive_filename_suffix)

    def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]:
        """
        Returns the CIDs of compounds similar to ``inchi``, reading the cache before querying.

        Args:
            inchi: Query structure
            min_tc: Minimum Tanimoto coefficient, as a fraction in increments of 0.01

        Raises:
            XValueError: If ``min_tc`` is not a whole number of percent
            PubchemCompoundLookupError: If the search is not cached and there is no querying API
        """
        logger.debug(f"Searching for {inchi} with min TC {min_tc}")
        path = self.similarity_path(inchi, min_tc)
        # NOTE(review): stock pandas has no ``pd.read_file`` / ``DataFrame.write_file``;
        # presumably these come from a project-level extension -- confirm.
        if path.exists():
            df = pd.read_file(path)
            return frozenset(df["cid"].values)
        if self._query is None:
            # mirror the guard in _download rather than failing with an AttributeError
            raise PubchemCompoundLookupError(
                f"Similar compounds for {inchi} with min TC {min_tc} not cached"
            )
        found = self._query.find_similar_compounds(inchi, min_tc)
        # Build by column so that the "cid" column exists even when nothing was found
        df = pd.DataFrame({"cid": sorted(found)})
        path.parent.mkdir(parents=True, exist_ok=True)
        df.write_file(path)
        logger.debug(f"Wrote {len(df)} values for {inchi} with min TC {min_tc}")
        return frozenset(df["cid"].values)


# Public names of this module: only the caching client is listed.
__all__ = ["CachingPubchemApi"]


1			"""
2			PubChem caching API.
3			"""
4			from __future__ import annotations
5
6			import gzip
7			import os
			0 ignored issues – show Unused Code introduced 2021-10-28 02:14 UTC by Report Bug Copy Issue Report The import `os` seems to be unused. Loading history...
8			from pathlib import Path
9			from typing import FrozenSet, Optional, Union
10
11			import orjson
			0 ignored issues – show introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report Unable to import 'orjson' Loading history...
12			import pandas as pd
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
13			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
14			from pocketutils.core.exceptions import XValueError
			0 ignored issues – show introduced 2021-09-23 03:01 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.exceptions' Loading history...
15
16			from mandos.model.apis.pubchem_api import PubchemApi, PubchemCompoundLookupError
17			from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
18			from mandos.model.apis.querying_pubchem_api import QueryingPubchemApi
19			from mandos.model.settings import SETTINGS
20			from mandos.model.utils.setup import logger
21
22
23			class CachingPubchemApi(PubchemApi):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
24			def __init__(
25			self,
			0 ignored issues – show Coding Style introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
26			query: Optional[QueryingPubchemApi],
			0 ignored issues – show Coding Style introduced 2021-08-07 00:15 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
27			cache_dir: Path = SETTINGS.pubchem_cache_path,
			0 ignored issues – show Coding Style introduced 2021-08-07 00:15 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
28			):
29			self._cache_dir = cache_dir
30			self._query = query
31
32			def fetch_data(self, inchikey_or_cid: Union[str, int]) -> Optional[PubchemData]:
			0 ignored issues – show Bug introduced 2021-08-16 03:00 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'fetch_data' method Loading history...
33			path = self.data_path(inchikey_or_cid)
34			if path.exists():
35			logger.debug(f"Found cached PubChem data")
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
36			data = self._read_json(path)
37			if data is None:
38			raise PubchemCompoundLookupError(
39			f"{inchikey_or_cid} previously not found in PubChem"
40			)
41			self._write_siblings(data) # TODO: remove
			0 ignored issues – show Coding Style introduced 2021-10-28 02:14 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
42			return data
43			return self._download(inchikey_or_cid)
44
45			def _download(self, inchikey_or_cid: Union[int, str]) -> PubchemData:
46			if self._query is None:
47			raise PubchemCompoundLookupError(f"{inchikey_or_cid} not cached")
48			# logger.debug(f"Downloading PubChem data for {inchikey_or_cid}")
49			try:
50			data: PubchemData = self._query.fetch_data(inchikey_or_cid)
51			except PubchemCompoundLookupError:
52			data = PubchemData(NestedDotDict({}))
53			path = self.data_path(inchikey_or_cid)
54			path.parent.mkdir(parents=True, exist_ok=True)
55			path.write_bytes(gzip.compress(data.to_json().encode(encoding="utf8")))
56			logger.debug(f"Wrote empty PubChem data to {path}")
57			raise
58			cid = data.parent_or_self # if there's ever a parent of a parent, this will NOT work
59			path = self.data_path(cid)
60			if path.exists():
61			logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} exists")
62			logger.caution(f"Writing over {path} for {inchikey_or_cid}")
63			else:
64			logger.debug(f"PubChem data for {inchikey_or_cid} parent CID {cid} does not exist")
65			path.parent.mkdir(parents=True, exist_ok=True)
66			path.write_bytes(gzip.compress(data.to_json().encode(encoding="utf8")))
67			self._write_siblings(data)
68			logger.debug(f"Wrote PubChem data to {path.absolute()}")
69			logger.info(f"Got PubChem data for {inchikey_or_cid}")
70			return data
71
72			def _write_siblings(self, data: PubchemData):
73			cid = data.parent_or_self
74			path = self.data_path(cid)
75			aliases = {self.data_path(data.inchikey), *data.siblings}
76			for sibling in aliases:
77			link = self.data_path(sibling)
78			link.unlink(missing_ok=True)
79			path.link_to(link)
80			logger.debug(f"Added aliases {','.join(aliases)} ⇌ {cid} ({path})")
81
82			def data_path(self, inchikey_or_cid: Union[int, str]) -> Path:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
83			return self._cache_dir / "data" / f"{inchikey_or_cid}.json.gz"
84
85			def _read_json(self, path: Path) -> Optional[PubchemData]:
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
86			deflated = gzip.decompress(path.read_bytes())
87			read = orjson.loads(deflated)
88			return PubchemData(NestedDotDict(read)) if len(read) > 0 else None
89
90			def similarity_path(self, inchi: str, min_tc: float) -> Path:
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
91			if not (min_tc * 100).is_integer():
92			raise XValueError(f"min_tc {min_tc} is not an increment of 1%")
93			percent = int(min_tc * 100)
94			path = self._cache_dir / "similarity" / f"{inchi}_{percent}"
95			return path.with_suffix(SETTINGS.archive_filename_suffix)
96
97			def find_similar_compounds(self, inchi: str, min_tc: float) -> FrozenSet[int]:
98			logger.debug(f"Searching for {inchi} with min TC {min_tc}")
99			path = self.similarity_path(inchi, min_tc)
100			if path.exists():
101			df = pd.read_file(path)
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
102			return frozenset(set(df["cid"].values))
103			found = self._query.find_similar_compounds(inchi, min_tc)
104			df = pd.DataFrame([pd.Series(dict(cid=cid)) for cid in found])
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
105			path.parent.mkdir(parents=True, exist_ok=True)
106			df.write_file(path)
107			logger.debug(f"Wrote {len(df)} values for {inchi} with min TC {min_tc}")
108			return frozenset(set(df["cid"].values))
109
110
111			__all__ = ["CachingPubchemApi"]
112

dmyersturnbull / mandos

Push — main ( 65730f...fad324 )

mandos.model.apis.caching_pubchem_api A

Complexity

Size/Duplication

Importance

8 Methods

Duplication Side-by-Side

Filter issues like