mandos.model.caching_pubchem_api.CachingPubchemApi.fetch_data() - Code Metrics - Inspection of "feat: split search and entries, api implementation..." - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( d08a4e...a07aa0 )

by Douglas

created 2021-03-22 20:03 UTC

CachingPubchemApi.fetch_data() A

↳ Parent: mandos.model.caching_pubchem_api

Complexity

Conditions

Size

Total Lines	16
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	15
nop	2
dl	0
loc	16
rs	9.65
c	0
b	0
f	0

"""
PubChem querying API.
"""
from __future__ import annotations

import abc

import logging
from pathlib import Path
from typing import Optional, Sequence, Union, FrozenSet, Mapping


import io

import gzip
import orjson

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict


from mandos.model.pubchem_api import PubchemCompoundLookupError, PubchemApi
from mandos.model.pubchem_support.pubchem_data import PubchemData
from mandos.model.querying_pubchem_api import QueryingPubchemApi

# Module-level logger; every message emitted here goes to the logger named "mandos".
logger = logging.getLogger("mandos")


class CachingPubchemApi(PubchemApi):
    """
    PubChem API backed by an on-disk cache.

    Compound data is stored as JSON under ``<cache_dir>/data`` and similarity
    search results as tab-separated tables under ``<cache_dir>/similarity``.
    On a cache miss the request is delegated to ``querier`` (if one was given)
    and the result is written to the cache before being returned.
    """

    # Columns of the cached similarity tables: the compound ID that was found
    # and the ``min_tc`` threshold of the query that found it.
    _SIMILARITY_COLUMNS = ("cid", "min_tc")

    def __init__(
        self, cache_dir: Path, querier: Optional[QueryingPubchemApi], compress: bool = True
    ):
        """
        Args:
            cache_dir: Root directory of the cache.
            querier: API used on cache misses; ``None`` makes the cache read-only.
            compress: If true, cache files are gzip-compressed (``.gz`` suffix).
        """
        self._cache_dir = cache_dir
        self._querier = querier
        self._compress = compress

    def fetch_data(self, inchikey: str) -> Optional[PubchemData]:
        """
        Return the PubChem data for ``inchikey``, from the cache if present.

        Raises:
            PubchemCompoundLookupError: If the data is not cached and no querier is set.
        """
        path = self.data_path(inchikey)
        if path.exists():
            logger.info("Found cached PubChem data at %s", path.absolute())
            return PubchemData(self._read_json(path))
        if self._querier is None:
            raise PubchemCompoundLookupError(f"Key {inchikey} not found cached at {path}")
        logger.info("Downloading PubChem data for %s ...", inchikey)
        data = self._querier.fetch_data(inchikey)
        path.parent.mkdir(parents=True, exist_ok=True)
        self._write_json(data.to_json(), path)
        logger.info("Wrote PubChem data to %s", path.absolute())
        return data

    def data_path(self, inchikey: str) -> Path:
        """Return the cache file path for the compound data of ``inchikey``."""
        ext = ".json.gz" if self._compress else ".json"
        return self._cache_dir / "data" / f"{inchikey}{ext}"

    def similarity_path(self, inchikey: Union[int, str]) -> Path:
        """Return the cache file path for the similarity table of ``inchikey``."""
        ext = ".tab.gz" if self._compress else ".tab"
        return self._cache_dir / "similarity" / f"{inchikey}{ext}"

    def _write_json(self, encoded: str, path: Path) -> None:
        """Write an already-encoded JSON string to ``path``, gzipped if compressing."""
        if self._compress:
            path.write_bytes(gzip.compress(encoded.encode(encoding="utf8")))
        else:
            path.write_text(encoded, encoding="utf8")

    def _read_json(self, path: Path) -> NestedDotDict:
        """Read a JSON file written by ``_write_json`` and wrap it in a ``NestedDotDict``."""
        if self._compress:
            deflated = gzip.decompress(path.read_bytes())
            read = orjson.loads(deflated)
        else:
            read = orjson.loads(path.read_text(encoding="utf8"))
        return NestedDotDict(read)

    def _read_similarity(self, path: Path) -> pd.DataFrame:
        """
        Load the cached similarity table at ``path``.

        Always returns a frame with exactly the ``cid`` and ``min_tc`` columns;
        a missing, empty, or malformed file yields an empty frame. Stray index
        columns left behind by earlier writes are discarded.
        """
        columns = list(self._SIMILARITY_COLUMNS)
        if not path.exists():
            return pd.DataFrame(columns=columns)
        try:
            # Compression is inferred by pandas from the ".gz" suffix.
            table = pd.read_csv(path, sep="\t")
        except pd.errors.EmptyDataError:
            return pd.DataFrame(columns=columns)
        # reindex tolerates files lacking the expected columns; dropna removes
        # the all-NaN rows that such files would produce.
        return table.reindex(columns=columns).dropna()

    def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]:
        """
        Return the IDs of compounds similar to ``inchi`` at threshold ``min_tc``.

        Cached rows recorded by a query whose threshold was at most ``min_tc``
        are reused. Only the query threshold is stored per row (not each
        compound's own similarity), so a cache hit from a looser query may
        include compounds below ``min_tc``.

        Raises:
            PubchemCompoundLookupError: If nothing usable is cached and no querier is set.
        """
        path = self.similarity_path(inchi)
        cached = self._read_similarity(path)
        # "<=" so that repeating the exact same query is served from the cache.
        usable = cached[cached["min_tc"] <= min_tc]
        if len(usable) > 0:
            # Older cache files may hold float-valued IDs; normalize to int.
            return frozenset(int(cid) for cid in usable["cid"])
        if self._querier is None:
            raise PubchemCompoundLookupError(
                f"Similarity for {inchi} not found cached at {path}"
            )
        found = self._querier.find_similar_compounds(inchi, min_tc)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit columns keep the header intact even when nothing was found.
        # NOTE: an empty result leaves no rows, so it cannot be served from the
        # cache later and will be queried again.
        new_rows = pd.DataFrame(
            [(cid, min_tc) for cid in found], columns=list(self._SIMILARITY_COLUMNS)
        )
        # Merge with the full cached table (not the filtered view) so rows
        # recorded for other thresholds are preserved.
        frames = [frame for frame in (cached, new_rows) if not frame.empty]
        merged = pd.concat(frames, ignore_index=True) if frames else new_rows
        # index=False prevents an extra "Unnamed" column on every round-trip.
        merged.drop_duplicates().to_csv(path, sep="\t", index=False)
        return frozenset(found)


# Names exported by `from <this module> import *`: only the caching API class.
__all__ = ["CachingPubchemApi"]


1			"""
2			PubChem querying API.
3			"""
4			from __future__ import annotations
5
6			import abc
			0 ignored issues – show Unused Code introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report The import `abc` seems to be unused. Loading history...
7			import logging
8			from pathlib import Path
9			from typing import Optional, Sequence, Union, FrozenSet, Mapping
			0 ignored issues – show Unused Code introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report Unused Mapping imported from typing Loading history... Unused Code introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report Unused Sequence imported from typing Loading history...
10
11			import io
			0 ignored issues – show Unused Code introduced 2021-03-22 20:05 UTC by Report Bug Copy Issue Report The import `io` seems to be unused. Loading history...
12			import gzip
13			import orjson
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'orjson' Loading history...
14			import pandas as pd
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
15			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
16
17			from mandos.model.pubchem_api import PubchemCompoundLookupError, PubchemApi
18			from mandos.model.pubchem_support.pubchem_data import PubchemData
19			from mandos.model.querying_pubchem_api import QueryingPubchemApi
20
21			logger = logging.getLogger("mandos")
22
23
24			class CachingPubchemApi(PubchemApi):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
25			def __init__(
26			self, cache_dir: Path, querier: Optional[QueryingPubchemApi], compress: bool = True
			0 ignored issues – show Coding Style introduced 2021-02-01 07:03 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
27			):
28			self._cache_dir = cache_dir
29			self._querier = querier
30			self._compress = compress
31
32			def fetch_data(self, inchikey: str) -> Optional[PubchemData]:
33			path = self.data_path(inchikey)
34			if path.exists():
35			logger.info(f"Found cached PubChem data at {path.absolute()}")
			0 ignored issues – show introduced 2021-02-01 07:03 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
36			elif self._querier is None:
37			raise PubchemCompoundLookupError(f"Key {inchikey} not found cached at {path}")
38			else:
39			logger.info(f"Downloading PubChem data for {inchikey} ...")
			0 ignored issues – show introduced 2021-02-01 07:03 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
40			data = self._querier.fetch_data(inchikey)
41			path.parent.mkdir(parents=True, exist_ok=True)
42			encoded = data.to_json()
43			self._write_json(encoded, path)
44			logger.info(f"Wrote PubChem data to {path.absolute()}")
			0 ignored issues – show introduced 2021-02-01 07:03 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
45			return data
46			read = self._read_json(path)
47			return PubchemData(read)
48
49			def data_path(self, inchikey: str):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
50			ext = ".json.gz" if self._compress else ".json"
51			return self._cache_dir / "data" / f"{inchikey}{ext}"
52
53			def similarity_path(self, inchikey: str):
			0 ignored issues – show introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
54			ext = ".tab.gz" if self._compress else ".tab"
55			return self._cache_dir / "similarity" / f"{inchikey}{ext}"
56
57			def _write_json(self, encoded: str, path: Path) -> None:
58			if self._compress:
59			path.write_bytes(gzip.compress(encoded.encode(encoding="utf8")))
60			else:
61			path.write_text(encoded, encoding="utf8")
62
63			def _read_json(self, path: Path) -> NestedDotDict:
64			if self._compress:
65			deflated = gzip.decompress(path.read_bytes())
66			read = orjson.loads(deflated)
67			else:
68			read = orjson.loads(path.read_text(encoding="utf8"))
69			return NestedDotDict(read)
70
71			def find_similar_compounds(self, inchi: Union[int, str], min_tc: float) -> FrozenSet[int]:
72			path = self.similarity_path(inchi)
73			if not path.exists():
74			df = None
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
75			existing = set()
76			else:
77			df = pd.read_csv(path, sep="\t")
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
78			df = df[df["min_tc"] < min_tc]
			0 ignored issues – show Coding Style Naming introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
79			existing = set(df["cid"].values)
80			if len(existing) == 0:
			0 ignored issues – show unused-code introduced 2021-01-25 23:06 UTC by Report Bug Copy Issue Report Unnecessary "else" after "return" Loading history...
81			found = self._querier.find_similar_compounds(inchi, min_tc)
82			path.parent.mkdir(parents=True, exist_ok=True)
83			new_df = pd.DataFrame([pd.Series(dict(cid=cid, min_tc=min_tc)) for cid in found])
84			if df is not None:
85			new_df = pd.concat([df, new_df])
86			new_df.to_csv(path, sep="\t")
87			return frozenset(existing.union(found))
88			else:
89			return frozenset(existing)
90
91
92			__all__ = ["CachingPubchemApi"]
93

dmyersturnbull / mandos

Push — main ( d08a4e...a07aa0 )

CachingPubchemApi.fetch_data() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like