mandos.entries.searcher.Searcher._output_path_of() - Code Metrics - Inspection of "Bump flake8-bugbear from 20.11.1 to 21.3.2" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — dependabot/pip/flake8-bugbear-... ( 16d864...b4f9fc )

unknown

created 2021-03-31 03:48 UTC

mandos.entries.searcher.Searcher._output_path_of() A

↳ Parent: mandos.entries.searcher

Complexity

Conditions

Size

Total Lines	7
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	6
nop	3
dl	0
loc	7
rs	10
c	0
b	0
f	0

"""
Run searches and write files.
"""

from __future__ import annotations

import gzip

from pathlib import Path
from typing import Sequence, Optional, Dict

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.tools.common_tools import CommonTools

from pocketutils.tools.path_tools import PathTools

from typeddfs import TypedDfs


from mandos import logger
from mandos.model import CompoundNotFoundError
from mandos.model.chembl_support.chembl_utils import ChemblUtils
from mandos.model.searches import Search
from mandos.model.settings import MANDOS_SETTINGS
from mandos.search.chembl import ChemblSearch
from mandos.search.pubchem import PubchemSearch
from mandos.entries.api_singletons import Apis

# Frame type for the user-supplied compound list; the only column it requires is ``inchikey``.
InputFrame = TypedDfs.typed("InputFrame").require("inchikey").build()

# Frame type for the ID-matching table: one row per InChI Key alongside its ChEMBL and PubChem IDs.
# NOTE(review): ``.strict()`` presumably rejects columns other than the required ones -- confirm
# against the typeddfs documentation.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .require("inchikey")
    .require("chembl_id")
    .require("pubchem_id")
    .strict()
    .build()
)


class SearcherUtils:
    """
    Helpers used by ``Searcher``: matching InChI Keys to database IDs (with an on-disk cache)
    and reading the input file of compounds.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
    ) -> IdMatchFrame:
        """
        Looks up the PubChem and/or ChEMBL ID for each InChI Key and caches the resulting table.

        Compounds that are not found are logged at error level and left without an ID;
        they do not abort the lookup.

        Args:
            inchikeys: The InChI Keys to look up
            pubchem: Whether to query PubChem
            chembl: Whether to query ChEMBL
            hmdb: Currently unused; kept so that existing callers keep working

        Returns:
            A frame with the columns ``inchikey``, ``chembl_id``, and ``pubchem_id``,
            one row per input key; IDs that were not found or not requested are missing
        """
        # Function-scope import so this block does not depend on the module's import section.
        import hashlib

        # Why cache here even though the underlying APIs already cache:
        # a Searcher fetches the compounds once, before the FIRST search, so that the typer
        # commands mirror the ``Entry.run`` methods. With several searches (as in
        # ``mandos search --config``) that fetch should happen only at the beginning.
        # The alternative -- having ``mandos search`` dynamically subclass each ``Entry`` --
        # was much harder; this is cleaner, though redundant.
        # TODO: if the cached results under /pubchem and /chembl are deleted, the cached /match
        # TODO: frames must be deleted too, or the per-compound results are not re-cached here.
        # noinspection PyPep8Naming
        Chembl, Pubchem = Apis.Chembl, Apis.Pubchem
        logger.notice(f"Using {Chembl}, {Pubchem}")
        keys = list(inchikeys)
        # The built-in hash() of a str is salted per interpreter process, so it cannot name a file
        # that must be found again on a later run; use a stable digest instead.
        # The flags are part of the signature so that a table cached with one source disabled
        # is never returned when that source is requested.
        signature = f"pubchem={pubchem};chembl={chembl};" + ",".join(keys)
        digest = hashlib.sha256(signature.encode("utf-8")).hexdigest()
        cached_path = (MANDOS_SETTINGS.match_cache_path / digest).with_suffix(".feather")
        if cached_path.exists():
            logger.info(f"Found ID matching results at {cached_path}")
            return IdMatchFrame.read_feather(cached_path)
        found_chembl: Dict[str, str] = {}
        found_pubchem: Dict[str, str] = {}
        if pubchem:
            for inchikey in keys:
                try:
                    cid = Pubchem.fetch_data(inchikey).cid
                    found_pubchem[inchikey] = str(cid)
                    logger.info(f"Found:      PubChem {inchikey} ({cid})")
                except CompoundNotFoundError:
                    logger.error(f"NOT FOUND: PubChem {inchikey}")
                    logger.debug(f"Did not find PubChem {inchikey}", exc_info=True)
        if chembl:
            for inchikey in keys:
                try:
                    chid = ChemblUtils(Chembl).get_compound(inchikey).chid
                    found_chembl[inchikey] = chid
                    logger.info(f"Found:      ChEMBL {inchikey} ({chid})")
                except CompoundNotFoundError:
                    logger.error(f"NOT FOUND: ChEMBL {inchikey}")
                    logger.debug(f"Did not find ChEMBL {inchikey}", exc_info=True)
        # Building from a dict guarantees the ``inchikey`` column exists even for empty input.
        df = pd.DataFrame({"inchikey": keys})
        df["chembl_id"] = df["inchikey"].map(found_chembl.get)
        df["pubchem_id"] = df["inchikey"].map(found_pubchem.get)
        df = IdMatchFrame(df)
        # The cache directory may not exist yet on a first run.
        cached_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_feather(cached_path)
        logger.info(f"Wrote {cached_path}")
        return df

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input file of compounds.

        Args:
            input_path: A table containing a column called ``inchikey``,
                        or a ``.lines``/``.txt`` file whose single column is taken as the InChI Keys

        Returns:
            The compounds as an ``InputFrame``

        Raises:
            ValueError: If there is no ``inchikey`` column and the file is not ``.lines``/``.txt``
        """
        df = TypedDfs.untyped("Input").read_file(input_path, header=None, comment="#")
        # ``columns`` is the standard pandas attribute; the former ``columns_names`` is not one.
        if "inchikey" in df.columns:
            return InputFrame.convert(df)
        if ".lines" in input_path.name or ".txt" in input_path.name:
            # plain list of keys: name the single column
            df.columns = ["inchikey"]
            return InputFrame.convert(df)
        raise ValueError(f"Could not parse {input_path}; no column 'inchikey'")


class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(
        self, searches: Sequence[Search], to: Sequence[Optional[Path]], input_path: Path
    ):
        """
        Constructor.

        Args:
            searches: The searches to run, in order
            to: One output location per search, paired with ``searches`` by position:
                - ``None`` to write to the default path (see ``_default_path_of``)
                - a value starting with ``.`` to use the default path with that suffix
                - anything else is used as the output path as-is
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey

        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until ``search`` has been called; used to enforce single use
        self.input_df: Optional[InputFrame] = None
        # ``input_path`` must already be set here because default paths are derived from it
        self.output_paths = {
            what.key: self._output_path_of(what, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the results as CSV to its output path
        and a metadata file alongside it with the suffix ``.json``.

        Returns:
            This instance

        Raises:
            ValueError: If ``search`` was already called on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        for what in self.what:
            output_path = self.output_paths[what.key]
            df = what.find_to_df(inchikeys)
            # TODO keep any other columns in input_df
            df.to_csv(output_path)
            params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
            metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
            metadata.write_json(output_path.with_suffix(".json"))
            logger.notice(f"Wrote {what.key} to {output_path}")
        return self

    def _output_path_of(self, what: Search, to: Optional[Path]) -> Path:
        """
        Resolves where the results of ``what`` are written.

        Args:
            what: The search whose output path is needed
            to: ``None`` for the default path, a value starting with ``.`` for the default path
                with that suffix, or an explicit path that is returned unchanged
        """
        if to is None:
            return self._default_path_of(what)
        if str(to).startswith("."):
            return self._default_path_of(what).with_suffix(str(to))
        return to

    def _default_path_of(self, what: Search) -> Path:
        """
        Returns ``<input dir>/<input stem>-output/<sanitized search key>.csv``,
        creating the ``-output`` directory if it does not exist.
        """
        parent = self.input_path.parent / (self.input_path.stem + "-output")
        parent.mkdir(exist_ok=True)
        child = what.key + ".csv"
        node = PathTools.sanitize_path_node(child)
        # let the user know when the key had to be altered to form a valid filename
        if (parent / node).resolve() != (parent / child).resolve():
            logger.debug(f"Path {child} sanitized to {node}")
        return parent / node


# Names exported by ``from mandos.entries.searcher import *``.
# NOTE(review): ``InputFrame`` is defined in this module but not listed here -- confirm that is intended.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]


1			"""
2			Run searches and write files.
3			"""
4
5			from __future__ import annotations
6
7			import gzip
			0 ignored issues – show Unused Code introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report The import `gzip` seems to be unused. Loading history...
8			from pathlib import Path
9			from typing import Sequence, Optional, Dict
10
11			import pandas as pd
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
12			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
13			from pocketutils.tools.common_tools import CommonTools
			0 ignored issues – show introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.tools.common_tools' Loading history...
14			from pocketutils.tools.path_tools import PathTools
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.tools.path_tools' Loading history...
15			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
16
17			from mandos import logger
18			from mandos.model import CompoundNotFoundError
19			from mandos.model.chembl_support.chembl_utils import ChemblUtils
20			from mandos.model.searches import Search
21			from mandos.model.settings import MANDOS_SETTINGS
22			from mandos.search.chembl import ChemblSearch
23			from mandos.search.pubchem import PubchemSearch
24			from mandos.entries.api_singletons import Apis
25
26			InputFrame = (TypedDfs.typed("InputFrame").require("inchikey")).build()
27
28			IdMatchFrame = (
29			TypedDfs.typed("IdMatchFrame")
30			.require("inchikey")
31			.require("chembl_id")
32			.require("pubchem_id")
33			.strict()
34			).build()
35
36
37			class SearcherUtils:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
38			@classmethod
39			def dl(
			0 ignored issues – show Coding Style Naming introduced 2021-03-24 04:56 UTC by Report Bug Copy Issue Report Method name "dl" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... introduced 2021-03-24 04:56 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history... Unused Code introduced 2021-03-24 04:56 UTC by Report Bug Copy Issue Report Either all return statements in a function should return an expression, or none of them should. Loading history...
40			cls,
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
41			inchikeys: Sequence[str],
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
42			pubchem: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
43			chembl: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
44			hmdb: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history... Unused Code introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report The argument `hmdb` seems to be unused. Loading history...
45			) -> IdMatchFrame:
46			# we actually cache the results, even though the underlying APIs cache
47			# the reasons for this are a little obscure --
48			# when running a Searcher, we want to run before the FIRST search
49			# for the typer commands to be replicas of the ``Entry.run`` methods, Searcher fetches before running a search
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (118/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
50			# but if we have multiple searches (as in ``mandos search --config``), we only want that at the beginning
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (113/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
51			# the alternative was having ``mandos search`` dynamically subclass each ``Entry`` -- which was really hard
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (115/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
52			# this is much cleaner, even though it's redundant
53			# if the cached results under /pubchem and /chembl are deleted, we unfortunately won't cache the results
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (112/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
54			# when running this command
55			# to fix that, we need to delete the cached /match dataframes
56			# now that I'm writing this down, I realize this is pretty bad
57			# TODO
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
58			# noinspection PyPep8Naming
59			Chembl, Pubchem = Apis.Chembl, Apis.Pubchem
			0 ignored issues – show Coding Style Naming introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Variable name "Chembl" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Variable name "Pubchem" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
60			logger.notice(f"Using {Chembl}, {Pubchem}")
61			key = hash(",".join(inchikeys))
62			cached_path = (MANDOS_SETTINGS.match_cache_path / str(key)).with_suffix(".feather")
63			if cached_path.exists():
64			logger.info(f"Found ID matching results at {cached_path}")
65			return IdMatchFrame.read_feather(cached_path)
66			found_chembl: Dict[str, str] = {}
67			found_pubchem: Dict[str, str] = {}
68			if pubchem:
69			for inchikey in inchikeys:
70			try:
71			cid = Pubchem.fetch_data(inchikey).cid
72			found_pubchem[inchikey] = str(cid)
73			logger.info(f"Found: PubChem {inchikey} ({cid})")
74			except CompoundNotFoundError:
75			logger.error(f"NOT FOUND: PubChem {inchikey}")
76			logger.debug(f"Did not find PubChem {inchikey}", exc_info=True)
77			if chembl:
78			for inchikey in inchikeys:
79			try:
80			chid = ChemblUtils(Chembl).get_compound(inchikey).chid
81			found_chembl[inchikey] = chid
82			logger.info(f"Found: ChEMBL {inchikey} ({chid})")
83			except CompoundNotFoundError:
84			logger.error(f"NOT FOUND: ChEMBL {inchikey}")
85			logger.debug(f"Did not find ChEMBL {inchikey}", exc_info=True)
86			df = pd.DataFrame([pd.Series(dict(inchikey=c)) for c in inchikeys])
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
87			df["chembl_id"] = df["inchikey"].map(found_chembl.get)
88			df["pubchem_id"] = df["inchikey"].map(found_pubchem.get)
89			df = IdMatchFrame(df)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
90			df.to_feather(cached_path)
91			logger.info(f"Wrote {cached_path}")
92
93			@classmethod
94			def read(cls, input_path: Path) -> InputFrame:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
95			df = TypedDfs.untyped("Input").read_file(input_path, header=None, comment="#")
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
96			if "inchikey" in df.columns_names:
			0 ignored issues – show unused-code introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unnecessary "elif" after "return" Loading history...
97			return InputFrame.convert(df)
98			elif ".lines" in input_path.name or ".txt" in input_path.name:
99			df.columns = ["inchikey"]
100			return InputFrame.convert(df)
101			raise ValueError(f"Could not parse {input_path}; no column 'inchikey'")
102
103
104			class Searcher:
105			"""
106			Executes one or more searches and saves the results to CSV files.
107			Create and use once.
108			"""
109
110			def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
111			"""
112			Constructor.
113
114			Args:
115			searches:
116			input_path: Path to the input file of one of the formats:
117			- .txt containing one InChI Key per line
118			- .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
119			"""
120			self.what = searches
121			self.input_path: Optional[Path] = input_path
122			self.input_df: InputFrame = None
123			self.output_paths = {
124			what.key: self._output_path_of(path, path)
125			for what, path in CommonTools.zip_list(searches, to)
126			}
127			if str(to).startswith("."):
128			pass
129
130			def search(self) -> Searcher:
131			"""
132			Performs the search, and writes data.
133			"""
134			if self.input_df is not None:
135			raise ValueError(f"Already ran a search")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
136			self.input_df = SearcherUtils.read(self.input_path)
137			inchikeys = self.input_df["inchikey"].unique()
138			has_pubchem = any((isinstance(what, PubchemSearch) for what in self.what))
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report The variable `what` does not seem to be defined. Loading history...
139			has_chembl = any((isinstance(what, ChemblSearch) for what in self.what))
140			# find the compounds first so the user knows what's missing before proceeding
141			SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
142			for what in self.what:
143			output_path = self.output_paths[what.key]
144			df = what.find_to_df(inchikeys)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
145			# TODO keep any other columns in input_df
			0 ignored issues – show Coding Style introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
146			df.to_csv(output_path)
147			params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
148			metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
149			metadata.write_json(output_path.with_suffix(".json"))
150			logger.notice(f"Wrote {what.key} to {output_path}")
151			return self
152
153			def _output_path_of(self, what: Search, to: Optional[Path]) -> Path:
			0 ignored issues – show Coding Style Naming introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Argument name "to" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
154			if to is None:
			0 ignored issues – show unused-code introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unnecessary "elif" after "return" Loading history...
155			return self._default_path_of(what)
156			elif str(to).startswith("."):
157			return self._default_path_of(what).with_suffix(str(to))
158			else:
159			return to
160
161			def _default_path_of(self, what: Search) -> Path:
162			parent = self.input_path.parent / (self.input_path.stem + "-output")
163			parent.mkdir(exist_ok=True)
164			child = what.key + ".csv"
165			node = PathTools.sanitize_path_node(child)
166			if (parent / node).resolve() != (parent / child).resolve():
167			logger.debug(f"Path {child} sanitized to {node}")
168			return parent / node
169
170
171			__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]
172

dmyersturnbull / mandos

Push — dependabot/pip/flake8-bugbear-... ( 16d864...b4f9fc )

mandos.entries.searcher.Searcher._output_path_of() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like