mandos.entries.searcher.SearcherUtils._get_sep() - Code Metrics - Inspection of "feat: split search and entries, api implementation..." - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( d08a4e...a07aa0 )

by Douglas

created 2021-03-22 20:03 UTC

mandos.entries.searcher.SearcherUtils._get_sep() B

↳ Parent: mandos.entries.searcher

Complexity

Conditions

Size

Total Lines	14
Code Lines	13

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	6
eloc	13
nop	2
dl	0
loc	14
rs	8.6666
c	0
b	0
f	0

"""
Run searches and write files.
"""

from __future__ import annotations

import gzip
import logging
from pathlib import Path
from typing import Sequence, Optional, Dict

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.tools.path_tools import PathTools

from typeddfs import TypedDfs


from mandos.model import CompoundNotFoundError
from mandos.model.chembl_support.chembl_utils import ChemblUtils
from mandos.model.searches import Search
from mandos.model.settings import MANDOS_SETTINGS
from mandos.search.chembl import ChemblSearch
from mandos.search.pubchem import PubchemSearch
from mandos.entries.api_singletons import Apis

# Short module-level aliases for the API objects exposed by ``Apis``.
Chembl = Apis.Chembl
Pubchem = Apis.Pubchem

# Logger named after the enclosing package rather than this module.
logger = logging.getLogger(__package__)

# Typed dataframe class that requires the columns ``inchikey``, ``chembl_id``,
# and ``pubchem_id``; declared with ``.strict()`` before being built.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .require("inchikey")
    .require("chembl_id")
    .require("pubchem_id")
    .strict()
    .build()
)


class SearcherUtils:
    """
    Helpers for reading InChI Key input files and matching the keys to database IDs.
    """

    @classmethod
    def dl(
        cls, inchikeys: Sequence[str], pubchem: bool = True, chembl: bool = True
    ) -> "IdMatchFrame":
        """
        Matches InChI Keys to PubChem and/or ChEMBL IDs and caches the result as a Feather file.

        Compounds that raise ``CompoundNotFoundError`` are logged and left without an ID.

        Args:
            inchikeys: InChI Keys to look up
            pubchem: Whether to look up PubChem compound IDs
            chembl: Whether to look up ChEMBL IDs

        Returns:
            A frame with the columns ``inchikey``, ``chembl_id``, and ``pubchem_id``,
            either read from the cache or freshly built (and then written to the cache)
        """
        # We actually cache the results, even though the underlying APIs cache.
        # The reasons for this are a little obscure --
        # when running a Searcher, we want to run before the FIRST search.
        # For the typer commands to be replicas of the ``Entry.run`` methods,
        # Searcher fetches before running a search.
        # But if we have multiple searches (as in ``mandos search --config``),
        # we only want that at the beginning.
        # The alternative was having ``mandos search`` dynamically subclass each ``Entry``
        # -- which was really hard. This is much cleaner, even though it's redundant.
        # If the cached results under /pubchem and /chembl are deleted, we unfortunately
        # won't cache the results when running this command.
        # To fix that, we need to delete the cached /match dataframes.
        # TODO: this is pretty bad; the match cache can go stale relative to the API caches.
        import hashlib  # local import: only needed to derive the cache file name

        # The built-in ``hash()`` of a str is salted per interpreter process, so it cannot
        # name a file that a later run must find again; use a stable digest instead.
        # The flags are part of the key so that a frame cached with one lookup disabled
        # is never returned when that lookup is requested later.
        cache_key = ",".join(inchikeys) + f"|pubchem={pubchem}|chembl={chembl}"
        digest = hashlib.sha256(cache_key.encode("utf8")).hexdigest()
        cached_path = MANDOS_SETTINGS.match_cache_path / f"{digest}.feather"
        if cached_path.exists():
            logger.info("Found ID matching results at %s", cached_path)
            return IdMatchFrame.read_feather(cached_path)
        found_pubchem: Dict[str, str] = {}
        found_chembl: Dict[str, str] = {}
        if pubchem:
            found_pubchem = cls._fetch_ids(
                inchikeys, lambda key: str(Pubchem.fetch_data(key).cid)
            )
        if chembl:
            found_chembl = cls._fetch_ids(
                inchikeys, lambda key: ChemblUtils(Chembl).get_compound(key).chid
            )
        # Building from a dict keeps the ``inchikey`` column even when the input is empty.
        df = pd.DataFrame({"inchikey": list(inchikeys)})
        df["chembl_id"] = df["inchikey"].map(found_chembl.get)
        df["pubchem_id"] = df["inchikey"].map(found_pubchem.get)
        df = IdMatchFrame(df)
        cached_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_feather(cached_path)
        return df

    @classmethod
    def _fetch_ids(cls, inchikeys: Sequence[str], fetch_id) -> Dict[str, str]:
        """
        Maps each InChI Key to the ID returned by ``fetch_id``, skipping compounds not found.

        Args:
            inchikeys: InChI Keys to look up
            fetch_id: Callable taking one InChI Key and returning its ID;
                      expected to raise ``CompoundNotFoundError`` for unknown compounds

        Returns:
            A dict from InChI Key to ID, containing only the keys that were found
        """
        found: Dict[str, str] = {}
        for inchikey in inchikeys:
            try:
                found[inchikey] = fetch_id(inchikey)
            except CompoundNotFoundError:
                logger.error("Did not find compound %s", inchikey)
                # the traceback is only useful when debugging
                logger.debug("Did not find compound %s", inchikey, exc_info=True)
        return found

    @classmethod
    def read(cls, input_path: Path) -> Sequence[str]:
        """
        Reads InChI Keys from a file, choosing the parser from the filename suffix.

        Args:
            input_path: A delimited table (.csv/.tsv/.tab, optionally .gz) or .feather file
                        with an ``inchikey`` column, or a .txt/.lines file (optionally .gz)
                        with one InChI Key per line

        Returns:
            The InChI Keys in file order

        Raises:
            ValueError: If the filename suffix is not recognized
            KeyError: If a table has no ``inchikey`` column
        """
        sep = cls._get_sep(input_path)
        if sep in {"\t", ","}:
            return cls._from_df(pd.read_csv(input_path, sep=sep))
        if sep == "feather":
            return cls._from_df(pd.read_feather(input_path))
        if sep == "gz":
            # decode explicitly as UTF-8, matching the uncompressed text branch
            with gzip.open(input_path, "rt", encoding="utf8") as f:
                return cls._from_txt(f.read())
        if sep == "txt":
            return cls._from_txt(input_path.read_text(encoding="utf8"))
        # _get_sep returned a marker this method does not know about
        raise AssertionError(sep)

    @classmethod
    def _from_df(cls, df: pd.DataFrame) -> Sequence[str]:
        """
        Extracts the ``inchikey`` column (matched case-insensitively) as a list.

        The passed frame is not modified. If several columns lowercase to ``inchikey``,
        the first one is used.

        Raises:
            KeyError: If no column is called ``inchikey``
        """
        columns = [c.lower() if isinstance(c, str) else c for c in df.columns]
        if "inchikey" not in columns:
            raise KeyError("For a CSV or TSV file, include a column called 'inchikey'")
        return df.iloc[:, columns.index("inchikey")].tolist()

    @classmethod
    def _from_txt(cls, text: str) -> Sequence[str]:
        """
        Splits text into stripped, non-empty lines.
        """
        stripped = (line.strip() for line in text.splitlines())
        return [line for line in stripped if line]

    @classmethod
    def _get_sep(cls, input_path: Path) -> str:
        """
        Maps a filename suffix to a format marker understood by ``read``.

        Returns:
            ``"\\t"`` or ``","`` for delimited tables, ``"feather"``, ``"gz"`` (gzipped lines),
            or ``"txt"`` (plain lines)

        Raises:
            ValueError: If the suffix is not recognized
        """
        name = str(input_path)
        if name.endswith((".tab", ".tsv", ".tab.gz", ".tsv.gz")):
            return "\t"
        if name.endswith((".csv", ".csv.gz")):
            return ","
        if name.endswith(".feather"):
            return "feather"
        if name.endswith((".txt.gz", ".lines.gz")):
            return "gz"
        if name.endswith((".txt", ".lines")):
            return "txt"
        raise ValueError(
            f"{input_path} should end in .tab, .tsv, .csv, .txt, .lines"
            " (each optionally followed by .gz), or .feather"
        )


class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence["Search"], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run, in order
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey

        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # None means "search() has not run yet"; search() fills this in.
        # (Starting from an empty list made search() always refuse to run.)
        self.inchikeys: Optional[Sequence[str]] = None

    def search(self) -> "Searcher":
        """
        Performs the search, and writes data.

        For each search, writes the results to ``output_path_of(search)``
        and a metadata file next to it with the suffix ``.json``.

        Returns:
            This instance

        Raises:
            ValueError: If this instance already ran a search
        """
        if self.inchikeys is not None:
            raise ValueError("Already ran a search")
        self.inchikeys = SearcherUtils.read(self.input_path)
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(self.inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        for what in self.what:
            output_path = self.output_path_of(what)
            df = what.find_to_df(self.inchikeys)
            df.to_csv(output_path)
            metadata = NestedDotDict(
                dict(key=what.key, search=what.search_class, params=what.get_params())
            )
            metadata.write_json(output_path.with_suffix(".json"))
        return self

    def paths(self) -> Sequence[Path]:
        """
        Returns the output path of every search, in the order the searches were given.
        """
        return [self.output_path_of(what) for what in self.what]

    def output_path_of(self, what: "Search") -> Path:
        """
        Returns the path that the results of ``what`` are written to.

        The file is placed next to the input file and named
        ``<input stem><search key>.tab``, after sanitizing the filename.

        Args:
            what: The search whose ``key`` is appended to the input file's stem
        """
        parent = self.input_path.parent
        child = self.input_path.stem + what.key + ".tab"
        node = PathTools.sanitize_path_node(child)
        if (parent / node).resolve() != (parent / child).resolve():
            logger.debug("Path %s sanitized to %s", child, node)
        return parent / node


# Names exported by a star-import of this module.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]


1			"""
2			Run searches and write files.
3			"""
4
5			from __future__ import annotations
6
7			import gzip
8			import logging
9			from pathlib import Path
10			from typing import Sequence, Optional, Dict
11
12			import pandas as pd
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
13			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
14			from pocketutils.tools.path_tools import PathTools
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.tools.path_tools' Loading history...
15			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
16
17			from mandos.model import CompoundNotFoundError
18			from mandos.model.chembl_support.chembl_utils import ChemblUtils
19			from mandos.model.searches import Search
20			from mandos.model.settings import MANDOS_SETTINGS
21			from mandos.search.chembl import ChemblSearch
22			from mandos.search.pubchem import PubchemSearch
23			from mandos.entries.api_singletons import Apis
24
25			Chembl, Pubchem = Apis.Chembl, Apis.Pubchem
26			logger = logging.getLogger(__package__)
27
28			IdMatchFrame = (
29			TypedDfs.typed("IdMatchFrame")
30			.require("inchikey")
31			.require("chembl_id")
32			.require("pubchem_id")
33			.strict()
34			).build()
35
36
37			class SearcherUtils:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
38			@classmethod
39			def dl(
			0 ignored issues – show Unused Code introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Either all return statements in a function should return an expression, or none of them should. Loading history... introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history... Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Method name "dl" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
40			cls, inchikeys: Sequence[str], pubchem: bool = True, chembl: bool = True
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
41			) -> IdMatchFrame:
42			# we actually cache the results, even though the underlying APIs cache
43			# the reasons for this are a little obscure --
44			# when running a Searcher, we want to run before the FIRST search
45			# for the typer commands to be replicas of the ``Entry.run`` methods, Searcher fetches before running a search
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (118/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
46			# but if we have multiple searches (as in ``mandos search --config``), we only want that at the beginning
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (113/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
47			# the alternative was having ``mandos search`` dynamically subclass each ``Entry`` -- which was really hard
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (115/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
48			# this is much cleaner, even though it's redundant
49			# if the cached results under /pubchem and /chembl are deleted, we unfortunately won't cache the results
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (112/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
50			# when running this command
51			# to fix that, we need to delete the cached /match dataframes
52			# now that I'm writing this down, I realize this is pretty bad
53			# TODO
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
54			key = hash(",".join(inchikeys))
55			cached_path = (MANDOS_SETTINGS.match_cache_path / str(key)).with_suffix(".feather")
56			if cached_path.exists():
57			logger.info(f"Found ID matching results at {cached_path}")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
58			return IdMatchFrame.read_feather(cached_path)
59			found_chembl: Dict[str, str] = {}
60			found_pubchem: Dict[str, str] = {}
61			if pubchem:
62			for inchikey in inchikeys:
63			try:
64			found_pubchem[inchikey] = str(Pubchem.fetch_data(inchikey).cid)
65			except CompoundNotFoundError:
66			logger.error(f"Did not find compound {inchikey}")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
67			logger.debug(f"Did not find compound {inchikey}", exc_info=True)
			0 ignored issues – show introduced 2021-03-10 02:41 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
68			if chembl:
69			for inchikey in inchikeys:
70			try:
71			found_chembl[inchikey] = ChemblUtils(Chembl).get_compound(inchikey).chid
72			except CompoundNotFoundError:
73			logger.error(f"Did not find compound {inchikey}")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
74			logger.debug(f"Did not find compound {inchikey}", exc_info=True)
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
75			df = pd.DataFrame([pd.Series(dict(inchikey=c)) for c in inchikeys])
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
76			df["chembl_id"] = df["inchikey"].map(found_chembl.get)
77			df["pubchem_id"] = df["inchikey"].map(found_pubchem.get)
78			df = IdMatchFrame(df)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
79			df.to_feather(cached_path)
80
81			@classmethod
82			def read(cls, input_path: Path) -> Sequence[str]:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
83			sep = cls._get_sep(input_path)
84			if sep in {"\t", ","}:
			0 ignored issues – show unused-code introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unnecessary "elif" after "return" Loading history...
85			df = pd.read_csv(input_path, sep=sep)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
86			return cls._from_df(df)
87			elif sep == "feather":
88			df = pd.read_feather(input_path)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
89			return cls._from_df(df)
90			elif sep == "gz":
91			with gzip.open(input_path, "rt") as f:
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "f" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
92			return cls._from_txt(f.read())
93			elif sep == "txt":
94			return cls._from_txt(input_path.read_text(encoding="utf8"))
95			else:
96			raise AssertionError(sep)
97
98			@classmethod
99			def _from_df(cls, df: pd.DataFrame) -> Sequence[str]:
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
100			df.columns = [c.lower() if isinstance(c, str) else c for c in df.columns]
101			if "inchikey" not in df.columns:
102			raise KeyError("For a CSV or TSV file, include a column called 'inchikey'")
103			return df["inchikey"].values.tolist()
104
105			@classmethod
106			def _from_txt(cls, text: str) -> Sequence[str]:
107			return [line.strip() for line in text.splitlines() if len(line.strip()) > 0]
108
109			@classmethod
110			def _get_sep(cls, input_path: Path) -> str:
111			if any((str(input_path).endswith(z) for z in {".tab", ".tsv", ".tab.gz", ".tsv.gz"})):
			0 ignored issues – show unused-code introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unnecessary "elif" after "return" Loading history... Comprehensibility Best Practice introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report The variable `z` does not seem to be defined. Loading history...
112			return "\t"
113			elif any((str(input_path).endswith(z) for z in {".csv", ".csv.gz"})):
114			return ","
115			elif any((str(input_path).endswith(z) for z in {".feather"})):
116			return "feather"
117			elif any((str(input_path).endswith(z) for z in {".txt.gz", ".lines.gz"})):
118			return "gz"
119			elif any((str(input_path).endswith(z) for z in {".txt", ".lines"})):
120			return "txt"
121			else:
122			raise ValueError(f"{input_path} should end in .tab, .tsv, .csv, .txt, .lines, or .gz")
123
124
125			class Searcher:
126			"""
127			Executes one or more searches and saves the results to CSV files.
128			Create and use once.
129			"""
130
131			def __init__(self, searches: Sequence[Search], input_path: Path):
132			"""
133			Constructor.
134
135			Args:
136			searches:
137			input_path: Path to the input file of one of the formats:
138			- .txt containing one InChI Key per line
139			- .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
140			"""
141			self.what = searches
142			self.input_path: Optional[Path] = input_path
143			self.inchikeys: Optional[Sequence[str]] = []
144
145			def search(self) -> Searcher:
146			"""
147			Performs the search, and writes data.
148			"""
149			if self.inchikeys is not None:
150			raise ValueError(f"Already ran a search")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
151			self.inchikeys = SearcherUtils.read(self.input_path)
152			has_pubchem = any((isinstance(what, PubchemSearch) for what in self.what))
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report The variable `what` does not seem to be defined. Loading history...
153			has_chembl = any((isinstance(what, ChemblSearch) for what in self.what))
154			# find the compounds first so the user knows what's missing before proceeding
155			SearcherUtils.dl(self.inchikeys, pubchem=has_pubchem, chembl=has_chembl)
156			for what in self.what:
157			output_path = self.output_path_of(what)
158			df = what.find_to_df(self.inchikeys)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
159			df.to_csv(output_path)
160			metadata = NestedDotDict(
161			dict(key=what.key, search=what.search_class, params=what.get_params())
162			)
163			metadata.write_json(output_path.with_suffix(".json"))
164			return self
165
166			def paths(self) -> Sequence[Path]:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
167			return [self.output_path_of(what) for what in self.what]
168
169			def output_path_of(self, what: Search) -> Path:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
170			parent = self.input_path.parent
171			child = self.input_path.stem + what.key + ".tab"
172			node = PathTools.sanitize_path_node(child)
173			if (parent / node).resolve() != (parent / child).resolve():
174			logger.debug(f"Path {child} sanitized to {node}")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
175			return parent / node
176
177
178			__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]
179

dmyersturnbull / mandos

Push — main ( d08a4e...a07aa0 )

mandos.entries.searcher.SearcherUtils._get_sep() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like