mandos.entries.searcher.SearcherUtils.dl() - Code Metrics - Inspection of "build(deps): bump pyarrow from 3.0.0 to 4.0.1" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — dependabot/pip/pyarrow-4.0.1 ( ca09ce...b2836e )

unknown

created 2021-07-05 18:49 UTC

mandos.entries.searcher.SearcherUtils.dl() A

↳ Parent: mandos.entries.searcher

Complexity

Conditions

Size

Total Lines	15
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	14
nop	6
dl	0
loc	15
rs	9.7
c	0
b	0
f	0

"""
Run searches and write files.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, Sequence

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.tools.common_tools import CommonTools

from typeddfs import TypedDfs


from mandos import logger
from mandos.entries.api_singletons import Apis
from mandos.entries.paths import EntryPaths
from mandos.model import CompoundNotFoundError
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
from mandos.model.searches import Search
from mandos.search.chembl import ChemblSearch
from mandos.search.pubchem import PubchemSearch

# Typed data frame for the input compound list; an "inchikey" column is required.
InputFrame = (TypedDfs.typed("InputFrame").require("inchikey")).build()

# Typed data frame pairing each InChI Key with per-database compound IDs.
# "inchikey" is required (as str); "chembl_id", "pubchem_id", and "hmdb_id" are
# reserved str columns (presumably optional -- confirm against typeddfs).
# NOTE(review): .strict() presumably rejects undeclared columns -- confirm against typeddfs.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .require("inchikey", dtype=str)
    .reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
    .strict()
).build()


@dataclass(frozen=True, repr=True)
class ChemFinder:
    """
    Looks up a database-specific compound ID for an InChI Key.

    Attributes are documented inline below.
    """

    # Display name of the database; only used in log messages.
    what: str
    # Maps an InChI Key to an ID; ``find`` treats ``CompoundNotFoundError`` as "no match".
    how: Callable[[str], str]
    # If True, misses are also logged at INFO level (they are always logged at DEBUG).
    complain: bool = False

    @classmethod
    def chembl(cls, complain: bool = False) -> ChemFinder:
        """
        Creates a finder labeled "ChEMBL".

        The lookup returns the ``chid`` of ``ChemblUtils(Apis.Chembl).get_compound(inchikey)``.

        Args:
            complain: Passed through to the ``complain`` field

        Returns:
            A new instance of ``cls``
        """

        def how(inchikey: str) -> str:
            return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid

        # Use cls rather than the hard-coded class so that subclasses get their own type back.
        return cls("ChEMBL", how, complain=complain)

    @classmethod
    def pubchem(cls, complain: bool = False) -> ChemFinder:
        """
        Creates a finder labeled "PubChem".

        Args:
            complain: Passed through to the ``complain`` field

        Returns:
            A new instance of ``cls``
        """

        # NOTE(review): this lookup is identical to the one in ``chembl`` -- it queries
        # ChEMBL and returns a ``chid``, not a PubChem ID. This looks like a copy-paste
        # error; it is left unchanged here because the PubChem API is not visible in
        # this module. Confirm and replace with the PubChem lookup.
        def how(inchikey: str) -> str:
            return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid

        return cls("PubChem", how, complain=complain)

    def find(self, inchikey: str) -> Optional[str]:
        """
        Applies ``how`` to an InChI Key.

        Args:
            inchikey: The InChI Key to look up

        Returns:
            Whatever ``how`` returns, or None if it raised ``CompoundNotFoundError``.
            Any other exception propagates.
        """
        try:
            return self.how(inchikey)
        except CompoundNotFoundError:
            if self.complain:
                logger.info(f"NOT FOUND: {self.what.rjust(8)}  ] {inchikey}")
            logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
        return None


class SearcherUtils:
    """
    Helpers for reading input compounds and matching them to database IDs.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
        complain: bool = False,
    ) -> IdMatchFrame:
        """
        Looks up each InChI Key in the requested databases.

        Args:
            inchikeys: The InChI Keys to look up; may be empty
            pubchem: If True, fill a ``pubchem_id`` column using ``ChemFinder.pubchem``
            chembl: If True, fill a ``chembl_id`` column using ``ChemFinder.chembl``
            hmdb: Accepted for interface compatibility; currently not used
            complain: Passed to the ``ChemFinder`` factories (logs misses at INFO level)

        Returns:
            A frame with one row per input key, in input order.
            IDs that were not found are None.
        """
        # Build the column directly instead of one Series per row: a list of zero
        # Series produces a frame with no "inchikey" column at all, which made the
        # lookups below fail with KeyError on empty input.
        df = IdMatchFrame({"inchikey": list(inchikeys)})
        if chembl:
            df["chembl_id"] = df["inchikey"].map(ChemFinder.chembl(complain=complain).find)
        if pubchem:
            df["pubchem_id"] = df["inchikey"].map(ChemFinder.pubchem(complain=complain).find)
        return df

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input compounds with ``InputFrame.read_file`` and logs the row count.

        Args:
            input_path: Path of the input file

        Returns:
            The frame that was read
        """
        df = InputFrame.read_file(input_path)
        logger.info(f"Read {len(df)} input compounds")
        return df


class Searcher:
    """
    Runs one or more searches over a single input file and writes each result to CSV.
    Single-use: construct it, call ``search`` once, then discard it.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run, in order
            to: Output paths, paired with ``searches`` via ``CommonTools.zip_list``;
                each is resolved with ``EntryPaths.output_path_of``
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey

        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # Stays None until ``search`` has been called; used as the "already ran" flag.
        self.input_df: Optional[InputFrame] = None
        resolved = {}
        for search, destination in CommonTools.zip_list(searches, to):
            resolved[search.key] = EntryPaths.output_path_of(search, input_path, destination)
        self.output_paths = resolved

    def search(self) -> Searcher:
        """
        Reads the input, runs every search, and writes the results and their metadata.

        Returns:
            This instance

        Raises:
            ValueError: If a search was already run on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        needs_pubchem = any(isinstance(s, PubchemSearch) for s in self.what)
        needs_chembl = any(isinstance(s, ChemblSearch) for s in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=needs_pubchem, chembl=needs_chembl)
        for current in self.what:
            self._run_one(current, inchikeys)
        return self

    def _run_one(self, current, inchikeys) -> None:
        """
        Runs a single search and writes its CSV plus a ``.metadata.json`` file next to it.
        """
        destination = self.output_paths[current.key]
        sidecar = destination.with_suffix(".metadata.json")
        hits = current.find_to_df(inchikeys)
        # TODO keep any other columns in input_df
        hits.to_csv(destination)
        # "key" is stored separately below and "api" is left out of the metadata
        skipped = {"key", "api"}
        params = {
            name: str(value)
            for name, value in current.get_params().items()
            if name not in skipped
        }
        record = dict(key=current.key, search=current.search_class, params=params)
        NestedDotDict(record).write_json(sidecar)
        logger.info(f"Wrote {current.key} to {destination}")


# Names exported by a star-import of this module; ChemFinder and InputFrame are not listed.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]


1			"""
2			Run searches and write files.
3			"""
4
5			from __future__ import annotations
6
7			from dataclasses import dataclass
8			from pathlib import Path
9			from typing import Callable, Optional, Sequence
10
11			import pandas as pd
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
12			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
13			from pocketutils.tools.common_tools import CommonTools
			0 ignored issues – show introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.tools.common_tools' Loading history...
14			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
15
16			from mandos import logger
17			from mandos.entries.api_singletons import Apis
18			from mandos.entries.paths import EntryPaths
19			from mandos.model import CompoundNotFoundError
20			from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
21			from mandos.model.searches import Search
22			from mandos.search.chembl import ChemblSearch
23			from mandos.search.pubchem import PubchemSearch
24
25			InputFrame = (TypedDfs.typed("InputFrame").require("inchikey")).build()
26
27			IdMatchFrame = (
28			TypedDfs.typed("IdMatchFrame")
29			.require("inchikey", dtype=str)
30			.reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
31			.strict()
32			).build()
33
34
35			@dataclass(frozen=True, repr=True)
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
36			class ChemFinder:
37			what: str
38			how: Callable[[str], str]
39			complain: bool = False
40
41			@classmethod
42			def chembl(cls, complain: bool = False) -> ChemFinder:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
43			def how(inchikey: str) -> str:
44			return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid
45
46			return ChemFinder("ChEMBL", how, complain=complain)
47
48			@classmethod
49			def pubchem(cls, complain: bool = False) -> ChemFinder:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
50			def how(inchikey: str) -> str:
51			return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid
52
53			return ChemFinder("PubChem", how, complain=complain)
54
55			def find(self, inchikey: str) -> Optional[str]:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
56			try:
57			return self.how(inchikey)
58			except CompoundNotFoundError:
59			if self.complain:
60			logger.info(f"NOT FOUND: {self.what.rjust(8)} ] {inchikey}")
61			logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
62			return None
63
64
65			class SearcherUtils:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
66			@classmethod
67			def dl(
			0 ignored issues – show best-practice introduced 2021-04-03 02:07 UTC by Report Bug Copy Issue Report Too many arguments (6/5) Loading history... Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Method name "dl" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... introduced 2021-03-24 04:56 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
68			cls,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
69			inchikeys: Sequence[str],
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
70			pubchem: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
71			chembl: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
72			hmdb: bool = True,
			0 ignored issues – show Unused Code introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report The argument `hmdb` seems to be unused. Loading history... Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
73			complain: bool = False,
			0 ignored issues – show Coding Style introduced 2021-04-03 02:07 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
74			) -> IdMatchFrame:
75			df = IdMatchFrame([pd.Series(dict(inchikey=c)) for c in inchikeys])
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
76			if chembl:
77			df["chembl_id"] = df["inchikey"].map(ChemFinder.chembl(complain=complain).find)
78			if pubchem:
79			df["pubchem_id"] = df["inchikey"].map(ChemFinder.pubchem(complain=complain).find)
80			return df
81
82			@classmethod
83			def read(cls, input_path: Path) -> InputFrame:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
84			df = InputFrame.read_file(input_path)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
85			logger.info(f"Read {len(df)} input compounds")
86			return df
87
88
89			class Searcher:
90			"""
91			Executes one or more searches and saves the results to CSV files.
92			Create and use once.
93			"""
94
95			def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
96			"""
97			Constructor.
98
99			Args:
100			searches:
101			input_path: Path to the input file of one of the formats:
102			- .txt containing one InChI Key per line
103			- .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
104			"""
105			self.what = searches
106			self.input_path: Optional[Path] = input_path
107			self.input_df: InputFrame = None
108			self.output_paths = {
109			what.key: EntryPaths.output_path_of(what, input_path, path)
110			for what, path in CommonTools.zip_list(searches, to)
111			}
112
113			def search(self) -> Searcher:
114			"""
115			Performs the search, and writes data.
116			"""
117			if self.input_df is not None:
118			raise ValueError(f"Already ran a search")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
119			self.input_df = SearcherUtils.read(self.input_path)
120			inchikeys = self.input_df["inchikey"].unique()
121			has_pubchem = any((isinstance(what, PubchemSearch) for what in self.what))
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report The variable `what` does not seem to be defined. Loading history...
122			has_chembl = any((isinstance(what, ChemblSearch) for what in self.what))
123			# find the compounds first so the user knows what's missing before proceeding
124			SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
125			for what in self.what:
126			output_path = self.output_paths[what.key]
127			metadata_path = output_path.with_suffix(".metadata.json")
128			df = what.find_to_df(inchikeys)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
129			# TODO keep any other columns in input_df
			0 ignored issues – show Coding Style introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
130			df.to_csv(output_path)
131			params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
132			metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
133			metadata.write_json(metadata_path)
134			logger.info(f"Wrote {what.key} to {output_path}")
135			return self
136
137
138			__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]
139

dmyersturnbull / mandos

Push — dependabot/pip/pyarrow-4.0.1 ( ca09ce...b2836e )

mandos.entries.searcher.SearcherUtils.dl() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like