mandos.entries.searcher.CompoundIdFiller.fill() - Code Metrics - Inspection of "feat: fill missing compound info" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( 82dd22...9813db )

by Douglas

created 2021-08-03 04:49 UTC

mandos.entries.searcher.CompoundIdFiller.fill() A

↳ Parent: mandos.entries.searcher

Complexity

Conditions

Size

Total Lines	20
Code Lines	17

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
eloc	17
nop	2
dl	0
loc	20
rs	9.55
c	0
b	0
f	0

"""
Run searches and write files.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, Sequence

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.tools.common_tools import CommonTools

from typeddfs import TypedDfs


from mandos import logger
from mandos.entries.api_singletons import Apis
from mandos.entries.paths import EntryPaths
from mandos.model import CompoundNotFoundError
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
from mandos.model.apis.pubchem_api import PubchemApi

from mandos.model.apis.pubchem_support.pubchem_data import PubchemData

from mandos.model.searches import Search
from mandos.search.chembl import ChemblSearch
from mandos.search.pubchem import PubchemSearch


def _get_structure(df) -> Optional[Sequence[str]]:

    if "inchi" in df.columns:
        return df["inchi"].values
    if "smiles" in df.columns:
        return df["smiles"].values
    return None


def _fix_cols(df):

    return df.rename(columns={s: s.lower() for s in df.columns})


# Typed data frame class for the user-supplied list of compounds.
InputFrame = (
    TypedDfs.typed("InputFrame")
    # "inchikey" is the only column declared as required
    .require("inchikey")
    # structure / ID columns are reserved with dtype str (presumably optional -- confirm in typeddfs docs)
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # lowercase every column name (see _fix_cols)
    .post(_fix_cols)
    # NOTE(review): looks like "no extra index levels, extra columns allowed" -- confirm in typeddfs docs
    .strict(index=True, cols=False)
).build()
# Attach _get_structure so it can be called as a method on InputFrame instances
InputFrame.get_structures = _get_structure


# Typed data frame class pairing each compound with the IDs found for it in external databases.
# Every column is reserved with dtype str; none is declared via ``require``.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .reserve("inchikey", dtype=str)
    # structure / user-supplied ID columns, same names as in InputFrame
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # one ID column per external database
    .reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
    # NOTE(review): presumably the identifiers as originally supplied, before any matching -- confirm
    .reserve("origin_inchikey", "origin_smiles", dtype=str)
    .reserve("library", dtype=str)
    # NOTE(review): looks like "no extra index levels, extra columns allowed" -- confirm in typeddfs docs
    .strict(index=True, cols=False)
).build()


@dataclass(frozen=True, repr=True)
class ChemFinder:
    """
    Looks up a database-specific compound ID from an InChI Key.

    Attributes:
        what: Display name of the database; used only in log messages
        how: Function that maps an InChI Key to the ID in that database
    """

    what: str
    how: Callable[[str], str]

    @classmethod
    def chembl(cls) -> ChemFinder:
        """
        Builds a finder that resolves InChI Keys to ChEMBL IDs via ``ChemblUtils``.
        """

        def how(inchikey: str) -> str:
            return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid

        return cls("ChEMBL", how)

    @classmethod
    def pubchem(cls) -> ChemFinder:
        """
        Builds a finder that resolves InChI Keys to PubChem IDs via ``Apis.Pubchem.find_id``.
        """

        def how(inchikey: str) -> str:
            # noinspection PyTypeChecker
            return Apis.Pubchem.find_id(inchikey)

        return cls("PubChem", how)

    def find(self, inchikey: str) -> Optional[str]:
        """
        Finds the ID of a compound, logging (rather than raising) when it is missing.

        Args:
            inchikey: The InChI Key to look up

        Returns:
            The ID converted to ``str``, or ``None`` if ``how`` raised
            ``CompoundNotFoundError`` or returned ``None``
        """
        try:
            data = self.how(inchikey)
        except CompoundNotFoundError:
            data = None
            logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
        if data is None:
            logger.info(f"NOT FOUND: {self.what.rjust(8)}  ] {inchikey}")
            # return a real None: str(None) would hand callers the truthy string "None"
            return None
        return str(data)


class SearcherUtils:
    """
    Helpers used by ``Searcher``: looking up compound IDs and reading the input file.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
    ) -> IdMatchFrame:
        """
        Looks up database IDs for each InChI Key.

        Args:
            inchikeys: The InChI Keys to look up; one row is created per key
            pubchem: Whether to add a ``pubchem_id`` column
            chembl: Whether to add a ``chembl_id`` column
            hmdb: Accepted but not used by this method

        Returns:
            A frame with an ``inchikey`` column plus one ID column per enabled database
        """
        rows = [pd.Series(dict(inchikey=key)) for key in inchikeys]
        matches = IdMatchFrame(rows)
        # ChEMBL is queried before PubChem, so log messages keep that order
        if chembl:
            chembl_finder = ChemFinder.chembl()
            matches["chembl_id"] = matches["inchikey"].map(chembl_finder.find)
        if pubchem:
            pubchem_finder = ChemFinder.pubchem()
            matches["pubchem_id"] = matches["inchikey"].map(pubchem_finder.find)
        return matches

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input compounds and logs how many rows were read.

        Args:
            input_path: Path passed to ``InputFrame.read_file``

        Returns:
            The frame returned by ``InputFrame.read_file``
        """
        df = InputFrame.read_file(input_path)
        logger.info(f"Read {len(df)} input compounds")
        return df


class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run, in the order given
            to: Output paths, paired with ``searches`` by ``CommonTools.zip_list``
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey

        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until search() runs; search() uses this to refuse a second run
        self.input_df: Optional[InputFrame] = None
        self.output_paths = {
            what.key: EntryPaths.output_path_of(what, input_path, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the results to its output path and a JSON
        metadata file next to it (same path with suffix ``.json.metadata``).

        Returns:
            This instance

        Raises:
            ValueError: If ``search`` was already called on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        # keep all of the original extra columns from the input
        # e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
        # The inchikey -> value maps depend only on the input, so build them once
        # instead of once per extra column per search.
        by_inchikey = self.input_df.set_index("inchikey")
        extra_maps = {
            extra_col: by_inchikey[extra_col].to_dict()
            for extra_col in self.input_df.columns
            if extra_col != "inchikey"
        }
        for what in self.what:
            output_path = self.output_paths[what.key]
            metadata_path = output_path.with_suffix(".json.metadata")
            df = what.find_to_df(inchikeys)
            for extra_col, extra_mp in extra_maps.items():
                df[extra_col] = df["lookup"].map(extra_mp.get)
            # write the (intermediate) file
            df.write_file(output_path)
            # write metadata
            params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
            metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
            metadata.write_json(metadata_path)
            logger.info(f"Wrote {what.key} to {output_path}")
        return self


# Names exported by ``from ... import *``; ChemFinder and the underscore-prefixed helpers are not listed.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils", "InputFrame"]


1			"""
2			Run searches and write files.
3			"""
4
5			from __future__ import annotations
6
7			from dataclasses import dataclass
8			from pathlib import Path
9			from typing import Callable, Optional, Sequence
10
11			import pandas as pd
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
12			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
13			from pocketutils.tools.common_tools import CommonTools
			0 ignored issues – show introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.tools.common_tools' Loading history...
14			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
15
16			from mandos import logger
17			from mandos.entries.api_singletons import Apis
18			from mandos.entries.paths import EntryPaths
19			from mandos.model import CompoundNotFoundError
20			from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
21			from mandos.model.apis.pubchem_api import PubchemApi
			0 ignored issues – show Unused Code introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Unused PubchemApi imported from mandos.model.apis.pubchem_api Loading history...
22			from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
			0 ignored issues – show Unused Code introduced 2021-08-03 04:51 UTC by Report Bug Copy Issue Report Unused PubchemData imported from mandos.model.apis.pubchem_support.pubchem_data Loading history...
23			from mandos.model.searches import Search
24			from mandos.search.chembl import ChemblSearch
25			from mandos.search.pubchem import PubchemSearch
26
27
28			def _get_structure(df) -> Optional[Sequence[str]]:
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
29			if "inchi" in df.columns:
30			return df["inchi"].values
31			if "smiles" in df.columns:
32			return df["smiles"].values
33			return None
34
35
36			def _fix_cols(df):
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
37			return df.rename(columns={s: s.lower() for s in df.columns})
38
39
40			InputFrame = (
41			TypedDfs.typed("InputFrame")
42			.require("inchikey")
43			.reserve("inchi", "smiles", "compound_id", dtype=str)
44			.post(_fix_cols)
45			.strict(index=True, cols=False)
46			).build()
47			InputFrame.get_structures = _get_structure
48
49
50			IdMatchFrame = (
51			TypedDfs.typed("IdMatchFrame")
52			.reserve("inchikey", dtype=str)
53			.reserve("inchi", "smiles", "compound_id", dtype=str)
54			.reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
55			.reserve("origin_inchikey", "origin_smiles", dtype=str)
56			.reserve("library", dtype=str)
57			.strict(index=True, cols=False)
58			).build()
59
60
61			@dataclass(frozen=True, repr=True)
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
62			class ChemFinder:
63			what: str
64			how: Callable[[str], str]
65
66			@classmethod
67			def chembl(cls) -> ChemFinder:
			0 ignored issues – show introduced 2021-03-24 04:56 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
68			def how(inchikey: str) -> str:
69			return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid
70
71			return ChemFinder("ChEMBL", how)
72
73			@classmethod
74			def pubchem(cls) -> ChemFinder:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
75			def how(inchikey: str) -> str:
76			# noinspection PyTypeChecker
77			return Apis.Pubchem.find_id(inchikey)
78
79			return ChemFinder("PubChem", how)
80
81			def find(self, inchikey: str) -> Optional[str]:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
82			try:
83			data = self.how(inchikey)
84			except CompoundNotFoundError:
85			data = None
86			logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
87			if data is None:
88			logger.info(f"NOT FOUND: {self.what.rjust(8)} ] {inchikey}")
89			return str(data)
90
91
92			class SearcherUtils:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
93			@classmethod
94			def dl(
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Method name "dl" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
95			cls,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
96			inchikeys: Sequence[str],
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
97			pubchem: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
98			chembl: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
99			hmdb: bool = True,
			0 ignored issues – show Coding Style introduced 2021-04-03 02:07 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history... Unused Code introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report The argument `hmdb` seems to be unused. Loading history...
100			) -> IdMatchFrame:
101			df = IdMatchFrame([pd.Series(dict(inchikey=c)) for c in inchikeys])
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
102			if chembl:
103			df["chembl_id"] = df["inchikey"].map(ChemFinder.chembl().find)
104			if pubchem:
105			df["pubchem_id"] = df["inchikey"].map(ChemFinder.pubchem().find)
106			return df
107
108			@classmethod
109			def read(cls, input_path: Path) -> InputFrame:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
110			df = InputFrame.read_file(input_path)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
111			logger.info(f"Read {len(df)} input compounds")
112			return df
113
114
115			class Searcher:
116			"""
117			Executes one or more searches and saves the results to CSV files.
118			Create and use once.
119			"""
120
121			def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
122			"""
123			Constructor.
124
125			Args:
126			searches:
127			input_path: Path to the input file of one of the formats:
128			- .txt containing one InChI Key per line
129			- .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
130			"""
131			self.what = searches
132			self.input_path: Optional[Path] = input_path
133			self.input_df: InputFrame = None
134			self.output_paths = {
135			what.key: EntryPaths.output_path_of(what, input_path, path)
136			for what, path in CommonTools.zip_list(searches, to)
137			}
138
139			def search(self) -> Searcher:
140			"""
141			Performs the search, and writes data.
142			"""
143			if self.input_df is not None:
144			raise ValueError(f"Already ran a search")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
145			self.input_df = SearcherUtils.read(self.input_path)
146			inchikeys = self.input_df["inchikey"].unique()
147			has_pubchem = any((isinstance(what, PubchemSearch) for what in self.what))
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report The variable `what` does not seem to be defined. Loading history...
148			has_chembl = any((isinstance(what, ChemblSearch) for what in self.what))
149			# find the compounds first so the user knows what's missing before proceeding
150			SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
151			for what in self.what:
152			output_path = self.output_paths[what.key]
153			metadata_path = output_path.with_suffix(".json.metadata")
154			df = what.find_to_df(inchikeys)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
155			# keep all of the original extra columns from the input
156			# e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
157			for extra_col in [c for c in self.input_df.columns if c != "inchikey"]:
158			extra_mp = self.input_df.set_index("inchikey")[extra_col].to_dict()
159			df[extra_col] = df["lookup"].map(extra_mp.get)
160			# write the (intermediate) file
161			df.write_file(output_path)
162			# write metadata
163			params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
164			metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
165			metadata.write_json(metadata_path)
166			logger.info(f"Wrote {what.key} to {output_path}")
167			return self
168
169
170			__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils", "InputFrame"]
171

dmyersturnbull / mandos

Push — main ( 82dd22...9813db )

mandos.entries.searcher.CompoundIdFiller.fill() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like