mandos.entries.searcher.CompoundIdFiller._matches() - Code Metrics - Inspection of "feat: no clue" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( ec3fe3...82dd22 )

by Douglas

created 2021-08-02 23:37 UTC

CompoundIdFiller._matches() C

↳ Parent: mandos.entries.searcher

Complexity

Conditions

Size

Total Lines	24
Code Lines	19

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	9
eloc	19
nop	4
dl	0
loc	24
rs	6.6666
c	0
b	0
f	0

"""
Run searches and write files.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, Sequence, Mapping, Set

import pandas as pd

from pocketutils.core.dot_dict import NestedDotDict

from pocketutils.tools.common_tools import CommonTools

from typeddfs import TypedDfs


from mandos import logger
from mandos.entries.api_singletons import Apis
from mandos.entries.paths import EntryPaths
from mandos.model import CompoundNotFoundError
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
from mandos.model.apis.pubchem_api import PubchemApi
from mandos.model.hits import HitFrame

from mandos.model.searches import Search
from mandos.search.chembl import ChemblSearch
from mandos.search.pubchem import PubchemSearch


def _get_structure(df) -> Optional[Sequence[str]]:

    if "inchi" in df.columns:
        return df["inchi"].values
    if "smiles" in df.columns:
        return df["smiles"].values
    return None


def _fix_cols(df):

    return df.rename(columns={s: s.lower() for s in df.columns})


# Data-frame type for the user-supplied input compounds, assembled with the typeddfs builder.
# NOTE(review): what require/reserve/post/strict do exactly is defined by typeddfs, not here;
# the per-line notes below only restate the arguments -- confirm details against typeddfs.
InputFrame = (
    TypedDfs.typed("InputFrame")
    # "inchikey" is the only column passed to require()
    .require("inchikey")
    # columns passed to reserve(), all declared with dtype=str
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # _fix_cols lower-cases all column names
    .post(_fix_cols)
    .strict(index=True, cols=False)
).build()
# Attach _get_structure as a method: returns the "inchi" values, else "smiles", else None
InputFrame.get_structures = _get_structure


# Data-frame type holding compound identifiers matched across databases,
# assembled with the typeddfs builder. No column is passed to require(); all are reserve()d.
# NOTE(review): the groupings below are read off the column names -- confirm against usage.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .reserve("inchikey", dtype=str)
    # same structure/identifier columns that InputFrame reserves
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # per-database IDs; chembl_id and pubchem_id are the ones filled in by this module
    .reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
    .reserve("origin_inchikey", "origin_smiles", dtype=str)
    .reserve("library", dtype=str)
    .strict(index=True, cols=False)
).build()


@dataclass(frozen=True, repr=True)
class ChemFinder:
    """
    Pairs a database name with a function that looks up an ID in that database.

    Attributes:
        what: Name of the database; only used in log messages
        how: Function that takes an InChI Key and returns the matching ID;
             ``find`` treats a ``CompoundNotFoundError`` raised by it as "no match"
    """

    what: str
    how: Callable[[str], str]

    @classmethod
    def chembl(cls) -> ChemFinder:
        """
        Creates a finder that maps an InChI Key to a ChEMBL ID via ``ChemblUtils``.
        """

        def how(inchikey: str) -> str:
            compound = ChemblUtils(Apis.Chembl).get_compound(inchikey)
            return compound.chid

        return ChemFinder(what="ChEMBL", how=how)

    @classmethod
    def pubchem(cls) -> ChemFinder:
        """
        Creates a finder that maps an InChI Key to a PubChem ID (as a string) via ``Apis.Pubchem``.
        """

        def how(inchikey: str) -> str:
            api: PubchemApi = Apis.Pubchem
            found_id = api.find_id(inchikey)
            return str(found_id)

        return ChemFinder(what="PubChem", how=how)

    def find(self, inchikey: str) -> Optional[str]:
        """
        Looks up ``inchikey`` using ``how``.

        Args:
            inchikey: The InChI Key to look up

        Returns:
            Whatever ``how`` returns, or None if it raised ``CompoundNotFoundError``
            (which is logged rather than propagated)
        """
        try:
            found = self.how(inchikey)
        except CompoundNotFoundError:
            logger.info(f"NOT FOUND: {self.what.rjust(8)}  ] {inchikey}")
            logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
            found = None
        return found


class SearcherUtils:
    """
    Helper functions used when running searches.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
    ) -> IdMatchFrame:
        """
        Looks up database IDs for a list of InChI Keys.

        Args:
            inchikeys: The InChI Keys; one row is created per element
            pubchem: Whether to add a ``pubchem_id`` column
            chembl: Whether to add a ``chembl_id`` column
            hmdb: Accepted but not used by this method

        Returns:
            An IdMatchFrame with an ``inchikey`` column plus one ID column per enabled database
        """
        rows = [pd.Series({"inchikey": key}) for key in inchikeys]
        matched = IdMatchFrame(rows)
        # ChEMBL is queried before PubChem; a finder is only created if its database is enabled
        lookups = (
            (chembl, "chembl_id", ChemFinder.chembl),
            (pubchem, "pubchem_id", ChemFinder.pubchem),
        )
        for enabled, column, make_finder in lookups:
            if enabled:
                matched[column] = matched["inchikey"].map(make_finder().find)
        return matched

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input compounds from ``input_path`` and logs how many rows were read.

        Args:
            input_path: File to pass to ``InputFrame.read_file``

        Returns:
            The InputFrame that was read
        """
        compounds = InputFrame.read_file(input_path)
        logger.info(f"Read {len(compounds)} input compounds")
        return compounds


class CompoundIdFiller:
    """
    Fills empty compound-ID columns of a frame from an ID column that already has values.
    """

    # Columns that can be converted into one another, most-preferred source first.
    # "inchikey" comes first because using it as the source needs no extra lookup.
    _MATCHABLE = ("inchikey", "pubchem_id", "chembl_id")

    @classmethod
    def fill(
        cls,
        df: IdMatchFrame,
    ) -> IdMatchFrame:
        """
        Fills each missing or all-null column among ``inchikey``, ``pubchem_id``, and ``chembl_id``.

        One populated column is chosen as the source (preferring ``inchikey``, then
        ``pubchem_id``, then ``chembl_id``), and every row's source value is converted.
        ``df`` is modified in place.

        Args:
            df: The frame to fill; at least one of the three columns must contain a value

        Returns:
            ``df`` itself, with the target columns added or overwritten.
            A cell is None where the source value is null or no match was found.

        Raises:
            ValueError: If none of the three columns contains any value
        """
        sources = {s for s in cls._MATCHABLE if s in df.columns and not df[s].isnull().all()}
        targets = {s for s in cls._MATCHABLE if s not in df.columns or df[s].isnull().all()}
        # noinspection PyUnresolvedReferences
        logger.notice(f"Copying {sources} to {targets}")
        if not sources:
            raise ValueError(f"None of the columns {cls._MATCHABLE} contains any values")
        if not targets:
            # every column is already populated
            return df
        # choose the source and the column order deterministically (set order is arbitrary)
        source = next(s for s in cls._MATCHABLE if s in sources)
        ordered_targets = [t for t in cls._MATCHABLE if t in targets]
        # the lists are aligned with the rows of df purely by position,
        # so exactly one value (possibly None) must be appended per row and target
        remapped = {t: [] for t in ordered_targets}
        for source_val in df[source].values:
            if pd.isna(source_val):
                matches = {}
            else:
                matches = cls._matches(source, source_val, targets)
            for target in ordered_targets:
                remapped[target].append(matches.get(target))
        for target in ordered_targets:
            df[target] = remapped[target]
        return df

    @classmethod
    def _matches(cls, source: str, source_val: str, targets: Set[str]) -> Mapping[str, str]:
        """
        Converts a single ID into the requested other kinds of IDs.

        Args:
            source: The kind of ``source_val``: ``inchikey``, ``pubchem_id``, or ``chembl_id``
            source_val: The ID to convert
            targets: The kinds of IDs wanted

        Returns:
            A dict from ID kind to value. It contains ``inchikey`` whenever ``source`` is not
            ``inchikey``, plus ``pubchem_id`` and/or ``chembl_id`` if requested and found.

        Raises:
            AssertionError: If ``source`` is not one of the three supported kinds
        """
        if source == "pubchem_id":
            inchikey = Apis.Pubchem.find_inchikey(int(source_val))
        elif source == "chembl_id":
            # TODO
            # get_compound wants an inchikey,
            # but we're secretly passing a CHEMBLxxxx ID instead
            # we just know that that works
            inchikey = ChemblUtils(Apis.Chembl).get_compound(source_val).inchikey
        elif source == "inchikey":
            # the value itself is the InChI Key (not the column name)
            inchikey = source_val
        else:
            raise AssertionError(source)
        matched = {} if source == "inchikey" else dict(inchikey=inchikey)
        if "pubchem_id" in targets:
            pubchem_id = ChemFinder.pubchem().find(inchikey)
            if pubchem_id is not None:
                matched["pubchem_id"] = str(pubchem_id)
        if "chembl_id" in targets:
            chembl_id = ChemFinder.chembl().find(inchikey)
            if chembl_id is not None:
                matched["chembl_id"] = chembl_id
        return matched


class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run; output paths are stored under each search's ``key``
            to: Output paths, paired with ``searches`` by ``CommonTools.zip_list``
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey

        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until search() is called; also serves as the "already ran" marker
        self.input_df: Optional[InputFrame] = None
        self.output_paths = {
            what.key: EntryPaths.output_path_of(what, input_path, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the hits to its output path
        and a JSON metadata file next to it (suffix ``.json.metadata``).

        Returns:
            This instance

        Raises:
            ValueError: If this method was already called on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        # the input does not change between searches, so build the lookups only once
        extra_maps = self._extra_column_maps()
        for what in self.what:
            output_path = self.output_paths[what.key]
            metadata_path = output_path.with_suffix(".json.metadata")
            df = what.find_to_df(inchikeys)
            # keep all of the original extra columns from the input
            # e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
            for extra_col, extra_mp in extra_maps.items():
                df[extra_col] = df["lookup"].map(extra_mp.get)
            # write the (intermediate) file
            df.write_file(output_path)
            # write metadata
            params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
            metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
            metadata.write_json(metadata_path)
            logger.info(f"Wrote {what.key} to {output_path}")
        return self

    def _extra_column_maps(self) -> Mapping[str, dict]:
        """
        Builds, for every input column other than ``inchikey``, a dict from InChI Key to value.
        """
        extra_cols = [c for c in self.input_df.columns if c != "inchikey"]
        if not extra_cols:
            return {}
        by_inchikey = self.input_df.set_index("inchikey")
        return {c: by_inchikey[c].to_dict() for c in extra_cols}


# Names exported by ``from mandos.entries.searcher import *``;
# ChemFinder and the underscore-prefixed helpers are deliberately not listed.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils", "CompoundIdFiller", "InputFrame"]


1			"""
2			Run searches and write files.
3			"""
4
5			from __future__ import annotations
6
7			from dataclasses import dataclass
8			from pathlib import Path
9			from typing import Callable, Optional, Sequence, Mapping, Set
10
11			import pandas as pd
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
12			from pocketutils.core.dot_dict import NestedDotDict
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.dot_dict' Loading history...
13			from pocketutils.tools.common_tools import CommonTools
			0 ignored issues – show introduced 2021-03-31 03:41 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.tools.common_tools' Loading history...
14			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
15
16			from mandos import logger
17			from mandos.entries.api_singletons import Apis
18			from mandos.entries.paths import EntryPaths
19			from mandos.model import CompoundNotFoundError
20			from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
21			from mandos.model.apis.pubchem_api import PubchemApi
22			from mandos.model.hits import HitFrame
			0 ignored issues – show Unused Code introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Unused HitFrame imported from mandos.model.hits Loading history...
23			from mandos.model.searches import Search
24			from mandos.search.chembl import ChemblSearch
25			from mandos.search.pubchem import PubchemSearch
26
27
28			def _get_structure(df) -> Optional[Sequence[str]]:
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
29			if "inchi" in df.columns:
30			return df["inchi"].values
31			if "smiles" in df.columns:
32			return df["smiles"].values
33			return None
34
35
36			def _fix_cols(df):
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
37			return df.rename(columns={s: s.lower() for s in df.columns})
38
39
40			InputFrame = (
41			TypedDfs.typed("InputFrame")
42			.require("inchikey")
43			.reserve("inchi", "smiles", "compound_id", dtype=str)
44			.post(_fix_cols)
45			.strict(index=True, cols=False)
46			).build()
47			InputFrame.get_structures = _get_structure
48
49
50			IdMatchFrame = (
51			TypedDfs.typed("IdMatchFrame")
52			.reserve("inchikey", dtype=str)
53			.reserve("inchi", "smiles", "compound_id", dtype=str)
54			.reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
55			.reserve("origin_inchikey", "origin_smiles", dtype=str)
56			.reserve("library", dtype=str)
57			.strict(index=True, cols=False)
58			).build()
59
60
61			@dataclass(frozen=True, repr=True)
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
62			class ChemFinder:
63			what: str
64			how: Callable[[str], str]
65
66			@classmethod
67			def chembl(cls) -> ChemFinder:
			0 ignored issues – show introduced 2021-03-24 04:56 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
68			def how(inchikey: str) -> str:
69			return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid
70
71			return ChemFinder("ChEMBL", how)
72
73			@classmethod
74			def pubchem(cls) -> ChemFinder:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
75			def how(inchikey: str) -> str:
76			api: PubchemApi = Apis.Pubchem
77			return str(api.find_id(inchikey))
78
79			return ChemFinder("PubChem", how)
80
81			def find(self, inchikey: str) -> Optional[str]:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
82			try:
83			return self.how(inchikey)
84			except CompoundNotFoundError:
85			logger.info(f"NOT FOUND: {self.what.rjust(8)} ] {inchikey}")
86			logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
87			return None
88
89
90			class SearcherUtils:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
91			@classmethod
92			def dl(
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Method name "dl" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
93			cls,
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
94			inchikeys: Sequence[str],
			0 ignored issues – show Coding Style introduced 2021-04-03 02:07 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
95			pubchem: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
96			chembl: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
97			hmdb: bool = True,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history... Unused Code introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report The argument `hmdb` seems to be unused. Loading history...
98			) -> IdMatchFrame:
99			df = IdMatchFrame([pd.Series(dict(inchikey=c)) for c in inchikeys])
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
100			if chembl:
101			df["chembl_id"] = df["inchikey"].map(ChemFinder.chembl().find)
102			if pubchem:
103			df["pubchem_id"] = df["inchikey"].map(ChemFinder.pubchem().find)
104			return df
105
106			@classmethod
107			def read(cls, input_path: Path) -> InputFrame:
			0 ignored issues – show introduced 2021-06-30 21:07 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
108			df = InputFrame.read_file(input_path)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
109			logger.info(f"Read {len(df)} input compounds")
110			return df
111
112
113			class CompoundIdFiller:
			0 ignored issues – show introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
114			@classmethod
115			def fill(
			0 ignored issues – show Coding Style Naming introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Argument name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
116			cls,
			0 ignored issues – show Coding Style introduced 2021-03-24 04:52 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
117			df: IdMatchFrame,
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
118			) -> IdMatchFrame:
119			matchable = {"inchikey", "pubchem_id", "chembl_id"}
120			sources = {s for s in matchable if s in df.columns and not df[s].isnull().all()}
121			targets = {s for s in matchable if s not in df.columns or df[s].isnull().all()}
122			# noinspection PyUnresolvedReferences
123			logger.notice(f"Copying {sources} to {targets}")
124			source = next(iter(sources))
125			# watch out! these are simply in order, nothing more
126			remapped = {t: [] for t in targets}
127			for source_val in df[source].values:
128			matches = cls._matches(source, source_val, targets)
129			for target, target_val in matches.items():
130			remapped[target].append(target_val)
131			remapped.update(matches)
132			for target in targets:
133			df[target] = remapped[target]
134
135			@classmethod
136			def _matches(cls, source: str, source_val: str, targets: Set[str]) -> Mapping[str, str]:
137			if source == "pubchem_id":
138			inchikey = Apis.Pubchem.find_inchikey(int(source_val))
139			elif source == "chembl_id":
140			# TODO
			0 ignored issues – show Coding Style introduced 2021-08-02 23:39 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
141			# get_compound wants an inchikey,
142			# but we're secretly passing a CHEMBLxxxx ID instead
143			# we just know that that works
144			inchikey = ChemblUtils(Apis.Chembl).get_compound(source_val).inchikey
145			elif source == "inchikey":
146			inchikey = source
147			else:
148			raise AssertionError(source)
149			matched = {} if source == "inchikey" else dict(inchikey=inchikey)
150			if "pubchem_id" in targets:
151			pubchem_id = ChemFinder.pubchem().find(inchikey)
152			if pubchem_id is not None:
153			matched["pubchem_id"] = str(pubchem_id)
154			if "chembl_id" in targets:
155			chembl_id = ChemFinder.chembl().find(inchikey)
156			if chembl_id is not None:
157			matched["chembl_id"] = chembl_id
158			return matched
159
160
161			class Searcher:
162			"""
163			Executes one or more searches and saves the results to CSV files.
164			Create and use once.
165			"""
166
167			def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
168			"""
169			Constructor.
170
171			Args:
172			searches:
173			input_path: Path to the input file of one of the formats:
174			- .txt containing one InChI Key per line
175			- .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather containing a column called inchikey
			0 ignored issues – show Coding Style introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report This line is too long as per the coding-style (109/100). This check looks for lines that are too long. You can specify the maximum line length. Loading history...
176			"""
177			self.what = searches
178			self.input_path: Optional[Path] = input_path
179			self.input_df: InputFrame = None
180			self.output_paths = {
181			what.key: EntryPaths.output_path_of(what, input_path, path)
182			for what, path in CommonTools.zip_list(searches, to)
183			}
184
185			def search(self) -> Searcher:
186			"""
187			Performs the search, and writes data.
188			"""
189			if self.input_df is not None:
190			raise ValueError(f"Already ran a search")
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
191			self.input_df = SearcherUtils.read(self.input_path)
192			inchikeys = self.input_df["inchikey"].unique()
193			has_pubchem = any((isinstance(what, PubchemSearch) for what in self.what))
			0 ignored issues – show Comprehensibility Best Practice introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report The variable `what` does not seem to be defined. Loading history...
194			has_chembl = any((isinstance(what, ChemblSearch) for what in self.what))
195			# find the compounds first so the user knows what's missing before proceeding
196			SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
197			for what in self.what:
198			output_path = self.output_paths[what.key]
199			metadata_path = output_path.with_suffix(".json.metadata")
200			df = what.find_to_df(inchikeys)
			0 ignored issues – show Coding Style Naming introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]2,\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
201			# keep all of the original extra columns from the input
202			# e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
203			for extra_col in [c for c in self.input_df.columns if c != "inchikey"]:
204			extra_mp = self.input_df.set_index("inchikey")[extra_col].to_dict()
205			df[extra_col] = df["lookup"].map(extra_mp.get)
206			# write the (intermediate) file
207			df.write_file(output_path)
208			# write metadata
209			params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
210			metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
211			metadata.write_json(metadata_path)
212			logger.info(f"Wrote {what.key} to {output_path}")
213			return self
214
215
216			__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils", "CompoundIdFiller", "InputFrame"]
217

dmyersturnbull / mandos

Push — main ( ec3fe3...82dd22 )

CompoundIdFiller._matches() C

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like