Passed
Push — main ( 82dd22...9813db )
by Douglas
01:58
created

mandos.entries.searcher.CompoundIdFiller.fill()   A

Complexity

Conditions 4

Size

Total Lines 20
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 17
nop 2
dl 0
loc 20
rs 9.55
c 0
b 0
f 0
1
"""
2
Run searches and write files.
3
"""
4
5
from __future__ import annotations
6
7
from dataclasses import dataclass
8
from pathlib import Path
9
from typing import Callable, Optional, Sequence
10
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
13
from pocketutils.tools.common_tools import CommonTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.common_tools'
Loading history...
14
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
15
16
from mandos import logger
17
from mandos.entries.api_singletons import Apis
18
from mandos.entries.paths import EntryPaths
19
from mandos.model import CompoundNotFoundError
20
from mandos.model.apis.chembl_support.chembl_utils import ChemblUtils
21
from mandos.model.apis.pubchem_api import PubchemApi
0 ignored issues
show
Unused Code introduced by
Unused PubchemApi imported from mandos.model.apis.pubchem_api
Loading history...
22
from mandos.model.apis.pubchem_support.pubchem_data import PubchemData
0 ignored issues
show
Unused Code introduced by
Unused PubchemData imported from mandos.model.apis.pubchem_support.pubchem_data
Loading history...
23
from mandos.model.searches import Search
24
from mandos.search.chembl import ChemblSearch
25
from mandos.search.pubchem import PubchemSearch
26
27
28
def _get_structure(df) -> Optional[Sequence[str]]:
    """
    Returns the structure strings of a frame, preferring InChI over SMILES.

    Args:
        df: A DataFrame that may contain an ``inchi`` and/or a ``smiles`` column

    Returns:
        The values of the ``inchi`` column if it exists, otherwise the values
        of the ``smiles`` column if it exists, otherwise None
    """
    # listed in order of preference; the first column that is present wins
    for column in ("inchi", "smiles"):
        if column in df.columns:
            return df[column].values
    return None
34
35
36
def _fix_cols(df):
    """
    Returns a copy of ``df`` in which every column name is lower-cased.

    Args:
        df: A DataFrame whose column names are all strings
    """
    lowercased = {name: name.lower() for name in df.columns}
    return df.rename(columns=lowercased)
38
39
40
# Typed DataFrame class for the user-supplied input compounds.
# NOTE(review): the per-call comments below read the typeddfs builder methods
# by their names; confirm the exact semantics against the typeddfs docs.
InputFrame = (
    TypedDfs.typed("InputFrame")
    # "inchikey" is the only column declared as required
    .require("inchikey")
    # known optional columns, declared with dtype str
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # post-processing hook: _fix_cols lower-cases all column names
    .post(_fix_cols)
    # presumably: strict about the index, lenient about extra columns -- verify
    .strict(index=True, cols=False)
).build()
# Attach _get_structure as an attribute of the class, so that calling
# ``frame.get_structures()`` on an instance passes the frame itself as ``df``.
InputFrame.get_structures = _get_structure
48
49
50
# Typed DataFrame class returned by ``SearcherUtils.dl``: input InChI Keys
# alongside the database IDs they were matched to.
# NOTE(review): every column here is declared via ``reserve`` (none via
# ``require``); presumably that makes them all optional -- confirm against
# the typeddfs docs.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .reserve("inchikey", dtype=str)
    # structure / user-ID columns, matching the ones reserved on InputFrame
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # per-database IDs; SearcherUtils.dl fills chembl_id and pubchem_id
    .reserve("chembl_id", "pubchem_id", "hmdb_id", dtype=str)
    .reserve("origin_inchikey", "origin_smiles", dtype=str)
    .reserve("library", dtype=str)
    # presumably: strict about the index, lenient about extra columns -- verify
    .strict(index=True, cols=False)
).build()
59
60
61
@dataclass(frozen=True, repr=True)
class ChemFinder:
    """
    Looks up a database-specific compound ID from an InChI Key.

    Attributes:
        what: Name of the database; used only in log messages
        how: Function that maps an InChI Key to the database's ID.
             It may raise ``CompoundNotFoundError`` or return None
             to signal that the compound is missing.
    """

    what: str
    how: Callable[[str], str]

    @classmethod
    def chembl(cls) -> ChemFinder:
        """
        Returns a finder that maps InChI Keys to ChEMBL IDs.
        """

        def how(inchikey: str) -> str:
            return ChemblUtils(Apis.Chembl).get_compound(inchikey).chid

        return cls("ChEMBL", how)

    @classmethod
    def pubchem(cls) -> ChemFinder:
        """
        Returns a finder that maps InChI Keys to PubChem IDs.
        """

        def how(inchikey: str) -> str:
            # noinspection PyTypeChecker
            return Apis.Pubchem.find_id(inchikey)

        return cls("PubChem", how)

    def find(self, inchikey: str) -> Optional[str]:
        """
        Finds the ID of a compound in this database.

        A missing compound is logged rather than raised.

        Args:
            inchikey: The InChI Key to look up

        Returns:
            The ID converted to a str, or None if ``how`` raised
            ``CompoundNotFoundError`` or returned None
        """
        try:
            data = self.how(inchikey)
        except CompoundNotFoundError:
            data = None
            logger.debug(f"Did not find {self.what} {inchikey}", exc_info=True)
        if data is None:
            logger.info(f"NOT FOUND: {self.what.rjust(8)}  ] {inchikey}")
            # return a real None; str(None) would hand callers the text "None",
            # which is indistinguishable from a found ID
            return None
        return str(data)
90
91
92
class SearcherUtils:
    """
    Helpers for ``Searcher``: resolving database IDs and reading the input file.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
    ) -> IdMatchFrame:
        """
        Looks up the database IDs for a list of InChI Keys.

        Compounds that cannot be found are logged by ``ChemFinder.find``.

        Args:
            inchikeys: The InChI Keys to look up; may be empty
            pubchem: Whether to add a ``pubchem_id`` column
            chembl: Whether to add a ``chembl_id`` column
            hmdb: Currently ignored; no HMDB lookup is performed

        Returns:
            A frame with an ``inchikey`` column (one row per key, in order)
            plus one ID column per enabled database
        """
        # Build from a column dict rather than from a list of one-item Series:
        # with empty input the latter yields a frame without an "inchikey"
        # column, and the lookups below would raise KeyError.
        df = IdMatchFrame({"inchikey": list(inchikeys)})
        if chembl:
            df["chembl_id"] = df["inchikey"].map(ChemFinder.chembl().find)
        if pubchem:
            df["pubchem_id"] = df["inchikey"].map(ChemFinder.pubchem().find)
        return df

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input compounds from a file and logs how many were read.

        Args:
            input_path: Path passed to ``InputFrame.read_file``

        Returns:
            The input compounds as an ``InputFrame``
        """
        df = InputFrame.read_file(input_path)
        logger.info(f"Read {len(df)} input compounds")
        return df
113
114
115
class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run, in order
            to: Output paths, paired with ``searches`` by position and passed
                to ``EntryPaths.output_path_of`` to get the final output path
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until search() runs; search() uses this to refuse a second run
        self.input_df: Optional[InputFrame] = None
        self.output_paths = {
            what.key: EntryPaths.output_path_of(what, input_path, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the results to its output path and a JSON
        metadata file next to it (same path with the last suffix replaced
        by ``.json.metadata``).

        Returns:
            This instance

        Raises:
            ValueError: If this instance has already run a search
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        # inchikey -> value, for each extra input column;
        # computed once because it is the same for every search
        by_inchikey = self.input_df.set_index("inchikey")
        extra_maps = {
            extra_col: by_inchikey[extra_col].to_dict()
            for extra_col in self.input_df.columns
            if extra_col != "inchikey"
        }
        for what in self.what:
            output_path = self.output_paths[what.key]
            metadata_path = output_path.with_suffix(".json.metadata")
            df = what.find_to_df(inchikeys)
            # keep all of the original extra columns from the input
            # e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
            for extra_col, extra_mp in extra_maps.items():
                df[extra_col] = df["lookup"].map(extra_mp.get)
            # write the (intermediate) file
            df.write_file(output_path)
            # write metadata
            params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
            metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
            metadata.write_json(metadata_path)
            logger.info(f"Wrote {what.key} to {output_path}")
        return self
168
169
170
# Names exported by a star-import of this module.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils", "InputFrame"]
171