Passed
Push — dependabot/pip/flake8-bugbear-... ( 16d864...b4f9fc )
by
unknown
01:45
created

mandos.entries.searcher.Searcher.paths()   A

Complexity

Conditions 1

Size

Total Lines 2
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 1
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
"""
2
Run searches and write files.
3
"""
4
5
from __future__ import annotations
6
7
import gzip
0 ignored issues
show
Unused Code introduced by
The import gzip seems to be unused.
Loading history...
8
from pathlib import Path
9
from typing import Sequence, Optional, Dict
10
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
13
from pocketutils.tools.common_tools import CommonTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.common_tools'
Loading history...
14
from pocketutils.tools.path_tools import PathTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.path_tools'
Loading history...
15
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
16
17
from mandos import logger
18
from mandos.model import CompoundNotFoundError
19
from mandos.model.chembl_support.chembl_utils import ChemblUtils
20
from mandos.model.searches import Search
21
from mandos.model.settings import MANDOS_SETTINGS
22
from mandos.search.chembl import ChemblSearch
23
from mandos.search.pubchem import PubchemSearch
24
from mandos.entries.api_singletons import Apis
25
26
# Input table: the only required column is the InChI Key of each compound.
InputFrame = TypedDfs.typed("InputFrame").require("inchikey").build()

# ID-matching table: one row per InChI Key with the IDs resolved for it.
# ``strict()`` is applied on top of the three required columns.
_id_match_builder = TypedDfs.typed("IdMatchFrame")
_id_match_builder = _id_match_builder.require("inchikey")
_id_match_builder = _id_match_builder.require("chembl_id")
_id_match_builder = _id_match_builder.require("pubchem_id")
IdMatchFrame = _id_match_builder.strict().build()
35
36
37
class SearcherUtils:
    """
    Helpers used before running searches: ID matching and input-file parsing.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
    ) -> IdMatchFrame:
        """
        Matches InChI Keys to PubChem and ChEMBL IDs and caches the result as a Feather file.

        Args:
            inchikeys: The InChI Keys to look up
            pubchem: Whether to look up PubChem compound IDs
            chembl: Whether to look up ChEMBL IDs
            hmdb: Currently ignored; accepted for interface compatibility

        Returns:
            An ``IdMatchFrame`` with one row per InChI Key; IDs that were not found
            (or were not requested) are left missing
        """
        # We cache the results here even though the underlying APIs also cache.
        # Searcher fetches before the FIRST search so the user sees what is missing,
        # and with multiple searches (``mandos search --config``) we only want that once.
        # Caveat: if the caches under /pubchem and /chembl are deleted, the cached
        # /match dataframes must be deleted as well for those to be re-populated.
        # TODO: invalidate the /match cache automatically in that case
        # noinspection PyPep8Naming
        Chembl, Pubchem = Apis.Chembl, Apis.Pubchem
        logger.notice(f"Using {Chembl}, {Pubchem}")
        key = cls._cache_key(inchikeys, pubchem, chembl)
        cached_path = (MANDOS_SETTINGS.match_cache_path / key).with_suffix(".feather")
        if cached_path.exists():
            logger.info(f"Found ID matching results at {cached_path}")
            return IdMatchFrame.read_feather(cached_path)
        found_pubchem: Dict[str, str] = {}
        found_chembl: Dict[str, str] = {}
        if pubchem:
            found_pubchem = cls._match_ids(
                inchikeys, "PubChem", lambda k: str(Pubchem.fetch_data(k).cid)
            )
        if chembl:
            chembl_utils = ChemblUtils(Chembl)
            found_chembl = cls._match_ids(
                inchikeys, "ChEMBL", lambda k: chembl_utils.get_compound(k).chid
            )
        # building from a dict keeps the 'inchikey' column even when there are no keys
        df = pd.DataFrame({"inchikey": list(inchikeys)})
        df["chembl_id"] = df["inchikey"].map(found_chembl.get)
        df["pubchem_id"] = df["inchikey"].map(found_pubchem.get)
        df = IdMatchFrame(df)
        cached_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_feather(cached_path)
        logger.info(f"Wrote {cached_path}")
        return df

    @staticmethod
    def _cache_key(inchikeys: Sequence[str], pubchem: bool, chembl: bool) -> str:
        """
        Builds a cache key that is stable across interpreter runs.

        The built-in ``hash`` is salted per process for strings, so it cannot name
        a file that a later run should find. The lookup flags are part of the key
        so that a frame cached with a source disabled is not reused when enabled.
        """
        import hashlib

        joined = ",".join(str(k) for k in inchikeys)
        payload = f"pubchem={bool(pubchem)};chembl={bool(chembl)};{joined}"
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    @classmethod
    def _match_ids(cls, inchikeys: Sequence[str], source: str, fetch_id) -> Dict[str, str]:
        """
        Looks up every InChI Key with ``fetch_id`` and logs hits and misses.

        Args:
            inchikeys: The InChI Keys to look up
            source: Database name used in the log messages
            fetch_id: Callable mapping an InChI Key to its ID;
                      raises ``CompoundNotFoundError`` when there is no match

        Returns:
            A dict from InChI Key to ID, containing only the keys that were found
        """
        found: Dict[str, str] = {}
        for inchikey in inchikeys:
            try:
                found_id = fetch_id(inchikey)
            except CompoundNotFoundError:
                logger.error(f"NOT FOUND: {source} {inchikey}")
                logger.debug(f"Did not find {source} {inchikey}", exc_info=True)
            else:
                found[inchikey] = found_id
                logger.info(f"Found:      {source} {inchikey} ({found_id})")
        return found

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads an input file of compounds into an ``InputFrame``.

        Args:
            input_path: File containing a column called ``inchikey``,
                        or a ``.lines``/``.txt`` file with one InChI Key per line

        Returns:
            The parsed ``InputFrame``

        Raises:
            ValueError: If there is no ``inchikey`` column and the filename
                        does not contain ``.lines`` or ``.txt``
        """
        df = TypedDfs.untyped("Input").read_file(input_path, header=None, comment="#")
        # ``columns`` is the standard pandas attribute; ``columns_names`` is not one.
        # NOTE(review): with ``header=None`` the labels may be positional -- confirm that
        # ``read_file`` still yields an 'inchikey' label for tabular inputs.
        if "inchikey" in df.columns:
            return InputFrame.convert(df)
        if ".lines" in input_path.name or ".txt" in input_path.name:
            df.columns = ["inchikey"]
            return InputFrame.convert(df)
        raise ValueError(f"Could not parse {input_path}; no column 'inchikey'")
102
103
104
class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run, in order
            to: One output target per search, aligned with ``searches``. Each is either
                ``None`` (use the default path), a suffix starting with ``.``
                (use the default path with that suffix), or an explicit output path
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until ``search`` has been called
        self.input_df: Optional[InputFrame] = None
        # the search itself (not the path) is needed to derive a default output path
        self.output_paths = {
            what.key: self._output_path_of(what, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the results to its output path and a ``.json``
        metadata file alongside it.

        Returns:
            This instance

        Raises:
            ValueError: If a search was already run on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        for what in self.what:
            output_path = self.output_paths[what.key]
            df = what.find_to_df(inchikeys)
            # TODO keep any other columns in input_df
            df.to_csv(output_path)
            # 'key' is stored at the top level of the metadata; 'api' is left out entirely
            params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
            metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
            metadata.write_json(output_path.with_suffix(".json"))
            logger.notice(f"Wrote {what.key} to {output_path}")
        return self

    def _output_path_of(self, what: Search, to: Optional[Path]) -> Path:
        """
        Resolves the output path for one search.

        Args:
            what: The search the output belongs to
            to: ``None``, a suffix starting with ``.``, or an explicit path

        Returns:
            The default path if ``to`` is ``None``; the default path with its suffix
            replaced if ``to`` starts with ``.``; otherwise ``to`` itself
        """
        if to is None:
            return self._default_path_of(what)
        if str(to).startswith("."):
            return self._default_path_of(what).with_suffix(str(to))
        return to

    def _default_path_of(self, what: Search) -> Path:
        """
        Returns ``<input stem>-output/<search key>.csv`` next to the input file.

        Creates the output directory if needed. The filename is passed through
        ``PathTools.sanitize_path_node``.
        """
        parent = self.input_path.parent / (self.input_path.stem + "-output")
        parent.mkdir(parents=True, exist_ok=True)
        child = what.key + ".csv"
        node = PathTools.sanitize_path_node(child)
        if (parent / node).resolve() != (parent / child).resolve():
            logger.debug(f"Path {child} sanitized to {node}")
        return parent / node
169
170
171
# Public API of this module. ``InputFrame`` is included because it is the
# return type of the public ``SearcherUtils.read``.
__all__ = ["Searcher", "InputFrame", "IdMatchFrame", "SearcherUtils"]
172