Passed
Push — main ( 4e4203...cdf0f7 )
by Douglas
01:39
created

mandos.entries.searcher.Searcher._output_path_of()   A

Complexity

Conditions 3

Size

Total Lines 7
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 6
nop 3
dl 0
loc 7
rs 10
c 0
b 0
f 0
1
"""
2
Run searches and write files.
3
"""
4
5
from __future__ import annotations
6
7
import gzip
0 ignored issues
show
Unused Code introduced by
The import gzip seems to be unused.
Loading history...
8
from pathlib import Path
9
from typing import Sequence, Optional, Dict
10
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
13
from pocketutils.tools.common_tools import CommonTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.common_tools'
Loading history...
14
from pocketutils.tools.path_tools import PathTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.path_tools'
Loading history...
15
from typeddfs import TypedDfs, UntypedDf
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
16
17
from mandos import logger
18
from mandos.model import CompoundNotFoundError
19
from mandos.model.chembl_support.chembl_utils import ChemblUtils
20
from mandos.model.searches import Search
21
from mandos.model.settings import MANDOS_SETTINGS
22
from mandos.search.chembl import ChemblSearch
23
from mandos.search.pubchem import PubchemSearch
24
from mandos.entries.api_singletons import Apis
25
26
# Input table: only an ``inchikey`` column is required.
InputFrame = TypedDfs.typed("InputFrame").require("inchikey").build()

# ID-matching table: one row per input InChI Key, with the ChEMBL and PubChem
# IDs that were found for it. Built with ``strict()``.
IdMatchFrame = (
    TypedDfs.typed("IdMatchFrame")
    .require("inchikey")
    .require("chembl_id")
    .require("pubchem_id")
    .strict()
    .build()
)
35
36
37
class SearcherUtils:
    """
    Helpers for ``Searcher``: reading the input file and matching compound IDs.
    """

    @classmethod
    def dl(
        cls,
        inchikeys: Sequence[str],
        pubchem: bool = True,
        chembl: bool = True,
        hmdb: bool = True,
    ) -> IdMatchFrame:
        """
        Looks up the PubChem and ChEMBL IDs of the given compounds.

        The resulting table is cached as a Feather file under
        ``MANDOS_SETTINGS.match_cache_path``; if a file for the same request
        already exists, it is read and returned without querying anything.

        Args:
            inchikeys: InChI Keys to look up
            pubchem: Whether to query PubChem
            chembl: Whether to query ChEMBL
            hmdb: Currently ignored; kept so existing callers keep working

        Returns:
            An ``IdMatchFrame`` with one row per InChI Key.
            IDs that were not found (or not queried) are left empty.
        """
        # we actually cache the results, even though the underlying APIs cache
        # the reasons for this are a little obscure --
        # when running a Searcher, we want to run before the FIRST search
        # for the typer commands to be replicas of the ``Entry.run`` methods,
        # Searcher fetches before running a search
        # but if we have multiple searches (as in ``mandos search --config``),
        # we only want that at the beginning
        # the alternative was having ``mandos search`` dynamically subclass each ``Entry``
        # -- which was really hard
        # this is much cleaner, even though it's redundant
        # if the cached results under /pubchem and /chembl are deleted,
        # we unfortunately won't cache the results when running this command
        # to fix that, we need to delete the cached /match dataframes
        # TODO: invalidate the /match cache when the per-API caches are deleted
        chembl_api, pubchem_api = Apis.Chembl, Apis.Pubchem
        logger.notice(f"Using {chembl_api}, {pubchem_api}")
        # The key must be identical across interpreter runs, so it cannot come from hash()
        key = cls._cache_key(inchikeys, pubchem=pubchem, chembl=chembl)
        cached_path = MANDOS_SETTINGS.match_cache_path / f"{key}.feather"
        if cached_path.exists():
            logger.info(f"Found ID matching results at {cached_path}")
            return IdMatchFrame.read_feather(cached_path)
        found_chembl: Dict[str, str] = {}
        found_pubchem: Dict[str, str] = {}
        if pubchem:
            for inchikey in inchikeys:
                try:
                    cid = pubchem_api.fetch_data(inchikey).cid
                    found_pubchem[inchikey] = str(cid)
                    logger.info(f"Found:      PubChem {inchikey} ({cid})")
                except CompoundNotFoundError:
                    logger.info(f"NOT FOUND: PubChem {inchikey}")
                    logger.debug(f"Did not find PubChem {inchikey}", exc_info=True)
        if chembl:
            chembl_utils = ChemblUtils(chembl_api)
            for inchikey in inchikeys:
                try:
                    chid = chembl_utils.get_compound(inchikey).chid
                    found_chembl[inchikey] = chid
                    logger.info(f"Found:      ChEMBL {inchikey} ({chid})")
                except CompoundNotFoundError:
                    logger.info(f"NOT FOUND: ChEMBL {inchikey}")
                    logger.debug(f"Did not find ChEMBL {inchikey}", exc_info=True)
        df = IdMatchFrame(cls._id_table(inchikeys, found_chembl, found_pubchem))
        cached_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_feather(cached_path)
        logger.info(f"Wrote {cached_path}")
        return df

    @classmethod
    def _cache_key(cls, inchikeys: Sequence[str], pubchem: bool, chembl: bool) -> str:
        """
        Builds a file-name-safe key that is stable across processes.

        The key covers the InChI Keys (in order) and which sources were queried,
        so a table built without one source is not reused once that source is enabled.
        """
        import hashlib

        payload = "|".join(
            [
                ",".join(str(k) for k in inchikeys),
                f"pubchem={int(bool(pubchem))}",
                f"chembl={int(bool(chembl))}",
            ]
        )
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    @classmethod
    def _id_table(
        cls,
        inchikeys: Sequence[str],
        found_chembl: Dict[str, str],
        found_pubchem: Dict[str, str],
    ) -> pd.DataFrame:
        """
        Builds the plain inchikey / chembl_id / pubchem_id table (also for empty input).
        """
        table = pd.DataFrame({"inchikey": list(inchikeys)})
        table["chembl_id"] = table["inchikey"].map(found_chembl.get)
        table["pubchem_id"] = table["inchikey"].map(found_pubchem.get)
        return table

    @classmethod
    def read(cls, input_path: Path) -> InputFrame:
        """
        Reads the input file and removes duplicate compounds.

        Exact duplicate rows are dropped silently (with an info message).
        Rows that share an InChI Key but are not identical are reported as
        errors, and only the first row per InChI Key is kept.

        Args:
            input_path: File to read; it needs a column called ``inchikey``,
                        unless its name contains ``.lines`` or ``.txt``, in which
                        case its column is taken to be the InChI Keys

        Returns:
            The input table with one row per InChI Key

        Raises:
            ValueError: If there is no ``inchikey`` column and the file name
                        contains neither ``.lines`` nor ``.txt``
        """
        df: UntypedDf = TypedDfs.untyped("Input").read_file(input_path, header=None, comment="#")
        if "inchikey" in df.column_names():
            df = InputFrame.convert(df)
        elif ".lines" in input_path.name or ".txt" in input_path.name:
            df.columns = ["inchikey"]
            df = InputFrame.convert(df)
        else:
            raise ValueError(f"Could not parse {input_path}; no column 'inchikey'")
        # first pass: rows that are identical in every column
        n_read = len(df)
        # noinspection PyTypeChecker
        df = df.drop_duplicates()
        n_distinct_rows = len(df)
        logger.info(f"Read {n_distinct_rows} input compounds")
        if n_read == n_distinct_rows:
            logger.info("There were no duplicate rows")
        else:
            logger.info(f"Dropped {n_read - n_distinct_rows} duplicated rows")
        # second pass: rows that share an inchikey but differ elsewhere
        duplicated = df[df.duplicated("inchikey", keep=False)]
        duplicated_inchikeys = set(duplicated["inchikey"])
        # noinspection PyTypeChecker
        df = df.drop_duplicates(subset=["inchikey"], keep="first")
        n_distinct_keys = len(df)
        if len(duplicated) > 1:
            logger.error(
                f"{len(duplicated)} rows contain the same inchikey"
                f" but have differences in other columns"
            )
            logger.error(f"Dropped {n_distinct_rows - n_distinct_keys} rows with duplicate inchikeys")
            logger.error(f"The offending inchikeys are {duplicated_inchikeys}")
        return df
126
127
128
class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Note that resolving a default output path creates the output directory
        (``<input stem>-output`` next to the input file) immediately.

        Args:
            searches: The searches to run, in order
            to: One output target per search, paired with ``searches``.
                Each entry is either ``None`` (use the default path),
                a bare suffix such as ``.csv.gz`` (default path with that suffix),
                or an explicit path.
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until search() has been called; also used as the "already ran" flag
        self.input_df: Optional[InputFrame] = None
        self.output_paths = {
            what.key: self._output_path_of(what, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        For each search, writes the results as CSV to its output path,
        plus a ``.json`` file next to it recording the key, search class, and parameters.

        Returns:
            This instance

        Raises:
            ValueError: If this instance already ran a search
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = SearcherUtils.read(self.input_path)
        inchikeys = self.input_df["inchikey"].unique()
        has_pubchem = any(isinstance(what, PubchemSearch) for what in self.what)
        has_chembl = any(isinstance(what, ChemblSearch) for what in self.what)
        # find the compounds first so the user knows what's missing before proceeding
        SearcherUtils.dl(inchikeys, pubchem=has_pubchem, chembl=has_chembl)
        for what in self.what:
            output_path = self.output_paths[what.key]
            df = what.find_to_df(inchikeys)
            # TODO keep any other columns in input_df
            df.to_csv(output_path)
            params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
            metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
            metadata.write_json(output_path.with_suffix(".json"))
            logger.notice(f"Wrote {what.key} to {output_path}")
        return self

    def _output_path_of(self, what: Search, to: Optional[Path]) -> Path:
        """
        Resolves the output path for one search.

        Args:
            what: The search
            to: ``None`` for the default path, a bare suffix (e.g. ``.csv.gz``)
                to apply to the default path, or an explicit path returned as-is

        Returns:
            The path to write the results of ``what`` to
        """
        if to is None:
            return self._default_path_of(what)
        text = str(to)
        # Only a single path component such as ".csv.gz" is a suffix;
        # "../out.csv" or ".hidden/out.csv" also start with "." but are real paths.
        is_bare_suffix = (
            text.startswith(".") and text not in {".", ".."} and Path(text).name == text
        )
        if is_bare_suffix:
            return self._default_path_of(what).with_suffix(text)
        return to

    def _default_path_of(self, what: Search) -> Path:
        """
        Returns ``<input dir>/<input stem>-output/<sanitized key>.csv``, creating the directory.
        """
        parent = self.input_path.parent / (self.input_path.stem + "-output")
        parent.mkdir(exist_ok=True)
        child = what.key + ".csv"
        node = PathTools.sanitize_path_node(child)
        if (parent / node).resolve() != (parent / child).resolve():
            logger.debug(f"Path {child} sanitized to {node}")
        return parent / node
191
192
193
# Names exported by ``from ... import *``; ``InputFrame`` is not included.
__all__ = ["Searcher", "IdMatchFrame", "SearcherUtils"]
194