Passed
Push — main ( 83a9fb...fa90c4 )
by Douglas
03:43
created

mandos.entry.multi_searches.MultiSearch.build()   A

Complexity

Conditions 1

Size

Total Lines 21
Code Lines 20

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 20
nop 7
dl 0
loc 21
rs 9.4
c 0
b 0
f 0
1
"""
2
Runner.
3
"""
4
5
from __future__ import annotations
6
7
from dataclasses import dataclass
8
from pathlib import Path
9
from typing import Any, Mapping, MutableMapping, Optional, Sequence, Type, Union
10
11
import pandas as pd
0 ignored issues
show
introduced by
Unable to import 'pandas'
Loading history...
12
import typer
0 ignored issues
show
introduced by
Unable to import 'typer'
Loading history...
13
from pocketutils.core.exceptions import PathExistsError
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.exceptions'
Loading history...
14
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
15
from typeddfs.abs_dfs import AbsDf
0 ignored issues
show
introduced by
Unable to import 'typeddfs.abs_dfs'
Loading history...
16
from typeddfs.checksums import Checksums
0 ignored issues
show
introduced by
Unable to import 'typeddfs.checksums'
Loading history...
17
from typeddfs.utils import Utils
0 ignored issues
show
introduced by
Unable to import 'typeddfs.utils'
Loading history...
18
19
from mandos.entry._arg_utils import EntryUtils
20
from mandos.entry.abstract_entries import Entry
21
from mandos.entry.api_singletons import Apis
22
from mandos.entry.entry_commands import Entries
23
from mandos.model.hit_dfs import HitDf
24
from mandos.model.settings import SETTINGS
25
from mandos.model.utils.reflection_utils import InjectionError
26
from mandos.model.utils.setup import MandosLogging, logger, LogSinkInfo
0 ignored issues
show
Unused Code introduced by
Unused MandosLogging imported from mandos.model.utils.setup
Loading history...
27
28
# Typer application object for this module.
cli = typer.Typer()
29
# Module-level side effect: installs the default API singletons at import time.
Apis.set_default()
30
# Short aliases for the two API singletons exposed by ``Apis``.
Chembl, Pubchem = Apis.Chembl, Apis.Pubchem
31
32
# Lookup from an entry's command name (``Entry.cmd()``) to its Entry class.
EntriesByCmd: MutableMapping[str, Type[Entry]] = {e.cmd(): e for e in Entries}
33
34
# these are not permitted in individual searches
35
forbidden_keys = {"to", "no_setup"}
36
37
# Typed DataFrame describing each search: string columns key, search, source,
# desc, and args are required; a string column "category" is reserved (optional).
SearchExplainDf = (
38
    TypedDfs.typed("SearchExplainDf")
39
    .require("key", "search", "source", dtype=str)
40
    .require("desc", "args", dtype=str)
41
    .reserve("category", dtype=str)
42
    .strict()
43
    .secure()
44
).build()
45
46
47
def _no_duplicate_keys(self: AbsDf):
0 ignored issues
show
Unused Code introduced by
Either all return statements in a function should return an expression, or none of them should.
Loading history...
48
    group = self[["key"]].groupby("key").count().to_dict()
49
    bad = {k for k, v in group.items() if v > 1}
50
    if len(bad) > 0:
51
        return f"Duplicate keys: {', '.join(bad)}"
52
53
54
def _no_illegal_cols(self: AbsDf):
0 ignored issues
show
Unused Code introduced by
Either all return statements in a function should return an expression, or none of them should.
Loading history...
55
    if "to" in self.columns:
56
        return "Illegal key 'to'"
57
    if "path" in self.columns:
58
        return "Illegal key 'path'"
59
60
61
# Typed DataFrame for the multi-search configuration: string columns "key" and
# "source" are required, and the frame is checked by ``_no_duplicate_keys`` and
# ``_no_illegal_cols``. TOML reads and writes are passed ``aot="search"``
# (presumably the name of the array of tables holding the searches -- TODO confirm).
SearchConfigDf = (
62
    TypedDfs.typed("SearchConfigDf")
63
    .require("key", "source", dtype=str)
64
    .verify(_no_duplicate_keys)
65
    .verify(_no_illegal_cols)
66
    .add_read_kwargs("toml", aot="search")
67
    .add_write_kwargs("toml", aot="search")
68
    .secure()
69
    .build()
70
)
71
72
73
@dataclass(frozen=True, repr=True)
class MultiSearch:
    """
    Runs every search listed in a configuration table, then concatenates the results.

    Raises:
        PathExistsError: On construction, if ``replace`` is false and either
            ``final_path`` or ``doc_path`` already exists.
    """

    # One row per search; each row is turned into a ``CmdRunner``.
    config: SearchConfigDf
    # Input file handed to every search command.
    input_path: Path
    # Directory for the concatenated results, the description table, and per-search logs.
    out_dir: Path
    # Filename suffix appended to the concatenated results file.
    suffix: str
    # Whether existing outputs may be overwritten.
    replace: bool
    # Optional log path; only its suffix (via ``LogSinkInfo.guess``) is used here.
    log_path: Optional[Path]

    @property
    def final_path(self) -> Path:
        """Path of the concatenated results: ``out_dir / ("search_" + input name + suffix)``."""
        return self.out_dir / ("search_" + self.input_path.name + self.suffix)

    @property
    def doc_path(self) -> Path:
        """Path of the table describing the searches: ``final_path`` minus its suffix, plus ``_doc.tsv``."""
        return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")

    def __post_init__(self):
        """Refuse to continue if an output exists and ``replace`` is not set."""
        if self.replace:
            return
        for path in (self.final_path, self.doc_path):
            if path.exists():
                raise PathExistsError(f"Path {path} exists but --replace is not set")

    def run(self) -> None:
        """
        Build, document, test, and run all searches, then write the concatenated hits.

        Every command's ``test`` is called before any command's ``run``, so a bad
        parameter set is reported (and re-raised) before any search starts.
        """
        commands = self._build_commands()
        if not commands:
            logger.warning("No searches; nothing to do")
            return
        # write a file describing all of the searches
        self.write_docs(commands)
        # check that the parameters are correct before running anything
        for cmd in commands:
            try:
                cmd.test()
            except Exception:
                logger.error(f"Bad search {cmd}")
                raise
        logger.notice("Searches look ok.")
        # start!
        for cmd in commands:
            cmd.run()
        logger.notice("Done with all searches!")
        # write the final file
        hits = HitDf(pd.concat([HitDf.read_file(cmd.output_path) for cmd in commands]))
        hits.write_file(self.final_path, file_hash=True)
        logger.notice(f"Concatenated file to {self.final_path}")

    def _build_commands(self) -> Sequence[CmdRunner]:
        """
        Create one ``CmdRunner`` per config row, keyed by the row's ``key``.

        Null cells are dropped from each row, and ``to`` and ``log`` are filled in
        before the row is passed to ``CmdRunner.build``.
        """
        commands = {}
        for _, row in self.config.iterrows():
            data = {k: v for k, v in row.to_dict().items() if v is not None and not pd.isna(v)}
            key = data["key"]
            # NOTE(review): per-search outputs go next to the input file, whereas logs and
            # the final file go to ``out_dir`` -- confirm this is intended.
            default_to = self.input_path.parent / (key + SETTINGS.table_suffix)
            data["to"] = EntryUtils.adjust_filename(None, default=default_to, replace=self.replace)
            data["log"] = self._get_log_path(key)
            cmd = CmdRunner.build(data, self.input_path)
            commands[cmd.key] = cmd
        # log about replacing
        replacing = {k for k, v in commands.items() if v.was_run}
        if replacing:
            replacing = Utils.join_to_str(replacing, last="and")
            logger.notice(f"Overwriting results for {replacing}.")
        return list(commands.values())

    def write_docs(self, commands: Sequence[CmdRunner]) -> None:
        """
        Write a ``SearchExplainDf`` with one row per command to ``doc_path``.

        Args:
            commands: The runners to describe.
        """
        rows = []
        for cmd in commands:
            search_type = cmd.cmd.get_search_type()
            args = ", ".join([f"{k}={v}" for k, v in cmd.params.items()])
            row = dict(
                key=cmd.key,
                search=search_type.search_name(),
                category=cmd.category,
                source=search_type.primary_data_source(),
                desc=cmd.cmd.describe(),
                args=args,
            )
            rows.append(pd.Series(row))
        SearchExplainDf(rows).write_file(self.doc_path, mkdirs=True)

    def _get_log_path(self, key: str) -> Path:
        """Return ``out_dir / (key + suffix)``, taking the suffix from ``log_path`` if one was given."""
        if self.log_path is None:
            suffix = SETTINGS.log_suffix
        else:
            suffix = LogSinkInfo.guess(self.log_path).suffix
        return self.out_dir / (key + suffix)
163
164
165
@dataclass(frozen=True, repr=True)
class CmdRunner:
    """
    A single search command bound to its parameters and input file.
    """

    # The Entry class whose ``test`` and ``run`` are invoked.
    cmd: Type[Entry]
    # Keyword arguments for the command; read here for the ``key`` and ``to`` entries.
    params: MutableMapping[str, Union[int, str, float]]
    # Input file passed as the first argument to the command.
    input_path: Path
    # Optional category label taken from the config row.
    category: Optional[str]

    @property
    def key(self) -> str:
        """The search's key, read from ``params["key"]``."""
        return self.params["key"]

    @property
    def output_path(self) -> Path:
        """Where this search writes its results, read from ``params["to"]``."""
        return Path(self.params["to"])

    @property
    def done_path(self) -> Path:
        """The hash file for the output's directory, per ``Checksums.get_hash_dir``."""
        return Checksums.get_hash_dir(self.output_path.parent)

    @property
    def was_run(self) -> bool:
        """Whether ``output_path`` is listed in the hash file at ``done_path``."""
        if not self.done_path.exists():
            return False
        return self.output_path in Checksums.parse_hash_file_resolved(self.done_path)

    def test(self) -> None:
        """Call the command's ``test`` with the input path and parameters."""
        self.cmd.test(self.input_path, **self.params)

    def run(self) -> None:
        """Call the command's ``run`` with the input path and parameters."""
        self.cmd.run(self.input_path, **self.params)

    @classmethod
    def build(cls, data: Mapping[str, Any], input_path: Path) -> CmdRunner:
        """
        Create a runner from one configuration row.

        Args:
            data: The row's values; must contain ``key`` and ``source``. ``source``
                and ``category`` are not forwarded as command parameters.
            input_path: The input file for the search.

        Returns:
            The new runner. An error is logged (not raised) if its output file exists
            but ``done_path`` does not.

        Raises:
            InjectionError: If ``source`` does not name a known search command.
        """
        key, source = data["key"], data["source"]
        try:
            cmd = EntriesByCmd[source]
        except KeyError:
            raise InjectionError(f"Search command {source} (key {key}) does not exist") from None
        # we need to explicitly add the defaults from the OptionInfo instances
        params = dict(cmd.default_param_values().items())
        # do this after: the defaults had path, key, and to
        params["key"] = key
        # now add the params we got for this command's section
        params.update({k: v for k, v in data.items() if k not in ("source", "category")})
        runner = cls(cmd, params, input_path, data.get("category"))
        if runner.output_path.exists() and not runner.done_path.exists():
            logger.error(f"Path {runner.output_path} exists but not marked as complete.")
        return runner
216
217
218
# Public API of this module; ``CmdRunner`` and the verification hooks are not exported.
__all__ = ["MultiSearch", "SearchExplainDf", "SearchConfigDf"]
219