Passed
Push — main (da77b5...65730f)
by Douglas
02:28
created

mandos.entry.tools.multi_searches   B

Complexity

Total Complexity 45

Size/Duplication

Total Lines 242
Duplicated Lines 0 %

Importance

Changes 0
Metric   Value
eloc     187
dl       0     (duplicated lines)
loc      242   (total lines)
rs       8.8
c        0
b        0
f        0
wmc      45    (total complexity)

2 Functions

Rating   Name   Duplication   Size   Complexity  
A _no_illegal_cols() 0 5 2
A _no_duplicate_keys() 0 6 2

16 Methods

Rating   Name   Duplication   Size   Complexity  
A MultiSearch.final_path() 0 4 1
A CmdRunner.build() 0 17 2
A CmdRunner.was_run() 0 9 4
A CmdRunner.run() 0 5 4
B CmdRunner.test() 0 5 6
A MultiSearch._build_commands() 0 22 4
A MultiSearch.final_checksum_path() 0 3 1
A MultiSearch.doc_path() 0 3 1
A CmdRunner.done_path() 0 3 1
A MultiSearch._get_log_path() 0 6 2
A MultiSearch.__post_init__() 0 5 5
A MultiSearch.is_complete() 0 3 1
A CmdRunner.output_path() 0 3 1
B MultiSearch.run() 0 25 5
A CmdRunner.key() 0 3 1
A MultiSearch.write_docs() 0 12 2
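The two tables above account for the full score: the 2 functions contribute 2 + 2 = 4 and the 16 methods contribute 41, which together give the wmc / Total Complexity of 45 reported at the top.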

How to fix: Complexity

Complex classes like mandos.entry.tools.multi_searches often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields and methods that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
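As a concrete illustration, the final_path, final_checksum_path, doc_path, and is_complete properties of MultiSearch all derive paths from out_dir, input_path, and suffix, so they form one such cohesive group. Below is a minimal, hypothetical sketch of extracting them into a small value object; the name SearchPaths and the checksum_suffix stand-in for SETTINGS.search_checksum_alg are assumptions for illustration only, not part of the project.

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class SearchPaths:
    """Derives every output path for one multi-search run (hypothetical helper)."""

    input_path: Path
    out_dir: Path
    suffix: str
    checksum_suffix: str = ".sha256"  # stand-in for the configured checksum algorithm

    @property
    def final_path(self) -> Path:
        # mirrors MultiSearch.final_path
        return self.out_dir / ("search_" + self.input_path.name + self.suffix)

    @property
    def final_checksum_path(self) -> Path:
        # simplified: the real code asks Checksums for the hash-file path
        return self.final_path.with_name(self.final_path.name + self.checksum_suffix)

    @property
    def doc_path(self) -> Path:
        # mirrors MultiSearch.doc_path
        return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")

    @property
    def is_complete(self) -> bool:
        return self.final_checksum_path.exists()

MultiSearch would then hold a single paths field and delegate to it, which removes several properties from the class and keeps the checksum naming in one place; CmdRunner.output_path and done_path could get the same treatment.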

Issues flagged inline in the source below:
Unresolved third-party imports: pandas, typer, pocketutils, typeddfs
Missing docstrings on MultiSearch, CmdRunner, and most of their methods and properties
MultiSearch.run: f-string without any interpolated variables
MultiSearch.run and MultiSearch.write_docs: variable name "df" does not conform to snake_case naming style
MultiSearch._build_commands: TODO comment (TODO and FIXME comments should generally be avoided)
CmdRunner.run: the argument "proceed" seems to be unused
CmdRunner.build: consider merging the comparisons with "in", i.e. k not in ("source", "category")

"""
Runner.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Mapping, MutableMapping, Optional, Sequence, Type, Union

import pandas as pd
import typer
from pocketutils.core.exceptions import (
    IllegalStateError,
    InjectionError,
    PathExistsError,
)
from pocketutils.misc.fancy_loguru import LogSinkInfo
from typeddfs import TypedDfs
from typeddfs.abs_dfs import AbsDf
from typeddfs.checksums import Checksums
from typeddfs.utils import Utils

from mandos import logger
from mandos.entry.abstract_entries import Entry
from mandos.entry.api_singletons import Apis
from mandos.entry.entry_commands import Entries
from mandos.entry.utils._arg_utils import EntryUtils
from mandos.model.hit_dfs import HitDf
from mandos.model.settings import SETTINGS

cli = typer.Typer()
Apis.set_default()
Chembl, Pubchem = Apis.Chembl, Apis.Pubchem

EntriesByCmd: MutableMapping[str, Type[Entry]] = {e.cmd(): e for e in Entries}

# these are not permitted in individual searches
forbidden_keys = {"to", "no_setup"}

SearchExplainDf = (
    TypedDfs.typed("SearchExplainDf")
    .require("key", "search", "source", dtype=str)
    .require("desc", "args", dtype=str)
    .reserve("category", dtype=str)
    .strict()
    .secure()
).build()


def _no_duplicate_keys(self: AbsDf) -> Optional[str]:
    group = self[["key"]].groupby("key").count().to_dict()
    bad = {k for k, v in group.items() if v > 1}
    if len(bad) > 0:
        return f"Duplicate keys: {', '.join(bad)}"
    return None


def _no_illegal_cols(self: AbsDf) -> Optional[str]:
    illegal = {c for c in ["to", "path"] if c in self.columns}
    if len(illegal) > 0:
        return f"Illegal keys {', '.join(illegal)}"
    return None


SearchConfigDf = (
    TypedDfs.typed("SearchConfigDf")
    .require("key", "source", dtype=str)
    .verify(_no_duplicate_keys)
    .verify(_no_illegal_cols)
    .add_read_kwargs("toml", aot="search")
    .add_write_kwargs("toml", aot="search")
    .secure()
    .build()
)


@dataclass(frozen=True, repr=True)
class MultiSearch:
    config: SearchConfigDf
    input_path: Path
    out_dir: Path
    suffix: str
    replace: bool
    proceed: bool
    log_path: Optional[Path]

    @property
    def final_path(self) -> Path:
        name = "search_" + self.input_path.name + self.suffix
        return self.out_dir / name

    @property
    def final_checksum_path(self) -> Path:
        return Checksums.get_hash_file(self.final_path, algorithm=SETTINGS.search_checksum_alg)

    @property
    def is_complete(self):
        return self.final_checksum_path.exists()

    @property
    def doc_path(self) -> Path:
        return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")

    def __post_init__(self):
        if not self.replace and self.is_complete:
            raise PathExistsError(f"Path {self.final_path} is complete but --replace is not set")
        if not self.proceed and self.final_path.exists():
            raise PathExistsError(f"Path {self.final_path} exists but --proceed is not set")

    def run(self) -> None:
        # build up the list of Entry classes first, and run ``test`` on each one
        # that's to check that the parameters are correct before running anything
        commands = self._build_commands()
        if len(commands) == 0:
            logger.warning(f"No searches — nothing to do")
            return
        # write a file describing all of the searches
        self.write_docs(commands)
        # build and test
        for cmd in commands:
            try:
                cmd.test(replace=self.replace, proceed=self.proceed)
            except Exception:
                logger.error(f"Bad search {cmd}")
                raise
        logger.notice("Searches look ok")
        # start!
        for cmd in commands:
            cmd.run(replace=self.replace, proceed=self.proceed)
        logger.notice("Done with all searches!")
        # write the final file
        df = HitDf(pd.concat([HitDf.read_file(cmd.output_path) for cmd in commands]))
        df.write_file(self.final_path, file_hash=True)
        logger.notice(f"Concatenated results to {self.final_path}")

    def _build_commands(self) -> Sequence[CmdRunner]:
        commands = {}
        for i in range(len(self.config)):
            data = {
                k: v
                for k, v in self.config.iloc[i].to_dict().items()
                if v is not None and not pd.isna(v)
            }
            key = data["key"]
            with logger.contextualize(key=key):
                default_to = self.out_dir / (key + SETTINGS.table_suffix)
                # TODO: produces bad logging about being overwritten
                data["to"] = EntryUtils.adjust_filename(None, default=default_to, replace=True)
                data["log"] = self._get_log_path(key)
                cmd = CmdRunner.build(data, self.input_path)
                commands[cmd.key] = cmd
        # log about replacing
        replacing = {k for k, v in commands.items() if v.was_run}
        if len(replacing) > 0:
            replacing = Utils.join_to_str(replacing, last="and")
            logger.notice(f"Overwriting results for {replacing}")
        return list(commands.values())

    def write_docs(self, commands: Sequence[CmdRunner]) -> None:
        rows = []
        for cmd in commands:
            name = cmd.cmd.get_search_type().search_name()
            cat = cmd.category
            src = cmd.cmd.get_search_type().primary_data_source()
            desc = cmd.cmd.describe()
            args = ", ".join([f"{k}={v}" for k, v in cmd.params.items()])
            ser = dict(key=cmd.key, search=name, category=cat, source=src, desc=desc, args=args)
            rows.append(pd.Series(ser))
        df = SearchExplainDf(rows)
        df.write_file(self.doc_path, mkdirs=True)

    def _get_log_path(self, key: str):
        if self.log_path is None:
            suffix = SETTINGS.log_suffix
        else:
            suffix = LogSinkInfo.guess(self.log_path).suffix
        return self.out_dir / (key + suffix)


@dataclass(frozen=True, repr=True)
class CmdRunner:
    cmd: Type[Entry]
    params: MutableMapping[str, Union[int, str, float]]
    input_path: Path
    category: Optional[str]

    @property
    def key(self) -> str:
        return self.params["key"]

    @property
    def output_path(self) -> Path:
        return Path(self.params["to"])

    @property
    def done_path(self) -> Path:
        return Checksums.get_hash_dir(self.output_path, algorithm=SETTINGS.search_checksum_alg)

    @property
    def was_run(self) -> bool:
        if not self.done_path.exists():
            return False
        sums = Checksums.parse_hash_file_resolved(self.done_path)
        done = self.output_path in sums
        if done and not self.output_path.exists():
            raise IllegalStateError(f"{self.output_path} marked complete but does not exist")
        return done

    def test(self, *, replace: bool, proceed: bool) -> None:
        if self.output_path.exists() and not self.was_run and not proceed and not replace:
            raise PathExistsError(f"Path {self.output_path} exists but not finished")
        with logger.contextualize(key=self.key):
            self.cmd.test(self.input_path, **self.params)

    def run(self, *, replace: bool, proceed: bool) -> None:
        # we already checked that we're allowed to proceed
        if replace or not self.was_run:
            with logger.contextualize(key=self.key):
                self.cmd.run(self.input_path, **self.params)

    @classmethod
    def build(cls, data: Mapping[str, Any], input_path: Path):
        key, cmd = data["key"], data["source"]
        try:
            cmd = EntriesByCmd[cmd]
        except KeyError:
            raise InjectionError(f"Search command {cmd} (key {key}) does not exist") from None
        params = {}
        # we need to explicitly add the defaults from the OptionInfo instances
        params.update(cmd.default_param_values().items())
        # do this after: the defaults had path, key, and to
        params["key"] = key
        # now add the params we got for this command's section
        params.update({k: v for k, v in data.items() if k != "source" and k != "category"})
        category = data.get("category")
        runner = CmdRunner(cmd, params, input_path, category)
        return runner


__all__ = ["MultiSearch", "SearchExplainDf", "SearchConfigDf"]