Passed
Push — main (da77b5...65730f)
by Douglas
02:28
created

mandos.entry.tools.multi_searches   B

Complexity

Total Complexity 45

Size/Duplication

Total Lines 242
Duplicated Lines 0 %

Importance

Changes 0
Metric   Value
eloc     187
dl       0     (duplicated lines)
loc      242   (total lines)
rs       8.8
c        0
b        0
f        0
wmc      45    (total complexity)

2 Functions

Rating   Name   Duplication   Size   Complexity  
A _no_illegal_cols() 0 5 2
A _no_duplicate_keys() 0 6 2

16 Methods

Rating   Name   Duplication   Size   Complexity  
A MultiSearch.final_path() 0 4 1
A CmdRunner.build() 0 17 2
A CmdRunner.was_run() 0 9 4
A CmdRunner.run() 0 5 4
B CmdRunner.test() 0 5 6
A MultiSearch._build_commands() 0 22 4
A MultiSearch.final_checksum_path() 0 3 1
A MultiSearch.doc_path() 0 3 1
A CmdRunner.done_path() 0 3 1
A MultiSearch._get_log_path() 0 6 2
A MultiSearch.__post_init__() 0 5 5
A MultiSearch.is_complete() 0 3 1
A CmdRunner.output_path() 0 3 1
B MultiSearch.run() 0 25 5
A CmdRunner.key() 0 3 1
A MultiSearch.write_docs() 0 12 2
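The two tables above account for the full score: the 2 functions contribute 2 + 2 = 4 and the 16 methods contribute 41, which together give the wmc / Total Complexity of 45 reported at the top.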

How to fix: Complexity

Complex classes like mandos.entry.tools.multi_searches often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined the fields and methods that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
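As a concrete illustration, the final_path, final_checksum_path, doc_path, and is_complete properties of MultiSearch all derive paths from out_dir, input_path, and suffix, so they form one such cohesive group. Below is a minimal, hypothetical sketch of extracting them into a small value object; the name SearchPaths and the checksum_suffix stand-in for SETTINGS.search_checksum_alg are assumptions for illustration only, not part of the project.

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class SearchPaths:
    """Derives every output path for one multi-search run (hypothetical helper)."""

    input_path: Path
    out_dir: Path
    suffix: str
    checksum_suffix: str = ".sha256"  # stand-in for the configured checksum algorithm

    @property
    def final_path(self) -> Path:
        # mirrors MultiSearch.final_path
        return self.out_dir / ("search_" + self.input_path.name + self.suffix)

    @property
    def final_checksum_path(self) -> Path:
        # simplified: the real code asks Checksums for the hash-file path
        return self.final_path.with_name(self.final_path.name + self.checksum_suffix)

    @property
    def doc_path(self) -> Path:
        # mirrors MultiSearch.doc_path
        return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")

    @property
    def is_complete(self) -> bool:
        return self.final_checksum_path.exists()

MultiSearch would then hold a single paths field and delegate to it, which removes several properties from the class and keeps the checksum naming in one place; CmdRunner.output_path and done_path could get the same treatment.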

Issues flagged inline in the source below:
Unresolved third-party imports: pandas, typer, pocketutils, typeddfs
Missing docstrings on MultiSearch, CmdRunner, and most of their methods and properties
MultiSearch.run: f-string without any interpolated variables
MultiSearch.run and MultiSearch.write_docs: variable name "df" does not conform to snake_case naming style
MultiSearch._build_commands: TODO comment (TODO and FIXME comments should generally be avoided)
CmdRunner.run: the argument "proceed" seems to be unused
CmdRunner.build: consider merging the comparisons with "in", i.e. k not in ("source", "category")

"""
Runner.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Mapping, MutableMapping, Optional, Sequence, Type, Union

import pandas as pd
import typer
from pocketutils.core.exceptions import (
    IllegalStateError,
    InjectionError,
    PathExistsError,
)
from pocketutils.misc.fancy_loguru import LogSinkInfo
from typeddfs import TypedDfs
from typeddfs.abs_dfs import AbsDf
from typeddfs.checksums import Checksums
from typeddfs.utils import Utils

from mandos import logger
from mandos.entry.abstract_entries import Entry
from mandos.entry.api_singletons import Apis
from mandos.entry.entry_commands import Entries
from mandos.entry.utils._arg_utils import EntryUtils
from mandos.model.hit_dfs import HitDf
from mandos.model.settings import SETTINGS

cli = typer.Typer()
Apis.set_default()
Chembl, Pubchem = Apis.Chembl, Apis.Pubchem

EntriesByCmd: MutableMapping[str, Type[Entry]] = {e.cmd(): e for e in Entries}

# these are not permitted in individual searches
forbidden_keys = {"to", "no_setup"}

SearchExplainDf = (
    TypedDfs.typed("SearchExplainDf")
    .require("key", "search", "source", dtype=str)
    .require("desc", "args", dtype=str)
    .reserve("category", dtype=str)
    .strict()
    .secure()
).build()


def _no_duplicate_keys(self: AbsDf) -> Optional[str]:
    group = self[["key"]].groupby("key").count().to_dict()
    bad = {k for k, v in group.items() if v > 1}
    if len(bad) > 0:
        return f"Duplicate keys: {', '.join(bad)}"
    return None


def _no_illegal_cols(self: AbsDf) -> Optional[str]:
    illegal = {c for c in ["to", "path"] if c in self.columns}
    if len(illegal) > 0:
        return f"Illegal keys {', '.join(illegal)}"
    return None


SearchConfigDf = (
    TypedDfs.typed("SearchConfigDf")
    .require("key", "source", dtype=str)
    .verify(_no_duplicate_keys)
    .verify(_no_illegal_cols)
    .add_read_kwargs("toml", aot="search")
    .add_write_kwargs("toml", aot="search")
    .secure()
    .build()
)


@dataclass(frozen=True, repr=True)
class MultiSearch:
    config: SearchConfigDf
    input_path: Path
    out_dir: Path
    suffix: str
    replace: bool
    proceed: bool
    log_path: Optional[Path]

    @property
    def final_path(self) -> Path:
        name = "search_" + self.input_path.name + self.suffix
        return self.out_dir / name

    @property
    def final_checksum_path(self) -> Path:
        return Checksums.get_hash_file(self.final_path, algorithm=SETTINGS.search_checksum_alg)

    @property
    def is_complete(self):
        return self.final_checksum_path.exists()

    @property
    def doc_path(self) -> Path:
        return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")

    def __post_init__(self):
        if not self.replace and self.is_complete:
            raise PathExistsError(f"Path {self.final_path} is complete but --replace is not set")
        if not self.proceed and self.final_path.exists():
            raise PathExistsError(f"Path {self.final_path} exists but --proceed is not set")

    def run(self) -> None:
        # build up the list of Entry classes first, and run ``test`` on each one
        # that's to check that the parameters are correct before running anything
        commands = self._build_commands()
        if len(commands) == 0:
            logger.warning(f"No searches — nothing to do")
            return
        # write a file describing all of the searches
        self.write_docs(commands)
        # build and test
        for cmd in commands:
            try:
                cmd.test(replace=self.replace, proceed=self.proceed)
            except Exception:
                logger.error(f"Bad search {cmd}")
                raise
        logger.notice("Searches look ok")
        # start!
        for cmd in commands:
            cmd.run(replace=self.replace, proceed=self.proceed)
        logger.notice("Done with all searches!")
        # write the final file
        df = HitDf(pd.concat([HitDf.read_file(cmd.output_path) for cmd in commands]))
        df.write_file(self.final_path, file_hash=True)
        logger.notice(f"Concatenated results to {self.final_path}")

    def _build_commands(self) -> Sequence[CmdRunner]:
        commands = {}
        for i in range(len(self.config)):
            data = {
                k: v
                for k, v in self.config.iloc[i].to_dict().items()
                if v is not None and not pd.isna(v)
            }
            key = data["key"]
            with logger.contextualize(key=key):
                default_to = self.out_dir / (key + SETTINGS.table_suffix)
                # TODO: produces bad logging about being overwritten
                data["to"] = EntryUtils.adjust_filename(None, default=default_to, replace=True)
                data["log"] = self._get_log_path(key)
                cmd = CmdRunner.build(data, self.input_path)
                commands[cmd.key] = cmd
        # log about replacing
        replacing = {k for k, v in commands.items() if v.was_run}
        if len(replacing) > 0:
            replacing = Utils.join_to_str(replacing, last="and")
            logger.notice(f"Overwriting results for {replacing}")
        return list(commands.values())

    def write_docs(self, commands: Sequence[CmdRunner]) -> None:
        rows = []
        for cmd in commands:
            name = cmd.cmd.get_search_type().search_name()
            cat = cmd.category
            src = cmd.cmd.get_search_type().primary_data_source()
            desc = cmd.cmd.describe()
            args = ", ".join([f"{k}={v}" for k, v in cmd.params.items()])
            ser = dict(key=cmd.key, search=name, category=cat, source=src, desc=desc, args=args)
            rows.append(pd.Series(ser))
        df = SearchExplainDf(rows)
        df.write_file(self.doc_path, mkdirs=True)

    def _get_log_path(self, key: str):
        if self.log_path is None:
            suffix = SETTINGS.log_suffix
        else:
            suffix = LogSinkInfo.guess(self.log_path).suffix
        return self.out_dir / (key + suffix)


@dataclass(frozen=True, repr=True)
class CmdRunner:
    cmd: Type[Entry]
    params: MutableMapping[str, Union[int, str, float]]
    input_path: Path
    category: Optional[str]

    @property
    def key(self) -> str:
        return self.params["key"]

    @property
    def output_path(self) -> Path:
        return Path(self.params["to"])

    @property
    def done_path(self) -> Path:
        return Checksums.get_hash_dir(self.output_path, algorithm=SETTINGS.search_checksum_alg)

    @property
    def was_run(self) -> bool:
        if not self.done_path.exists():
            return False
        sums = Checksums.parse_hash_file_resolved(self.done_path)
        done = self.output_path in sums
        if done and not self.output_path.exists():
            raise IllegalStateError(f"{self.output_path} marked complete but does not exist")
        return done

    def test(self, *, replace: bool, proceed: bool) -> None:
        if self.output_path.exists() and not self.was_run and not proceed and not replace:
            raise PathExistsError(f"Path {self.output_path} exists but not finished")
        with logger.contextualize(key=self.key):
            self.cmd.test(self.input_path, **self.params)

    def run(self, *, replace: bool, proceed: bool) -> None:
        # we already checked that we're allowed to proceed
        if replace or not self.was_run:
            with logger.contextualize(key=self.key):
                self.cmd.run(self.input_path, **self.params)

    @classmethod
    def build(cls, data: Mapping[str, Any], input_path: Path):
        key, cmd = data["key"], data["source"]
        try:
            cmd = EntriesByCmd[cmd]
        except KeyError:
            raise InjectionError(f"Search command {cmd} (key {key}) does not exist") from None
        params = {}
        # we need to explicitly add the defaults from the OptionInfo instances
        params.update(cmd.default_param_values().items())
        # do this after: the defaults had path, key, and to
        params["key"] = key
        # now add the params we got for this command's section
        params.update({k: v for k, v in data.items() if k != "source" and k != "category"})
        category = data.get("category")
        runner = CmdRunner(cmd, params, input_path, category)
        return runner


__all__ = ["MultiSearch", "SearchExplainDf", "SearchConfigDf"]