mandos.entry.multi_searches.MultiSearch.final_path() - Code Metrics - Inspection of "refactor: split out logging framework" - dmyersturnbull/mandos - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — main ( 5ce644...da77b5 )

by Douglas

created 2021-10-02 18:28 UTC

MultiSearch.final_path() A

↳ Parent: mandos.entry.multi_searches

Complexity

Conditions

Size

Total Lines	4
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	4
nop	1
dl	0
loc	4
rs	10
c	0
b	0
f	0

"""
Runner.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Mapping, MutableMapping, Optional, Sequence, Type, Union

import pandas as pd

import typer

from pocketutils.core.exceptions import PathExistsError, IllegalStateError

from typeddfs import TypedDfs

from typeddfs.abs_dfs import AbsDf

from typeddfs.checksums import Checksums

from typeddfs.utils import Utils


from mandos.entry._arg_utils import EntryUtils
from mandos.entry.abstract_entries import Entry
from mandos.entry.api_singletons import Apis
from mandos.entry.entry_commands import Entries
from mandos.model.hit_dfs import HitDf
from mandos.model.settings import SETTINGS
from mandos.model.utils.fancy_logger import LogSinkInfo
from mandos.model.utils.reflection_utils import InjectionError
from mandos.model.utils.setup import logger

# Typer application object for this module.
cli = typer.Typer()
# Executed at import time, before the two API attributes are read on the next line.
# NOTE(review): presumably this installs the default API singletons — confirm in Apis.
Apis.set_default()
Chembl, Pubchem = Apis.Chembl, Apis.Pubchem

# Maps each entry's command name (the value of ``cmd()``) to its Entry class;
# ``CmdRunner.build`` looks up the ``source`` of a search in this table.
EntriesByCmd: MutableMapping[str, Type[Entry]] = {e.cmd(): e for e in Entries}

# these are not permitted in individual searches
# NOTE(review): this set is not referenced anywhere else in this module, and
# ``_no_illegal_cols`` checks "to" and "path" rather than these — confirm which is intended.
forbidden_keys = {"to", "no_setup"}

# Table describing the searches, written by ``MultiSearch.write_docs`` (one row per search).
# Columns "key", "search", "source", "desc", and "args" are required strings;
# "category" is a reserved string column.
SearchExplainDf = (
    TypedDfs.typed("SearchExplainDf")
    .require("key", "search", "source", dtype=str)
    .require("desc", "args", dtype=str)
    .reserve("category", dtype=str)
    .strict()
    .secure()
).build()


def _no_duplicate_keys(self: AbsDf) -> Optional[str]:
    """
    Verification hook: checks that no value occurs more than once in the ``key`` column.

    Args:
        self: The data frame being verified; it must have a ``key`` column.

    Returns:
        An error message listing the duplicated keys (sorted), or ``None`` if every key is unique.
    """
    # Count on the column itself. Grouping a frame that contains only the grouping
    # column leaves nothing to count, so that approach never reports a duplicate.
    counts = self["key"].value_counts()
    bad = sorted(str(k) for k in counts[counts > 1].index)
    if bad:
        return f"Duplicate keys: {', '.join(bad)}"
    return None


def _no_illegal_cols(self: AbsDf) -> Optional[str]:
    """
    Verification hook: rejects a frame that has a ``to`` or ``path`` column.

    Returns:
        An error message naming the offending columns, or ``None`` if neither is present.
    """
    found = {name for name in ("to", "path") if name in self.columns}
    if not found:
        return None
    listing = ", ".join(found)
    return f"Illegal keys {listing}"


# Table listing the searches to run, one row per search. "key" and "source" are required
# string columns; the frame is verified with ``_no_duplicate_keys`` and ``_no_illegal_cols``.
# TOML reads and writes are passed ``aot="search"``.
# NOTE(review): presumably ``aot`` names the TOML array of tables holding the rows — confirm.
SearchConfigDf = (
    TypedDfs.typed("SearchConfigDf")
    .require("key", "source", dtype=str)
    .verify(_no_duplicate_keys)
    .verify(_no_illegal_cols)
    .add_read_kwargs("toml", aot="search")
    .add_write_kwargs("toml", aot="search")
    .secure()
    .build()
)


@dataclass(frozen=True, repr=True)
class MultiSearch:
    """
    Runs every search listed in ``config`` on one input file, then concatenates
    the per-search results into a single file at ``final_path``.

    Raises:
        PathExistsError: On construction, if the final output is already complete
            and ``replace`` is false, or if the final output file exists and
            ``proceed`` is false.
    """

    # one row per search; each row supplies at least ``key`` and ``source``
    config: SearchConfigDf
    # file passed as the input to every search
    input_path: Path
    # directory that receives the per-search outputs, the logs, and the final file
    out_dir: Path
    # appended to the final file name (see ``final_path``)
    suffix: str
    # rerun searches that were already completed
    replace: bool
    # allow continuing when output files already exist
    proceed: bool
    # only used to pick the suffix of the per-search log files; None selects the default
    log_path: Optional[Path]

    @property
    def final_path(self) -> Path:
        """Path of the concatenated results: ``out_dir / ("search_" + input file name + suffix)``."""
        return self.out_dir / f"search_{self.input_path.name}{self.suffix}"

    @property
    def final_checksum_path(self) -> Path:
        """Hash file for ``final_path``, as located by ``Checksums.get_hash_file``."""
        return Checksums.get_hash_file(self.final_path, algorithm=SETTINGS.search_checksum_alg)

    @property
    def is_complete(self) -> bool:
        """Whether the hash file for the final output exists."""
        return self.final_checksum_path.exists()

    @property
    def doc_path(self) -> Path:
        """Path of the table describing the searches: ``final_path`` minus its last suffix, plus ``_doc.tsv``."""
        return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")

    def __post_init__(self) -> None:
        if not self.replace and self.is_complete:
            raise PathExistsError(f"Path {self.final_path} is complete but --replace is not set")
        if not self.proceed and self.final_path.exists():
            raise PathExistsError(f"Path {self.final_path} exists but --proceed is not set")

    def run(self) -> None:
        """
        Validates every search, runs them, and writes the concatenated results.

        A ``CmdRunner`` is built per config row and ``test`` is called on all of them
        before any search is run, so that bad parameters are caught up front.
        If there are no searches, a warning is logged and nothing is written.

        Raises:
            Exception: Whatever a runner's ``test`` raises; it is logged and re-raised.
        """
        commands = self._build_commands()
        if not commands:
            logger.warning("No searches — nothing to do")
            return
        # write a file describing all of the searches
        self.write_docs(commands)
        # check the parameters of every search before running any of them
        for cmd in commands:
            try:
                cmd.test(replace=self.replace, proceed=self.proceed)
            except Exception:
                logger.error(f"Bad search {cmd}")
                raise
        logger.notice("Searches look ok")
        # start!
        for cmd in commands:
            cmd.run(replace=self.replace, proceed=self.proceed)
        logger.notice("Done with all searches!")
        # write the final file
        df = HitDf(pd.concat([HitDf.read_file(cmd.output_path) for cmd in commands]))
        df.write_file(self.final_path, file_hash=True)
        logger.notice(f"Concatenated results to {self.final_path}")

    def _build_commands(self) -> Sequence[CmdRunner]:
        """
        Builds one ``CmdRunner`` per config row, keyed by the search key.

        Each row's non-null values become the search parameters; the output path (``to``)
        and log path (``log``) are derived from ``out_dir`` and the key.
        """
        commands = {}
        for _, row in self.config.iterrows():
            data = {k: v for k, v in row.to_dict().items() if v is not None and not pd.isna(v)}
            key = data["key"]
            with logger.contextualize(key=key):
                default_to = self.out_dir / (key + SETTINGS.table_suffix)
                # TODO: produces bad logging about being overwritten
                data["to"] = EntryUtils.adjust_filename(None, default=default_to, replace=True)
                data["log"] = self._get_log_path(key)
                cmd = CmdRunner.build(data, self.input_path)
                commands[cmd.key] = cmd
        # Report the searches that already finished. They are only rerun when ``replace``
        # is set (see ``CmdRunner.run``), so only claim an overwrite in that case.
        already_run = {k for k, v in commands.items() if v.was_run}
        if already_run:
            names = Utils.join_to_str(already_run, last="and")
            if self.replace:
                logger.notice(f"Overwriting results for {names}")
            else:
                logger.notice(f"Skipping already-completed searches {names}")
        return list(commands.values())

    def write_docs(self, commands: Sequence[CmdRunner]) -> None:
        """
        Writes a ``SearchExplainDf`` with one row per search to ``doc_path``.

        Each row holds the key, search name, category, primary data source,
        description, and the parameters formatted as ``name=value`` pairs.
        """
        rows = []
        for cmd in commands:
            search_type = cmd.cmd.get_search_type()
            rows.append(
                pd.Series(
                    dict(
                        key=cmd.key,
                        search=search_type.search_name(),
                        category=cmd.category,
                        source=search_type.primary_data_source(),
                        desc=cmd.cmd.describe(),
                        args=", ".join(f"{k}={v}" for k, v in cmd.params.items()),
                    )
                )
            )
        df = SearchExplainDf(rows)
        df.write_file(self.doc_path, mkdirs=True)

    def _get_log_path(self, key: str) -> Path:
        """Returns ``out_dir / (key + suffix)``, taking the suffix from ``log_path`` when it is set."""
        if self.log_path is None:
            suffix = SETTINGS.log_suffix
        else:
            suffix = LogSinkInfo.guess(self.log_path).suffix
        return self.out_dir / (key + suffix)


@dataclass(frozen=True, repr=True)
class CmdRunner:
    """
    A single search to run: an ``Entry`` class plus the keyword parameters to call it with.
    """

    # the Entry class whose ``test`` and ``run`` are invoked
    cmd: Type[Entry]
    # keyword arguments for the entry; always includes "key", and "to" once an output path is set
    params: MutableMapping[str, Union[int, str, float]]
    # file passed as the first argument to the entry
    input_path: Path
    # optional label taken from the "category" value of the search's data
    category: Optional[str]

    @property
    def key(self) -> str:
        """The search key, i.e. ``params["key"]``."""
        return self.params["key"]

    @property
    def output_path(self) -> Path:
        """The output file of this search, i.e. ``params["to"]`` as a ``Path``."""
        return Path(self.params["to"])

    @property
    def done_path(self) -> Path:
        """Hash file for the output's directory, as located by ``Checksums.get_hash_dir``."""
        return Checksums.get_hash_dir(self.output_path, algorithm=SETTINGS.search_checksum_alg)

    @property
    def was_run(self) -> bool:
        """
        Whether the output path is listed in the hash file at ``done_path``.

        Raises:
            IllegalStateError: If the output is listed as done but the file does not exist.
        """
        if not self.done_path.exists():
            return False
        sums = Checksums.parse_hash_file_resolved(self.done_path)
        # NOTE(review): ``output_path`` is not resolved before this membership test;
        # confirm it matches the form of the paths returned by the parser.
        done = self.output_path in sums
        if done and not self.output_path.exists():
            raise IllegalStateError(f"{self.output_path} marked complete but does not exist")
        return done

    def test(self, *, replace: bool, proceed: bool) -> None:
        """
        Checks that this search is allowed to run and that its parameters are accepted.

        Raises:
            PathExistsError: If the output exists but is not marked as finished,
                and neither ``proceed`` nor ``replace`` is set.
        """
        unfinished = self.output_path.exists() and not self.was_run
        if unfinished and not proceed and not replace:
            raise PathExistsError(f"Path {self.output_path} exists but not finished")
        with logger.contextualize(key=self.key):
            self.cmd.test(self.input_path, **self.params)

    def run(self, *, replace: bool, proceed: bool) -> None:
        """
        Runs the search unless it already finished and ``replace`` is false.

        ``proceed`` is accepted so the signature matches ``test``; it is not used here
        because ``test`` has already checked that proceeding is allowed.
        """
        if replace or not self.was_run:
            with logger.contextualize(key=self.key):
                self.cmd.run(self.input_path, **self.params)

    @classmethod
    def build(cls, data: Mapping[str, Any], input_path: Path) -> CmdRunner:
        """
        Creates a runner from one search's data.

        Args:
            data: Values for one search; must contain "key" and "source", may contain
                "category"; every other item is passed to the entry as a parameter.
            input_path: The input file for the search.

        Raises:
            InjectionError: If ``data["source"]`` is not a known command name.
        """
        key, source = data["key"], data["source"]
        try:
            entry = EntriesByCmd[source]
        except KeyError:
            raise InjectionError(f"Search command {source} (key {key}) does not exist") from None
        # we need to explicitly add the defaults from the OptionInfo instances
        params = dict(entry.default_param_values().items())
        # do this after: the defaults had path, key, and to
        params["key"] = key
        # now add the params we got for this command's section
        params.update({k: v for k, v in data.items() if k not in ("source", "category")})
        return cls(entry, params, input_path, data.get("category"))


# Names exported by ``from ... import *``; ``CmdRunner`` is not listed.
__all__ = ["MultiSearch", "SearchExplainDf", "SearchConfigDf"]


1			"""
2			Runner.
3			"""
4
5			from __future__ import annotations
6
7			from dataclasses import dataclass
8			from pathlib import Path
9			from typing import Any, Mapping, MutableMapping, Optional, Sequence, Type, Union
10
11			import pandas as pd
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Unable to import 'pandas' Loading history...
12			import typer
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Unable to import 'typer' Loading history...
13			from pocketutils.core.exceptions import PathExistsError, IllegalStateError
			0 ignored issues – show introduced 2021-09-23 03:01 UTC by Report Bug Copy Issue Report Unable to import 'pocketutils.core.exceptions' Loading history...
14			from typeddfs import TypedDfs
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs' Loading history...
15			from typeddfs.abs_dfs import AbsDf
			0 ignored issues – show introduced 2021-09-29 03:10 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs.abs_dfs' Loading history...
16			from typeddfs.checksums import Checksums
			0 ignored issues – show introduced 2021-09-23 03:01 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs.checksums' Loading history...
17			from typeddfs.utils import Utils
			0 ignored issues – show introduced 2021-09-29 03:10 UTC by Report Bug Copy Issue Report Unable to import 'typeddfs.utils' Loading history...
18
19			from mandos.entry._arg_utils import EntryUtils
20			from mandos.entry.abstract_entries import Entry
21			from mandos.entry.api_singletons import Apis
22			from mandos.entry.entry_commands import Entries
23			from mandos.model.hit_dfs import HitDf
24			from mandos.model.settings import SETTINGS
25			from mandos.model.utils.fancy_logger import LogSinkInfo
26			from mandos.model.utils.reflection_utils import InjectionError
27			from mandos.model.utils.setup import logger
28
29			cli = typer.Typer()
30			Apis.set_default()
31			Chembl, Pubchem = Apis.Chembl, Apis.Pubchem
32
33			EntriesByCmd: MutableMapping[str, Type[Entry]] = {e.cmd(): e for e in Entries}
34
35			# these are not permitted in individual searches
36			forbidden_keys = {"to", "no_setup"}
37
38			SearchExplainDf = (
39			TypedDfs.typed("SearchExplainDf")
40			.require("key", "search", "source", dtype=str)
41			.require("desc", "args", dtype=str)
42			.reserve("category", dtype=str)
43			.strict()
44			.secure()
45			).build()
46
47
48			def _no_duplicate_keys(self: AbsDf) -> Optional[str]:
49			group = self[["key"]].groupby("key").count().to_dict()
50			bad = {k for k, v in group.items() if v > 1}
51			if len(bad) > 0:
52			return f"Duplicate keys: {', '.join(bad)}"
53			return None
54
55
56			def _no_illegal_cols(self: AbsDf) -> Optional[str]:
57			illegal = {c for c in ["to", "path"] if c in self.columns}
58			if len(illegal) > 0:
59			return f"Illegal keys {', '.join(illegal)}"
60			return None
61
62
63			SearchConfigDf = (
64			TypedDfs.typed("SearchConfigDf")
65			.require("key", "source", dtype=str)
66			.verify(_no_duplicate_keys)
67			.verify(_no_illegal_cols)
68			.add_read_kwargs("toml", aot="search")
69			.add_write_kwargs("toml", aot="search")
70			.secure()
71			.build()
72			)
73
74
75			@dataclass(frozen=True, repr=True)
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
76			class MultiSearch:
77			config: SearchConfigDf
78			input_path: Path
79			out_dir: Path
80			suffix: str
81			replace: bool
82			proceed: bool
83			log_path: Optional[Path]
84
85			@property
86			def final_path(self) -> Path:
			0 ignored issues – show introduced 2021-03-21 02:08 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
87			name = "search_" + self.input_path.name + self.suffix
88			return self.out_dir / name
89
90			@property
91			def final_checksum_path(self) -> Path:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
92			return Checksums.get_hash_file(self.final_path, algorithm=SETTINGS.search_checksum_alg)
93
94			@property
95			def is_complete(self):
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
96			return self.final_checksum_path.exists()
97
98			@property
99			def doc_path(self) -> Path:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
100			return Path(str(self.final_path.with_suffix("")) + "_doc.tsv")
101
102			def __post_init__(self):
103			if not self.replace and self.is_complete:
104			raise PathExistsError(f"Path {self.final_path} is complete but --replace is not set")
105			if not self.proceed and self.final_path.exists():
106			raise PathExistsError(f"Path {self.final_path} exists but --proceed is not set")
107
108			def run(self) -> None:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
109			# build up the list of Entry classes first, and run ``test`` on each one
110			# that's to check that the parameters are correct before running anything
111			commands = self._build_commands()
112			if len(commands) == 0:
113			logger.warning(f"No searches — nothing to do")
			0 ignored issues – show introduced 2021-09-23 03:01 UTC by Report Bug Copy Issue Report Using an f-string that does not have any interpolated variables Loading history...
114			return
115			# write a file describing all of the searches
116			self.write_docs(commands)
117			# build and test
118			for cmd in commands:
119			try:
120			cmd.test(replace=self.replace, proceed=self.proceed)
121			except Exception:
122			logger.error(f"Bad search {cmd}")
123			raise
124			logger.notice("Searches look ok")
125			# start!
126			for cmd in commands:
127			cmd.run(replace=self.replace, proceed=self.proceed)
128			logger.notice("Done with all searches!")
129			# write the final file
130			df = HitDf(pd.concat([HitDf.read_file(cmd.output_path) for cmd in commands]))
			0 ignored issues – show Coding Style Naming introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
131			df.write_file(self.final_path, file_hash=True)
132			logger.notice(f"Concatenated results to {self.final_path}")
133
134			def _build_commands(self) -> Sequence[CmdRunner]:
135			commands = {}
136			for i in range(len(self.config)):
137			data = {
138			k: v
139			for k, v in self.config.iloc[i].to_dict().items()
140			if v is not None and not pd.isna(v)
141			}
142			key = data["key"]
143			with logger.contextualize(key=key):
144			default_to = self.out_dir / (key + SETTINGS.table_suffix)
145			# TODO: produces bad logging about being overwritten
			0 ignored issues – show Coding Style introduced 2021-09-29 16:53 UTC by Report Bug Copy Issue Report `TODO` and `FIXME` comments should generally be avoided. Loading history...
146			data["to"] = EntryUtils.adjust_filename(None, default=default_to, replace=True)
147			data["log"] = self._get_log_path(key)
148			cmd = CmdRunner.build(data, self.input_path)
149			commands[cmd.key] = cmd
150			# log about replacing
151			replacing = {k for k, v in commands.items() if v.was_run}
152			if len(replacing) > 0:
153			replacing = Utils.join_to_str(replacing, last="and")
154			logger.notice(f"Overwriting results for {replacing}")
155			return list(commands.values())
156
157			def write_docs(self, commands: Sequence[CmdRunner]) -> None:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
158			rows = []
159			for cmd in commands:
160			name = cmd.cmd.get_search_type().search_name()
161			cat = cmd.category
162			src = cmd.cmd.get_search_type().primary_data_source()
163			desc = cmd.cmd.describe()
164			args = ", ".join([f"{k}={v}" for k, v in cmd.params.items()])
165			ser = dict(key=cmd.key, search=name, category=cat, source=src, desc=desc, args=args)
166			rows.append(pd.Series(ser))
167			df = SearchExplainDf(rows)
			0 ignored issues – show Coding Style Naming introduced 2021-09-29 03:10 UTC by Report Bug Copy Issue Report Variable name "df" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}\|_[^\\WA-Z]*\|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
168			df.write_file(self.doc_path, mkdirs=True)
169
170			def _get_log_path(self, key: str):
171			if self.log_path is None:
172			suffix = SETTINGS.log_suffix
173			else:
174			suffix = LogSinkInfo.guess(self.log_path).suffix
175			return self.out_dir / (key + suffix)
176
177
178			@dataclass(frozen=True, repr=True)
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing class docstring Loading history...
179			class CmdRunner:
180			cmd: Type[Entry]
181			params: MutableMapping[str, Union[int, str, float]]
182			input_path: Path
183			category: Optional[str]
184
185			@property
186			def key(self) -> str:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
187			return self.params["key"]
188
189			@property
190			def output_path(self) -> Path:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
191			return Path(self.params["to"])
192
193			@property
194			def done_path(self) -> Path:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
195			return Checksums.get_hash_dir(self.output_path, algorithm=SETTINGS.search_checksum_alg)
196
197			@property
198			def was_run(self) -> bool:
			0 ignored issues – show introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
199			if not self.done_path.exists():
200			return False
201			sums = Checksums.parse_hash_file_resolved(self.done_path)
202			done = self.output_path in sums
203			if done and not self.output_path.exists():
204			raise IllegalStateError(f"{self.output_path} marked complete but does not exist")
205			return done
206
207			def test(self, *, replace: bool, proceed: bool) -> None:
			0 ignored issues – show introduced 2021-07-05 19:05 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
208			if self.output_path.exists() and not self.was_run and not proceed and not replace:
209			raise PathExistsError(f"Path {self.output_path} exists but not finished")
210			with logger.contextualize(key=self.key):
211			self.cmd.test(self.input_path, **self.params)
212
213			def run(self, *, replace: bool, proceed: bool) -> None:
			0 ignored issues – show Unused Code introduced 2021-09-29 16:53 UTC by Report Bug Copy Issue Report The argument `proceed` seems to be unused. Loading history... introduced 2021-10-02 18:29 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
214			# we already checked that we're allowed to proceed
215			if replace or not self.was_run:
216			with logger.contextualize(key=self.key):
217			self.cmd.run(self.input_path, **self.params)
218
219			@classmethod
220			def build(cls, data: Mapping[str, Any], input_path: Path):
			0 ignored issues – show introduced 2021-10-02 18:29 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
221			key, cmd = data["key"], data["source"]
222			try:
223			cmd = EntriesByCmd[cmd]
224			except KeyError:
225			raise InjectionError(f"Search command {cmd} (key {key}) does not exist") from None
226			params = {}
227			# we need to explicitly add the defaults from the OptionInfo instances
228			params.update(cmd.default_param_values().items())
229			# do this after: the defaults had path, key, and to
230			params["key"] = key
231			# now add the params we got for this command's section
232			params.update({k: v for k, v in data.items() if k != "source" and k != "category"})
			0 ignored issues – show Unused Code introduced 2021-07-05 18:49 UTC by Report Bug Copy Issue Report Consider merging these comparisons with "in" to "k not in ('source', 'category')" Loading history...
233			category = data.get("category")
234			runner = CmdRunner(cmd, params, input_path, category)
235			return runner
236
237
238			__all__ = ["MultiSearch", "SearchExplainDf", "SearchConfigDf"]
239

dmyersturnbull / mandos

Push — main ( 5ce644...da77b5 )

MultiSearch.final_path() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like