Passed
Push — main ( cee75c...37036d )
by Douglas
02:08
created

mandos.entry.searchers   A

Complexity

Total Complexity 7

Size/Duplication

Total Lines 89
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 51
dl 0
loc 89
rs 10
c 0
b 0
f 0
wmc 7

3 Methods

Rating   Name   Duplication   Size   Complexity  
A Searcher.__init__() 0 16 1
A Searcher.search() 0 12 3
A Searcher._search_one() 0 16 2

1 Function

Rating   Name   Duplication   Size   Complexity  
A _fix_cols() 0 2 1
1
"""
2
Run searches and write files.
3
"""
4
5
from __future__ import annotations
6
7
from pathlib import Path
8
from typing import Optional, Sequence
9
10
from pocketutils.core.dot_dict import NestedDotDict
0 ignored issues
show
introduced by
Unable to import 'pocketutils.core.dot_dict'
Loading history...
11
from pocketutils.tools.common_tools import CommonTools
0 ignored issues
show
introduced by
Unable to import 'pocketutils.tools.common_tools'
Loading history...
12
from typeddfs import TypedDfs
0 ignored issues
show
introduced by
Unable to import 'typeddfs'
Loading history...
13
14
from mandos import logger
15
from mandos.entry.paths import EntryPaths
16
from mandos.model.searches import Search
17
18
19
def _fix_cols(df):
    """
    Lowercases the column labels of a data frame.

    Only labels that are ``str`` instances are renamed.
    Any other label (for example an integer label) is kept as-is
    rather than failing on a missing ``.lower()`` method.

    Args:
        df: A frame exposing ``columns`` and ``rename(columns=...)``
            (presumably a pandas/typeddfs DataFrame -- confirm against callers)

    Returns:
        Whatever ``df.rename`` returns for the lowercase mapping
    """
    return df.rename(columns={c: c.lower() for c in df.columns if isinstance(c, str)})
21
22
23
# Typed data frame class for the user-supplied compound table.
# The builder calls are kept in their original order:
#   - "inchikey" is the one required column
#   - "inchi", "smiles", and "compound_id" are reserved with dtype str
#   - _fix_cols is registered as a post-processing step
InputFrame = TypedDfs.typed("InputFrame").require("inchikey").reserve(
    "inchi", "smiles", "compound_id", dtype=str
).post(_fix_cols).strict(cols=False).secure().build()
31
32
33
class Searcher:
    """
    Executes one or more searches, writing a result table and a metadata file for each.
    Create and use once: calling ``search`` a second time raises ``ValueError``.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run; ``search`` runs them in this order
            to: Output path arguments, paired with ``searches`` by ``CommonTools.zip_list``;
                each is passed to ``EntryPaths.output_path_of`` along with its search
                and ``input_path``
            input_path: Path to the input file of one of the formats:
                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, .csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # stays None until search() has read the input; doubles as the "already ran" flag
        self.input_df: Optional[InputFrame] = None
        # maps each search's key to the path its results are written to
        self.output_paths = {
            what.key: EntryPaths.output_path_of(what, input_path, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        Reads the input file, then runs every search on the unique ``inchikey`` values.

        Returns:
            This instance

        Raises:
            ValueError: If a search was already run on this instance
        """
        if self.input_df is not None:
            raise ValueError("Already ran a search")
        self.input_df = InputFrame.read_file(self.input_path)
        logger.info(f"Read {len(self.input_df)} input compounds")
        inchikeys = self.input_df["inchikey"].unique()
        for what in self.what:
            self._search_one(what, inchikeys)
        return self

    def _search_one(self, what: Search, inchikeys: Sequence[str]) -> None:
        """
        Runs a single search and writes its results plus a metadata file.

        Args:
            what: The search to run
            inchikeys: The InChI Keys to pass to ``what.find_to_df``
        """
        output_path = self.output_paths[what.key]
        metadata_path = output_path.with_suffix(".json.metadata")
        df = what.find_to_df(inchikeys)
        # keep all of the original extra columns from the input
        # e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
        # (index the input once here instead of once per extra column)
        input_by_inchikey = self.input_df.set_index("inchikey")
        for extra_col in [c for c in self.input_df.columns if c != "inchikey"]:
            extra_mp = input_by_inchikey[extra_col].to_dict()
            df[extra_col] = df["lookup"].map(extra_mp.get)
        # write the (intermediate) file
        df.write_file(output_path)
        # write metadata; 'key' and 'api' are excluded from the stringified params
        params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
        metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
        metadata.write_json(metadata_path)
        logger.info(f"Wrote {what.key} to {output_path}")
86
87
88
# Names exported by `from <this module> import *`; _fix_cols is intentionally not listed.
__all__ = ["Searcher", "InputFrame"]
89