1
|
|
|
""" |
2
|
|
|
Run searches and write files. |
3
|
|
|
""" |
4
|
|
|
|
5
|
|
|
from __future__ import annotations |
6
|
|
|
|
7
|
|
|
from pathlib import Path |
8
|
|
|
from typing import Optional, Sequence |
9
|
|
|
|
10
|
|
|
from pocketutils.core.dot_dict import NestedDotDict |
|
|
|
|
11
|
|
|
from pocketutils.tools.common_tools import CommonTools |
|
|
|
|
12
|
|
|
from typeddfs import TypedDfs |
|
|
|
|
13
|
|
|
|
14
|
|
|
from mandos import logger |
15
|
|
|
from mandos.entry.paths import EntryPaths |
16
|
|
|
from mandos.model.searches import Search |
17
|
|
|
|
18
|
|
|
|
19
|
|
|
def _fix_cols(df): |
|
|
|
|
20
|
|
|
return df.rename(columns={s: s.lower() for s in df.columns}) |
21
|
|
|
|
22
|
|
|
|
23
|
|
|
# Typed DataFrame describing the user-supplied compound list.
InputFrame = (
    TypedDfs.typed("InputFrame")
    # every input file must provide an inchikey column
    .require("inchikey")
    # optional recognized columns, read as str when present
    .reserve("inchi", "smiles", "compound_id", dtype=str)
    # lowercase all column names after reading (see _fix_cols above)
    .post(_fix_cols)
    # allow arbitrary extra columns beyond the required/reserved ones
    .strict(cols=False)
    # NOTE(review): presumably hardens file I/O (e.g. hashing/validation) —
    # confirm against the typeddfs builder documentation
    .secure()
).build()
31
|
|
|
|
32
|
|
|
|
33
|
|
|
class Searcher:
    """
    Executes one or more searches and saves the results to CSV files.
    Create and use once.
    """

    def __init__(self, searches: Sequence[Search], to: Sequence[Path], input_path: Path):
        """
        Constructor.

        Args:
            searches: The searches to run
            to: Output paths, one per search (paired element-wise with ``searches``)
            input_path: Path to the input file of one of the formats:

                - .txt containing one InChI Key per line
                - .csv, .tsv, .tab, csv.gz, .tsv.gz, .tab.gz, or .feather
                  containing a column called inchikey
        """
        self.what = searches
        self.input_path: Optional[Path] = input_path
        # Read lazily in search(); also doubles as the "already ran" flag.
        # (Fixed annotation: the initial value is None, so this is Optional.)
        self.input_df: Optional[InputFrame] = None
        self.output_paths = {
            what.key: EntryPaths.output_path_of(what, input_path, path)
            for what, path in CommonTools.zip_list(searches, to)
        }

    def search(self) -> Searcher:
        """
        Performs the search, and writes data.

        Returns:
            This instance, for chaining.

        Raises:
            ValueError: If this Searcher already ran (it is single-use).
        """
        if self.input_df is not None:
            # plain string: the original used an f-string with no placeholders
            raise ValueError("Already ran a search")
        self.input_df = InputFrame.read_file(self.input_path)
        logger.info(f"Read {len(self.input_df)} input compounds")
        inchikeys = self.input_df["inchikey"].unique()
        for what in self.what:
            self._search_one(what, inchikeys)
        return self

    def _search_one(self, what: Search, inchikeys: Sequence[str]) -> None:
        """
        Runs one search over ``inchikeys``, writing its result file and a
        sidecar ``.json.metadata`` file describing the search parameters.
        """
        output_path = self.output_paths[what.key]
        metadata_path = output_path.with_suffix(".json.metadata")
        df = what.find_to_df(inchikeys)
        # keep all of the original extra columns from the input
        # e.g. if the user had 'inchi' or 'smiles' or 'pretty_name'
        # (set_index is loop-invariant: do it once, not once per column)
        by_inchikey = self.input_df.set_index("inchikey")
        for extra_col in [c for c in self.input_df.columns if c != "inchikey"]:
            extra_mp = by_inchikey[extra_col].to_dict()
            df[extra_col] = df["lookup"].map(extra_mp.get)
        # write the (intermediate) file
        df.write_file(output_path)
        # write metadata; "key" and "api" are recorded separately / not serializable
        params = {k: str(v) for k, v in what.get_params().items() if k not in {"key", "api"}}
        metadata = NestedDotDict(dict(key=what.key, search=what.search_class, params=params))
        metadata.write_json(metadata_path)
        logger.info(f"Wrote {what.key} to {output_path}")
86
|
|
|
|
87
|
|
|
|
88
|
|
|
# Public API of this module.
__all__ = ["Searcher", "InputFrame"]
89
|
|
|
|