|
1
|
|
|
from __future__ import annotations |
|
|
|
|
|
|
2
|
|
|
|
|
3
|
|
|
import dataclasses |
|
4
|
|
|
import os |
|
5
|
|
|
from dataclasses import dataclass |
|
6
|
|
|
from pathlib import Path |
|
7
|
|
|
from typing import AbstractSet, Any, Collection, Mapping, Optional, Type, TypeVar, Union |
|
8
|
|
|
|
|
9
|
|
|
from pocketutils.core.dot_dict import NestedDotDict |
|
|
|
|
|
|
10
|
|
|
from pocketutils.core.exceptions import ConfigError, DirDoesNotExistError, XValueError |
|
|
|
|
|
|
11
|
|
|
from pocketutils.core.query_utils import QueryExecutor |
|
|
|
|
|
|
12
|
|
|
from pocketutils.tools.common_tools import CommonTools |
|
|
|
|
|
|
13
|
|
|
from pocketutils.tools.sys_tools import SystemTools |
|
|
|
|
|
|
14
|
|
|
from suretime import Suretime |
|
|
|
|
|
|
15
|
|
|
from typeddfs import FileFormat, FrozeDict |
|
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
from mandos.model.utils.globals import Globals |
|
18
|
|
|
from mandos.model.utils.setup import LOG_SETUP, MandosResources, logger |
|
19
|
|
|
|
|
20
|
|
|
# Packaged fallback values, read from the bundled ``default_settings.json`` resource
# and wrapped in a ``FrozeDict``. ``Settings.load`` uses these whenever the user's
# configuration does not supply a key.
defaults: Mapping[str, Any] = FrozeDict(
    MandosResources.json_dict("default_settings.json"),
)

# Multiplier applied to each configured ``delay_sec`` to derive the matching
# ``*_query_delay_max`` value in ``Settings.load``.
max_coeff = 1.1

# Generic type variable used to annotate the typed lookup helper inside ``Settings.load``.
T = TypeVar("T")
|
|
|
|
|
|
23
|
|
|
|
|
24
|
|
|
|
|
25
|
|
|
@dataclass(frozen=True, repr=True)
class Settings:
    """
    Immutable collection of application settings.

    Instances are normally created through :meth:`load`, :meth:`from_file`, or
    :meth:`empty`, which fall back to the module-level ``defaults`` mapping for
    keys that the supplied data does not contain. ``__post_init__`` validates the
    configured suffixes and rejects negative numeric values.
    """

    is_testing: bool
    # NOTE(review): annotated as ``str`` but ``load`` passes the value through
    # ``Suretime.Types.NtpContinents.of`` -- confirm the resulting type.
    ntp_continent: str
    table_suffix: str
    log_suffix: str
    cache_path: Path
    cache_gzip: bool
    save_every: int
    sanitize_paths: bool
    chembl_expire_sec: int
    chembl_n_tries: int
    chembl_timeout_sec: int
    chembl_backoff_factor: float
    chembl_query_delay_min: float
    chembl_query_delay_max: float
    chembl_fast_save: bool
    pubchem_expire_sec: int
    pubchem_n_tries: int
    # NOTE(review): annotated as ``float`` but ``load`` requests an ``int``.
    pubchem_timeout_sec: float
    pubchem_backoff_factor: float
    pubchem_query_delay_min: float
    pubchem_query_delay_max: float
    hmdb_expire_sec: int
    # NOTE(review): annotated as ``float`` but ``load`` requests an ``int``.
    hmdb_timeout_sec: float
    hmdb_backoff_factor: float
    hmdb_query_delay_min: float
    hmdb_query_delay_max: float
    taxon_expire_sec: int
    archive_filename_suffix: str
    selenium_driver: str
    selenium_driver_path: Optional[Path]
    log_signals: bool
    log_exit: bool

    @property
    def as_dict(self) -> Mapping[str, Any]:
        """Return the dataclass fields as a dict (via ``dataclasses.asdict``)."""
        return dataclasses.asdict(self)

    @property
    def all_cache_paths(self) -> AbstractSet[Path]:
        """Return the chembl, pubchem, g2p, hmdb, and taxonomy cache directories."""
        return {
            self.chembl_cache_path,
            self.pubchem_cache_path,
            self.g2p_cache_path,
            self.hmdb_cache_path,
            self.taxonomy_cache_path,
        }

    @property
    def driver_path(self) -> Path:
        """Return ``cache_path / "driver"``."""
        return self.cache_path / "driver"

    @property
    def chembl_cache_path(self) -> Path:
        """Return ``cache_path / "chembl"``."""
        return self.cache_path / "chembl"

    @property
    def chembl_scrape_path(self) -> Path:
        """Return the ``scrape`` subdirectory of :attr:`chembl_cache_path`."""
        return self.chembl_cache_path / "scrape"

    @property
    def pubchem_cache_path(self) -> Path:
        """Return ``cache_path / "pubchem"``."""
        return self.cache_path / "pubchem"

    @property
    def g2p_cache_path(self) -> Path:
        """Return ``cache_path / "g2p"``."""
        return self.cache_path / "g2p"

    @property
    def hmdb_cache_path(self) -> Path:
        """Return ``cache_path / "hmdb"``."""
        return self.cache_path / "hmdb"

    @property
    def taxonomy_cache_path(self) -> Path:
        """Return ``cache_path / "taxonomy"``."""
        return self.cache_path / "taxonomy"

    @classmethod
    def from_file(cls, path: Path) -> Settings:
        """
        Build settings from a TOML file.

        Args:
            path: File passed to ``NestedDotDict.read_toml``.

        Returns:
            The result of :meth:`load` on the parsed data.
        """
        return cls.load(NestedDotDict.read_toml(path))

    @classmethod
    def empty(cls) -> Settings:
        """Build settings from an empty ``NestedDotDict``, so every key uses its default."""
        return cls.load(NestedDotDict({}))

    def __post_init__(self):
        """
        Validate the freshly constructed instance.

        Raises:
            XValueError: If any ``int`` or ``float`` field is negative.
        """
        # These calls are made only for their checks; the return values are discarded.
        # Presumably each raises if the suffix is not recognized -- confirm in the callee.
        FileFormat.from_suffix(self.table_suffix)
        FileFormat.from_suffix(self.archive_filename_suffix)
        LOG_SETUP.guess_file_sink_info(self.log_suffix)
        for k, v in self.as_dict.items():
            # A blanket check works only because no current setting may be negative.
            if isinstance(v, (int, float)) and v < 0:
                raise XValueError(f"{k} = {v} < 0")

    @classmethod
    def load(cls, data: NestedDotDict) -> Settings:
        """
        Build settings from nested configuration data.

        Each value is requested from ``data`` with ``get_as``, passing the entry of the
        module-level ``defaults`` mapping as the fallback. Every ``*_query_delay_max`` is
        the corresponding ``delay_sec`` multiplied by ``max_coeff``.

        Args:
            data: Configuration data addressed by dotted keys.

        Returns:
            A new, validated ``Settings`` instance.

        Raises:
            ConfigError: If ``get_as`` raises ``TypeError`` for a key.
            AssertionError: If ``defaults`` contains keys that were never requested.
        """
        # Keys are removed as they are requested; leftovers are defaults nobody reads.
        extra_default_keys = dict(defaults)

        def get(s: str, t: Type[T]) -> T:
            # A key may be requested more than once, so only delete it if still present.
            if s in extra_default_keys:
                del extra_default_keys[s]
            try:
                return data.get_as(s, t, defaults[s])
            except TypeError as e:
                raise ConfigError(
                    f"Key {s}={data.get(s), defaults[s]} is not of type {t}"
                ) from e

        # A callable rather than a class is passed as the target "type" here.
        _continent = Suretime.Types.NtpContinents.of
        _selenium_path = get("query.selenium_driver_path", Path)
        if _selenium_path is not None:
            _selenium_path = _selenium_path.expanduser()
        chembl_delay = get("query.chembl.delay_sec", float)
        pubchem_delay = get("query.pubchem.delay_sec", float)
        hmdb_delay = get("query.hmdb.delay_sec", float)
        # Bound to a new name: ``get`` closes over ``data``, so rebinding ``data`` here
        # would silently break any lookup performed afterwards.
        settings = cls(
            is_testing=get("is_testing", bool),
            ntp_continent=get("search.ntp_continent_code", _continent),
            table_suffix=get("search.default_table_suffix", str),
            log_suffix=get("search.default_log_suffix", str),
            save_every=get("search.save_every", int),
            sanitize_paths=get("search.sanitize_paths", bool),
            cache_path=Path(get("cache.path", str)).expanduser(),
            chembl_expire_sec=get("cache.chembl.expire_sec", int),
            pubchem_expire_sec=get("cache.pubchem.expire_sec", int),
            taxon_expire_sec=get("cache.taxa.expire_sec", int),
            cache_gzip=get("cache.gzip", bool),
            archive_filename_suffix=get("cache.archive_filename_suffix", str),
            chembl_n_tries=get("query.chembl.n_tries", int),
            chembl_fast_save=get("query.chembl.fast_save", bool),
            chembl_timeout_sec=get("query.chembl.timeout_sec", int),
            chembl_backoff_factor=get("query.chembl.backoff_factor", float),
            chembl_query_delay_min=chembl_delay,
            chembl_query_delay_max=chembl_delay * max_coeff,
            pubchem_timeout_sec=get("query.pubchem.timeout_sec", int),
            hmdb_expire_sec=get("cache.hmdb.expire_sec", int),
            pubchem_backoff_factor=get("query.pubchem.backoff_factor", float),
            pubchem_query_delay_min=pubchem_delay,
            pubchem_query_delay_max=pubchem_delay * max_coeff,
            pubchem_n_tries=get("query.pubchem.n_tries", int),
            hmdb_timeout_sec=get("query.hmdb.timeout_sec", int),
            hmdb_backoff_factor=get("query.hmdb.backoff_factor", float),
            hmdb_query_delay_min=hmdb_delay,
            hmdb_query_delay_max=hmdb_delay * max_coeff,
            selenium_driver=get("query.selenium_driver", str).title(),
            selenium_driver_path=_selenium_path,
            log_signals=get("cli.log_signals", bool),
            log_exit=get("cli.log_exit", bool),
        )
        # All required fields were supplied; now make sure the defaults file does not
        # carry keys that no field consumes.
        if extra_default_keys:
            raise AssertionError(
                f"There are {len(extra_default_keys)} extra defaults"
                + f" in {defaults}: {extra_default_keys}"
            )
        return settings

    @classmethod
    def defaults(cls) -> Mapping[str, Any]:
        """Return the module-level ``defaults`` mapping."""
        # Inside a method body the bare name resolves to the module global,
        # not to this classmethod.
        return defaults

    def configure(self):
        """Enable exit and/or signal tracing to ``logger.trace`` according to the flags."""
        if self.log_exit:
            SystemTools.trace_exit(CommonTools.make_writer(logger.trace))
        if self.log_signals:
            SystemTools.trace_signals(CommonTools.make_writer(logger.trace))

    def configure_chembl(self):
        """
        Copy the ChEMBL-related settings onto the ``chembl_webresource_client`` settings.

        Does nothing beyond the import when ``Globals.disable_chembl`` is truthy.
        """
        # Imported here rather than at module level, as in the original code.
        from chembl_webresource_client.settings import Settings as ChemblSettings

        if not Globals.disable_chembl:
            instance = ChemblSettings.Instance()
            instance.CACHING = True
            instance.CACHE_NAME = str(self.chembl_cache_path.resolve() / "chembl.sqlite")
            logger.debug(f"ChEMBL cache is at {instance.CACHE_NAME}")
            instance.TOTAL_RETRIES = self.chembl_n_tries
            instance.FAST_SAVE = self.chembl_fast_save
            instance.TIMEOUT = self.chembl_timeout_sec
            instance.BACKOFF_FACTOR = self.chembl_backoff_factor
            instance.CACHE_EXPIRE = self.chembl_expire_sec

    @classmethod
    def set_path_for_selenium(cls) -> None:
        """
        Append the driver, resource, and install directories to ``PATH``.

        Reads the module-level ``SETTINGS`` object, so it can only be called after
        this module has finished importing.
        """
        cls.add_to_path([SETTINGS.driver_path, MandosResources.dir(), Globals.where_am_i_installed])

    @classmethod
    def add_to_path(cls, paths: Collection[Union[None, str, Path]]) -> None:
        """
        Append directories to the ``PATH`` environment variable.

        ``None`` entries are ignored and duplicates are added once, in first-seen order.
        Paths that do not exist are still appended. ``PATH`` is left untouched when
        nothing remains to add, and is created if it was not set.

        Args:
            paths: Directories to append; entries may be ``None``.

        Raises:
            DirDoesNotExistError: If a path exists but is neither a directory nor a mount.
        """
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        unique = list(dict.fromkeys(Path(p) for p in paths if p is not None))
        for path in unique:
            if path.exists() and not path.is_dir() and not path.is_mount():
                raise DirDoesNotExistError(f"Path {path} is not a directory or mount")
        if not unique:
            return
        addition = os.pathsep.join(str(p) for p in unique)
        # Tolerate an unset PATH instead of raising KeyError.
        current = os.environ.get("PATH", "")
        os.environ["PATH"] = current + os.pathsep + addition if current else addition
        logger.debug(f"Added to PATH: {addition}")
|
223
|
|
|
|
|
224
|
|
|
|
|
225
|
|
|
# Create the module-level settings object at import time: use the packaged defaults
# when no settings file exists, otherwise read the file. Either way, apply the
# exit/signal tracing options afterwards.
if not Globals.settings_path.exists():
    SETTINGS = Settings.empty()
    logger.success(f"Using defaults (no file at {Globals.settings_path})")
else:
    SETTINGS = Settings.from_file(Globals.settings_path)
    logger.success(f"Read settings at {Globals.settings_path}")
SETTINGS.configure()
|
232
|
|
|
|
|
233
|
|
|
|
|
234
|
|
|
class QueryExecutors:
    """
    Namespace holding one ``QueryExecutor`` each for chembl, pubchem, and hmdb.

    Each executor is created when this class body runs, from the matching
    minimum and maximum query delays in ``SETTINGS``.
    """

    chembl = QueryExecutor(
        SETTINGS.chembl_query_delay_min,
        SETTINGS.chembl_query_delay_max,
    )
    pubchem = QueryExecutor(
        SETTINGS.pubchem_query_delay_min,
        SETTINGS.pubchem_query_delay_max,
    )
    hmdb = QueryExecutor(
        SETTINGS.hmdb_query_delay_min,
        SETTINGS.hmdb_query_delay_max,
    )
|
238
|
|
|
|
|
239
|
|
|
|
|
240
|
|
|
# Constant-style alias for the executor namespace; note that this is the class
# itself, not an instance of it.
QUERY_EXECUTORS = QueryExecutors

# Names exported by ``from <this module> import *``.
__all__ = [
    "SETTINGS",
    "QUERY_EXECUTORS",
]
|
244
|
|
|
|