1
|
|
|
from __future__ import annotations |
|
|
|
|
2
|
|
|
|
3
|
|
|
import dataclasses |
4
|
|
|
import os |
5
|
|
|
from dataclasses import dataclass |
6
|
|
|
from pathlib import Path |
7
|
|
|
from typing import AbstractSet, Any, Collection, Mapping, Optional, Type, TypeVar, Union |
8
|
|
|
|
9
|
|
|
from pocketutils.core.dot_dict import NestedDotDict |
|
|
|
|
10
|
|
|
from pocketutils.core.exceptions import ConfigError, DirDoesNotExistError, XValueError |
|
|
|
|
11
|
|
|
from pocketutils.core.query_utils import QueryExecutor |
|
|
|
|
12
|
|
|
from pocketutils.tools.common_tools import CommonTools |
|
|
|
|
13
|
|
|
from pocketutils.tools.sys_tools import SystemTools |
|
|
|
|
14
|
|
|
from suretime import Suretime |
|
|
|
|
15
|
|
|
from typeddfs import FileFormat, FrozeDict |
|
|
|
|
16
|
|
|
|
17
|
|
|
from mandos.model.utils.globals import Globals |
18
|
|
|
from mandos.model.utils.setup import LOG_SETUP, MandosResources, logger |
19
|
|
|
|
20
|
|
|
# Packaged default settings, read once at import time from "default_settings.json"
# and wrapped in a FrozeDict. Settings.load passes these values to get_as as the
# default for each key it reads.
defaults: Mapping[str, Any] = FrozeDict(MandosResources.json_dict("default_settings.json"))
# Multiplier applied to each configured query delay to obtain the matching
# "*_query_delay_max" value (the configured delay itself is the minimum).
max_coeff = 1.1
# Generic type variable used by the typed getter inside Settings.load.
T = TypeVar("T")
|
|
|
|
23
|
|
|
|
24
|
|
|
|
25
|
|
|
@dataclass(frozen=True, repr=True)
class Settings:
    """
    Immutable bundle of application settings.

    Instances are normally built with :meth:`from_file`, :meth:`empty`, or
    :meth:`load`. Each value is looked up in the supplied data with
    ``get_as(key, type, default)``, where the default comes from the
    module-level ``defaults`` mapping. ``__post_init__`` validates the
    suffixes and rejects negative numeric values.
    """

    is_testing: bool
    # loaded through Suretime.Types.NtpContinents.of ("search.ntp_continent_code")
    ntp_continent: str
    table_suffix: str
    log_suffix: str
    # root directory under which the per-source cache directories live
    cache_path: Path
    cache_gzip: bool
    save_every: int
    sanitize_paths: bool
    chembl_expire_sec: int
    chembl_n_tries: int
    chembl_timeout_sec: int
    chembl_backoff_factor: float
    # min is the configured "delay_sec"; max is that value times max_coeff
    chembl_query_delay_min: float
    chembl_query_delay_max: float
    chembl_fast_save: bool
    pubchem_expire_sec: int
    pubchem_n_tries: int
    pubchem_timeout_sec: float
    pubchem_backoff_factor: float
    pubchem_query_delay_min: float
    pubchem_query_delay_max: float
    hmdb_expire_sec: int
    hmdb_timeout_sec: float
    hmdb_backoff_factor: float
    hmdb_query_delay_min: float
    hmdb_query_delay_max: float
    taxon_expire_sec: int
    archive_filename_suffix: str
    # stored title-cased (see load)
    selenium_driver: str
    # user-expanded path, or None when not configured
    selenium_driver_path: Optional[Path]
    log_signals: bool
    log_exit: bool

    @property
    def as_dict(self) -> Mapping[str, Any]:
        """Return all fields as a dict, as produced by ``dataclasses.asdict``."""
        return dataclasses.asdict(self)

    @property
    def all_cache_paths(self) -> AbstractSet[Path]:
        """Return the set of per-source cache directories under ``cache_path``."""
        return {
            self.chembl_cache_path,
            self.pubchem_cache_path,
            self.g2p_cache_path,
            self.hmdb_cache_path,
            self.taxonomy_cache_path,
        }

    @property
    def driver_path(self) -> Path:
        """Return ``cache_path / "driver"``."""
        return self.cache_path / "driver"

    @property
    def chembl_cache_path(self) -> Path:
        """Return ``cache_path / "chembl"``."""
        return self.cache_path / "chembl"

    @property
    def chembl_scrape_path(self) -> Path:
        """Return ``chembl_cache_path / "scrape"``."""
        return self.chembl_cache_path / "scrape"

    @property
    def pubchem_cache_path(self) -> Path:
        """Return ``cache_path / "pubchem"``."""
        return self.cache_path / "pubchem"

    @property
    def g2p_cache_path(self) -> Path:
        """Return ``cache_path / "g2p"``."""
        return self.cache_path / "g2p"

    @property
    def hmdb_cache_path(self) -> Path:
        """Return ``cache_path / "hmdb"``."""
        return self.cache_path / "hmdb"

    @property
    def taxonomy_cache_path(self) -> Path:
        """Return ``cache_path / "taxonomy"``."""
        return self.cache_path / "taxonomy"

    @classmethod
    def from_file(cls, path: Path) -> Settings:
        """Read the TOML file at ``path`` and build settings from it via :meth:`load`."""
        return cls.load(NestedDotDict.read_toml(path))

    @classmethod
    def empty(cls) -> Settings:
        """Build settings from empty data, so every value comes from the defaults."""
        return cls.load(NestedDotDict({}))

    def __post_init__(self):
        """
        Validate the freshly constructed instance.

        Raises:
            XValueError: If any int or float field is negative.
        """
        # These calls are made only for validation; their results are discarded.
        # NOTE(review): presumably each raises on an unrecognized suffix -- confirm.
        FileFormat.from_suffix(self.table_suffix)
        FileFormat.from_suffix(self.archive_filename_suffix)
        LOG_SETUP.guess_file_sink_info(self.log_suffix)
        # this happens to work for now -- we have none that can be < 0
        for k, v in self.as_dict.items():
            if isinstance(v, (int, float)) and v < 0:
                raise XValueError(f"{k} = {v} < 0")

    @classmethod
    def load(cls, data: NestedDotDict) -> Settings:
        """
        Build settings from ``data``, using the module-level defaults for lookups.

        Args:
            data: Nested settings, addressed by dotted keys such as
                ``"query.chembl.delay_sec"``.

        Returns:
            The constructed (and validated) settings.

        Raises:
            ConfigError: If ``get_as`` raises ``TypeError`` for a key.
            AssertionError: If the defaults contain keys that were never read.
        """
        # Every key read through get() is removed from this copy; anything left
        # at the end is a default that no field consumes.
        unused_default_keys = dict(defaults)

        def get(s: str, t: Type[T]) -> T:
            # a key could be accessed more than once, so discard rather than delete
            unused_default_keys.pop(s, None)
            try:
                return data.get_as(s, t, defaults[s])
            except TypeError as e:
                raise ConfigError(
                    f"Key {s}={data.get(s), defaults[s]} is not of type {t}"
                ) from e

        _continent = Suretime.Types.NtpContinents.of
        _selenium_path = get("query.selenium_driver_path", Path)
        if _selenium_path is not None:
            _selenium_path = _selenium_path.expanduser()
        # Each delay is read once and reused for both its min and max field.
        chembl_delay = get("query.chembl.delay_sec", float)
        pubchem_delay = get("query.pubchem.delay_sec", float)
        hmdb_delay = get("query.hmdb.delay_sec", float)
        # Bound to a fresh name so that the ``data`` read by get() is never rebound.
        settings = cls(
            is_testing=get("is_testing", bool),
            ntp_continent=get("search.ntp_continent_code", _continent),
            table_suffix=get("search.default_table_suffix", str),
            log_suffix=get("search.default_log_suffix", str),
            save_every=get("search.save_every", int),
            sanitize_paths=get("search.sanitize_paths", bool),
            cache_path=Path(get("cache.path", str)).expanduser(),
            chembl_expire_sec=get("cache.chembl.expire_sec", int),
            pubchem_expire_sec=get("cache.pubchem.expire_sec", int),
            taxon_expire_sec=get("cache.taxa.expire_sec", int),
            cache_gzip=get("cache.gzip", bool),
            archive_filename_suffix=get("cache.archive_filename_suffix", str),
            chembl_n_tries=get("query.chembl.n_tries", int),
            chembl_fast_save=get("query.chembl.fast_save", bool),
            chembl_timeout_sec=get("query.chembl.timeout_sec", int),
            chembl_backoff_factor=get("query.chembl.backoff_factor", float),
            chembl_query_delay_min=chembl_delay,
            chembl_query_delay_max=chembl_delay * max_coeff,
            pubchem_timeout_sec=get("query.pubchem.timeout_sec", int),
            hmdb_expire_sec=get("cache.hmdb.expire_sec", int),
            pubchem_backoff_factor=get("query.pubchem.backoff_factor", float),
            pubchem_query_delay_min=pubchem_delay,
            pubchem_query_delay_max=pubchem_delay * max_coeff,
            pubchem_n_tries=get("query.pubchem.n_tries", int),
            hmdb_timeout_sec=get("query.hmdb.timeout_sec", int),
            hmdb_backoff_factor=get("query.hmdb.backoff_factor", float),
            hmdb_query_delay_min=hmdb_delay,
            hmdb_query_delay_max=hmdb_delay * max_coeff,
            selenium_driver=get("query.selenium_driver", str).title(),
            selenium_driver_path=_selenium_path,
            log_signals=get("cli.log_signals", bool),
            log_exit=get("cli.log_exit", bool),
        )
        # we got all the required fields
        # make sure we don't have extra keys in defaults
        if unused_default_keys:
            raise AssertionError(
                f"There are {len(unused_default_keys)} extra defaults"
                f" in {defaults}: {unused_default_keys}"
            )
        return settings

    @classmethod
    def defaults(cls) -> Mapping[str, Any]:
        """Return the module-level ``defaults`` mapping."""
        return defaults

    def configure(self):
        """Install exit and/or signal tracing to ``logger.trace`` when enabled."""
        if self.log_exit:
            SystemTools.trace_exit(CommonTools.make_writer(logger.trace))
        if self.log_signals:
            SystemTools.trace_signals(CommonTools.make_writer(logger.trace))

    def configure_chembl(self):
        """
        Copy the ChEMBL-related settings onto the chembl_webresource_client settings.

        Does nothing beyond the import when ``Globals.disable_chembl`` is set.
        """
        # imported here rather than at module level
        from chembl_webresource_client.settings import Settings as ChemblSettings

        if not Globals.disable_chembl:
            instance = ChemblSettings.Instance()
            instance.CACHING = True
            instance.CACHE_NAME = str(self.chembl_cache_path.resolve() / "chembl.sqlite")
            logger.debug(f"ChEMBL cache is at {instance.CACHE_NAME}")
            instance.TOTAL_RETRIES = self.chembl_n_tries
            instance.FAST_SAVE = self.chembl_fast_save
            instance.TIMEOUT = self.chembl_timeout_sec
            instance.BACKOFF_FACTOR = self.chembl_backoff_factor
            instance.CACHE_EXPIRE = self.chembl_expire_sec

    @classmethod
    def set_path_for_selenium(cls) -> None:
        """
        Append the directories where a Selenium driver may be found to ``PATH``.

        Uses the module-level ``SETTINGS`` instance for the driver directory.
        """
        cls.add_to_path([SETTINGS.driver_path, MandosResources.dir(), Globals.where_am_i_installed])

    @classmethod
    def add_to_path(cls, paths: Collection[Union[None, str, Path]]) -> None:
        """
        Append ``paths`` to the ``PATH`` environment variable.

        ``None`` entries are skipped and duplicates are dropped. The remaining
        entries are appended in the order given, so earlier entries take
        precedence over later ones. Nothing happens if no entries remain.

        Args:
            paths: Directories to append; may contain ``None``.

        Raises:
            DirDoesNotExistError: If an entry exists but is neither a directory nor a mount.
        """
        # dict.fromkeys de-duplicates while keeping the caller's order
        unique = list(dict.fromkeys(Path(p) for p in paths if p is not None))
        for path in unique:
            if path.exists() and not path.is_dir() and not path.is_mount():
                raise DirDoesNotExistError(f"Path {path} is not a directory or mount")
        if not unique:
            return
        addition = os.pathsep.join(str(p) for p in unique)
        # PATH may be unset or empty; avoid a KeyError and a leading empty entry
        current = os.environ.get("PATH", "")
        os.environ["PATH"] = f"{current}{os.pathsep}{addition}" if current else addition
        logger.debug(f"Added to PATH: {addition}")
223
|
|
|
|
224
|
|
|
|
225
|
|
|
# Build the module-wide SETTINGS once at import time: from the settings file
# when it exists, otherwise purely from the packaged defaults.
if not Globals.settings_path.exists():
    SETTINGS = Settings.empty()
    logger.success(f"Using defaults (no file at {Globals.settings_path})")
else:
    SETTINGS = Settings.from_file(Globals.settings_path)
    logger.success(f"Read settings at {Globals.settings_path}")
# Apply the exit/signal tracing options of whichever settings were chosen.
SETTINGS.configure()
232
|
|
|
|
233
|
|
|
|
234
|
|
|
class QueryExecutors:
    """
    One QueryExecutor per remote data source, created when this class body runs.

    Each executor is constructed from the source's minimum and maximum query
    delay as stored on the module-level ``SETTINGS``.
    """

    # (min delay, max delay) taken from SETTINGS for each source
    chembl = QueryExecutor(SETTINGS.chembl_query_delay_min, SETTINGS.chembl_query_delay_max)
    pubchem = QueryExecutor(SETTINGS.pubchem_query_delay_min, SETTINGS.pubchem_query_delay_max)
    hmdb = QueryExecutor(SETTINGS.hmdb_query_delay_min, SETTINGS.hmdb_query_delay_max)
238
|
|
|
|
239
|
|
|
|
240
|
|
|
# Alias for the QueryExecutors class itself (not an instance); its executors
# are class attributes.
QUERY_EXECUTORS = QueryExecutors

# Names exported by ``from <this module> import *``.
__all__ = ["SETTINGS", "QUERY_EXECUTORS"]
244
|
|
|
|