1
|
|
|
# copied from sktime, BSD-3-Clause License (see LICENSE file) |
2
|
|
|
# to be moved to scikit-base in the future |
3
|
|
|
"""Common abstraction utilities for parallelization backends. |
4
|
|
|
|
5
|
|
|
New parallelization or iteration backends can be added easily as follows: |
6
|
|
|
|
7
|
|
|
* Add a new backend name to ``backend_dict``, syntax is |
8
|
|
|
backend_name: backend_type, where backend_type collects backend options, |
9
|
|
|
e.g., multiple options for a single parallelization backend. |
10
|
|
|
* Add a new function to ``para_dict``, should have name |
11
|
|
|
``_parallelize_<backend_name>`` and take the same arguments as |
12
|
|
|
``_parallelize_none``. Ensure that ``backend`` and ``backend_params`` are arguments, |
13
|
|
|
even if there is only one backend option, or no additional parameters. |
14
|
|
|
* add the backend string in the docstring of parallelize, and any downstream |
15
|
|
|
functions that use ``parallelize`` and expose the backend parameter as an argument
16
|
|
|
""" |
17
|
|
|
|
18
|
|
|
|
19
|
|
|
def parallelize(fun, iter, meta=None, backend=None, backend_params=None):
    """Parallelize loop over iter via backend.

    Executes ``fun(x, meta=meta)`` in parallel for ``x`` in ``iter``,
    and returns the results as a list in the same order as ``iter``.

    Uses the iteration or parallelization backend specified by ``backend``.

    Parameters
    ----------
    fun : callable, must have exactly two arguments, second argument of name "meta"
        function to be executed in parallel

    iter : iterable
        iterable over which to parallelize, elements are passed to fun in order,
        to the first argument

    meta : dict, optional
        variables to be passed to fun, as the second argument, under the key ``meta``

    backend : str, optional
        backend to use for parallelization, one of

        - "None": executes loop sequentially, simple list comprehension
        - "loky", "multiprocessing" and "threading": uses ``joblib`` ``Parallel`` loops
        - "joblib": custom and 3rd party ``joblib`` backends, e.g., ``spark``
        - "dask": uses ``dask``, requires ``dask`` package in environment
        - "dask_lazy": same as ``"dask"``, but returns delayed object instead of list
        - "ray": uses a ray remote to execute jobs in parallel

    backend_params : dict, optional
        additional parameters passed to the backend as config.
        Valid keys depend on the value of ``backend``:

        - "None": no additional parameters, ``backend_params`` is ignored
        - "loky", "multiprocessing" and "threading": default ``joblib`` backends
          any valid keys for ``joblib.Parallel`` can be passed here, e.g., ``n_jobs``,
          with the exception of ``backend`` which is directly controlled by ``backend``.
          If ``n_jobs`` is not passed, it will default to ``-1``, other parameters
          will default to ``joblib`` defaults.
        - "joblib": custom and 3rd party ``joblib`` backends, e.g., ``spark``.
          any valid keys for ``joblib.Parallel`` can be passed here, e.g., ``n_jobs``,
          ``backend`` must be passed as a key of ``backend_params`` in this case.
          If ``n_jobs`` is not passed, it will default to ``-1``, other parameters
          will default to ``joblib`` defaults.
        - "dask": any valid keys for ``dask.compute`` can be passed, e.g., ``scheduler``

        - "ray": The following keys can be passed:

            - "ray_remote_args": dictionary of valid keys for ``ray.init``
            - "shutdown_ray": bool, default=True; False prevents ``ray`` from shutting
              down after parallelization.
            - "logger_name": str, default="ray"; name of the logger to use.
            - "mute_warnings": bool, default=False; if True, suppresses warnings

    """
    # normalize optional arguments to their documented defaults
    meta = {} if meta is None else meta
    backend = "None" if backend is None else backend
    backend_params = {} if backend_params is None else backend_params

    # two-step dispatch: backend string -> backend family -> executor function
    executor = para_dict[backend_dict[backend]]
    return executor(
        fun=fun, iter=iter, meta=meta, backend=backend, backend_params=backend_params
    )
89
|
|
|
|
90
|
|
|
|
91
|
|
|
# maps user-facing backend strings onto backend "families"; several user-facing
# strings (e.g., "loky", "multiprocessing", "threading") share one family
backend_dict = {"None": "none"}
backend_dict.update(
    {name: "joblib" for name in ("loky", "multiprocessing", "threading", "joblib")}
)
backend_dict.update({"dask": "dask", "dask_lazy": "dask", "ray": "ray"})

# registry mapping backend family name -> parallelization function,
# populated next to each _parallelize_<family> definition below
para_dict = {}
102
|
|
|
|
103
|
|
|
|
104
|
|
|
def _parallelize_none(fun, iter, meta, backend, backend_params): |
105
|
|
|
"""Execute loop via simple sequential list comprehension.""" |
106
|
|
|
ret = [fun(x, meta=meta) for x in iter] |
107
|
|
|
return ret |
108
|
|
|
|
109
|
|
|
|
110
|
|
|
# register the sequential executor under the "none" backend family
para_dict["none"] = _parallelize_none
111
|
|
|
|
112
|
|
|
|
113
|
|
|
def _parallelize_joblib(fun, iter, meta, backend, backend_params):
    """Parallelize loop via joblib Parallel.

    Handles the "loky", "multiprocessing", "threading" backends, where the
    joblib backend string is taken from ``backend``, and the "joblib" backend,
    where the joblib backend string must be given in ``backend_params``.
    """
    from joblib import Parallel, delayed

    job_kwargs = dict(backend_params)

    # "joblib" is the two-layer dispatch case: the user must name the actual
    # joblib backend (e.g., "spark") in backend_params, otherwise raise
    if backend == "joblib" and "backend" not in job_kwargs:
        raise ValueError(
            '"joblib" was selected as first layer parallelization backend, '
            "but no backend string was "
            'passed in the backend parameters dict, e.g., "spark". '
            "Please specify a backend to joblib as a key-value pair "
            "in the backend_params arg or the backend:parallel:params config "
            'when using "joblib". '
            'For clarity, "joblib" should only be used for two-layer '
            "backend dispatch, where the first layer is joblib, "
            "and the second layer is a custom backend of joblib, e.g., spark. "
            "For first-party joblib backends, please use the backend string "
            'of sktime directly, e.g., by specifying "multiprocessing" or "loky".'
        )

    # for first-party backends, ``backend`` directly controls the joblib
    # backend, overriding anything passed via backend_params
    if backend != "joblib":
        job_kwargs["backend"] = backend

    # default to using all cores unless the user said otherwise
    job_kwargs.setdefault("n_jobs", -1)

    return Parallel(**job_kwargs)(delayed(fun)(x, meta=meta) for x in iter)
147
|
|
|
|
148
|
|
|
|
149
|
|
|
# register the joblib executor; serves "loky", "multiprocessing",
# "threading", and "joblib" backend strings, per backend_dict
para_dict["joblib"] = _parallelize_joblib
150
|
|
|
|
151
|
|
|
|
152
|
|
|
def _parallelize_dask(fun, iter, meta, backend, backend_params):
    """Parallelize loop via dask.

    For ``backend="dask"``, computes and returns the results;
    otherwise ("dask_lazy") returns the list of delayed objects uncomputed.
    """
    from dask import compute, delayed

    delayed_jobs = [delayed(fun)(x, meta=meta) for x in iter]

    # lazy variant: hand the delayed objects back to the caller uncomputed
    if backend != "dask":
        return delayed_jobs

    return compute(*delayed_jobs, **backend_params)
161
|
|
|
|
162
|
|
|
|
163
|
|
|
# register the dask executor; serves "dask" and "dask_lazy" backend strings
para_dict["dask"] = _parallelize_dask
164
|
|
|
|
165
|
|
|
|
166
|
|
|
def _parallelize_ray(fun, iter, meta, backend, backend_params):
    """Parallelize loop via ray.

    Executes ``fun(x, meta=meta)`` as a ray remote task for each ``x`` in
    ``iter``, and returns the results as a list in the same order as ``iter``.

    Parameters
    ----------
    fun : callable, must have exactly two arguments, second argument of name "meta"
        function to be executed in parallel
    iter : iterable
        iterable over which to parallelize, elements passed to fun in order
    meta : dict
        variables to be passed to fun under the ``meta`` argument
    backend : str
        backend string; unused here, this function only serves "ray"
    backend_params : dict
        additional backend config; recognized keys are
        "ray_remote_args" (dict of valid keys for ``ray.init``, default ``{}``),
        "shutdown_ray" (bool, default ``True``),
        "logger_name" (str, default ``"ray"``),
        "mute_warnings" (bool, default ``False``)
    """
    import logging
    import warnings

    import ray

    par_params = backend_params.copy()

    # read the possible additional keys
    # bugfix: logger name defaults to "ray", as documented in ``parallelize``;
    # previously defaulted to None, i.e., the root logger
    logger = logging.getLogger(par_params.get("logger_name", "ray"))
    mute_warnings = par_params.get("mute_warnings", False)
    shutdown_ray = par_params.get("shutdown_ray", True)

    if "ray_remote_args" not in par_params.keys():
        par_params["ray_remote_args"] = {}

    @ray.remote  # pragma: no cover
    def _ray_execute_function(
        fun, params: dict, meta: dict, mute_warnings: bool = False
    ):
        if mute_warnings:
            warnings.filterwarnings("ignore")  # silence sktime warnings
        assert ray.is_initialized()
        # pass meta by keyword, consistent with the contract and other backends
        result = fun(params, meta=meta)
        return result

    # start a ray session only if the caller has not already done so
    if not ray.is_initialized():
        logger.info("Starting Ray Parallel")
        context = ray.init(**par_params["ray_remote_args"])
        logger.info(
            f"Ray initialized. Open dashboard at http://{context.dashboard_url}"
        )

    # this is to keep the order of results while still using wait to optimize runtime
    refs = [
        _ray_execute_function.remote(fun, x, meta, mute_warnings=mute_warnings)
        for x in iter
    ]
    res_dict = dict.fromkeys(refs)

    # collect results as they finish, storing each under its object ref
    unfinished = refs
    while unfinished:
        finished, unfinished = ray.wait(unfinished, num_returns=1)
        res_dict[finished[0]] = ray.get(finished[0])

    if shutdown_ray:
        ray.shutdown()

    # reassemble in submission order via the ordered refs list
    res = [res_dict[ref] for ref in refs]
    return res
217
|
|
|
|
218
|
|
|
|
219
|
|
|
# register the ray executor under the "ray" backend family
para_dict["ray"] = _parallelize_ray
220
|
|
|
|
221
|
|
|
|
222
|
|
|
# backends excluded from CI test fixtures
SKIP_FIXTURES = ["ray"]  # unstable, sporadic crashes in CI, see bug 8149
226
|
|
|
|
227
|
|
|
|
228
|
|
|
def _get_parallel_test_fixtures(naming="estimator"):
    """Return fixtures for parallelization tests.

    Returns a list of parameter fixtures, where each fixture
    is a dict with keys "backend" and "backend_params".

    Parameters
    ----------
    naming : str, optional
        naming convention for the parameters, one of

        "estimator": for use in estimator constructors,
        ``backend`` and ``backend_params``
        "config": for use in ``set_config``,
        ``backend:parallel`` and ``backend:parallel:params``

    Returns
    -------
    fixtures : list of dict
        list of backend parameter fixtures
        keys depend on ``naming`` parameter, see above
        either ``backend`` and ``backend_params`` (``naming="estimator"``),
        or ``backend:parallel`` and ``backend:parallel:params`` (``naming="config"``)
        values are backend strings and backend parameter dicts
        only backends that are available in the environment are included
    """
    from skbase.utils.dependencies import _check_soft_dependencies

    fixtures = []

    # test no parallelization
    fixtures.append({"backend": "None", "backend_params": {}})

    # test joblib backends
    for backend in ["loky", "multiprocessing", "threading"]:
        fixtures.append({"backend": backend, "backend_params": {}})
        fixtures.append({"backend": backend, "backend_params": {"n_jobs": 2}})
        fixtures.append({"backend": backend, "backend_params": {"n_jobs": -1}})

    # test dask backends
    if _check_soft_dependencies("dask", severity="none"):
        fixtures.append({"backend": "dask", "backend_params": {}})
        fixtures.append({"backend": "dask", "backend_params": {"scheduler": "sync"}})

    # TODO: add a faster ray test fixture, see bug 8149; draft kept below
    # if _check_soft_dependencies("ray", severity="none"):
    #     import os
    #
    #     fixtures.append(
    #         {
    #             "backend": "ray",
    #             "backend_params": {
    #                 "mute_warnings": True,
    #                 "ray_remote_args": {"num_cpus": os.cpu_count() - 1},
    #             },
    #         }
    #     )
    #     # remove backends in SKIP_FIXTURES from fixtures
    #     fixtures = [x for x in fixtures if x["backend"] not in SKIP_FIXTURES]

    # bugfix: honour the documented "config" naming convention;
    # previously the ``naming`` parameter was silently ignored
    if naming == "config":
        fixtures = [
            {
                "backend:parallel": fixture["backend"],
                "backend:parallel:params": fixture["backend_params"],
            }
            for fixture in fixtures
        ]

    return fixtures
291
|
|
|
|