|
1
|
|
|
"""Training algorithms.""" |
|
2
|
|
|
import logging |
|
3
|
|
|
import itertools |
|
4
|
|
|
from abc import ABCMeta, abstractmethod |
|
5
|
|
|
from collections import OrderedDict |
|
6
|
|
|
from six.moves import reduce |
|
7
|
|
|
|
|
8
|
|
|
from picklable_itertools.extras import equizip |
|
9
|
|
|
|
|
10
|
|
|
import theano |
|
11
|
|
|
from six import add_metaclass |
|
12
|
|
|
from theano import tensor |
|
13
|
|
|
|
|
14
|
|
|
from blocks.graph import ComputationGraph |
|
15
|
|
|
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER |
|
16
|
|
|
from blocks.theano_expressions import l2_norm |
|
17
|
|
|
from blocks.utils import (dict_subset, pack, shared_floatx, |
|
18
|
|
|
shared_floatx_zeros_matching) |
|
19
|
|
|
|
|
20
|
|
|
logger = logging.getLogger(__name__) |
|
21
|
|
|
|
|
22
|
|
|
|
|
23
|
|
|
def _create_algorithm_buffer_for(param, *args, **kwargs): |
|
24
|
|
|
buf = shared_floatx_zeros_matching(param, *args, **kwargs) |
|
25
|
|
|
buf.tag.for_parameter = param |
|
26
|
|
|
add_role(buf, ALGORITHM_BUFFER) |
|
27
|
|
|
return buf |
|
28
|
|
|
|
|
29
|
|
|
|
|
30
|
|
|
@add_metaclass(ABCMeta) |
|
31
|
|
|
class TrainingAlgorithm(object): |
|
32
|
|
|
"""Base class for training algorithms. |
|
33
|
|
|
|
|
34
|
|
|
A training algorithm object has a simple life-cycle. |
|
35
|
|
|
First it is initialized by calling its :meth:`initialize` method. |
|
36
|
|
|
At this stage, for instance, Theano functions can be compiled. |
|
37
|
|
|
After that the :meth:`process_batch` method is repeatedly |
|
38
|
|
|
called with a batch of training data as a parameter. |
|
39
|
|
|
|
|
40
|
|
|
""" |
|
41
|
|
|
@abstractmethod |
|
42
|
|
|
def initialize(self, **kwargs): |
|
43
|
|
|
"""Initialize the training algorithm.""" |
|
44
|
|
|
pass |
|
45
|
|
|
|
|
46
|
|
|
@abstractmethod |
|
47
|
|
|
def process_batch(self, batch): |
|
48
|
|
|
"""Process a batch of training data. |
|
49
|
|
|
|
|
50
|
|
|
Attributes |
|
51
|
|
|
---------- |
|
52
|
|
|
batch : dict |
|
53
|
|
|
A dictionary of (source name, data) pairs. |
|
54
|
|
|
|
|
55
|
|
|
""" |
|
56
|
|
|
pass |
|
57
|
|
|
|
|
58
|
|
|
|
|
59
|
|
|
variable_mismatch_error = """ |
|
60
|
|
|
|
|
61
|
|
|
Blocks tried to match the sources ({sources}) of the training dataset to \ |
|
62
|
|
|
the names of the Theano variables ({variables}), but failed to do so. \ |
|
63
|
|
|
If you want to train on a subset of the sources that your dataset provides, \ |
|
64
|
|
|
pass the `sources` keyword argument to its constructor. Or pass \ |
|
65
|
|
|
on_unused_sources='warn' or on_unused_sources='ignore' to \ |
|
66
|
|
|
the GradientDescent algorithm.""" |
|
67
|
|
|
|
|
68
|
|
|
source_missing_error = """ |
|
69
|
|
|
|
|
70
|
|
|
Blocks didn't find all the sources ({sources}) of the training dataset \ |
|
71
|
|
|
that match the names of the Theano variables ({variables}).""" |
|
72
|
|
|
|
|
73
|
|
|
|
|
74
|
|
|
determinism_error = """Cannot infer parameter list in a fixed order. |
|
|
|
|
|
|
75
|
|
|
|
|
76
|
|
|
Because dictionaries are unordered (and Python uses randomized hashing, \ |
|
77
|
|
|
which can change the iteration order over the same dictionary from one \ |
|
78
|
|
|
interpreter session to the next), Blocks cannot infer the parameters list \ |
|
79
|
|
|
from a plain dictionary of gradients in an order that is reproducible \ |
|
80
|
|
|
across interpreter sessions; please either specify the parameters \ |
|
81
|
|
|
explicitly or pass gradients as an OrderedDict (though exercise care in \ |
|
82
|
|
|
constructing that OrderedDict, as an OrderedDict \ created by iterating \ |
|
83
|
|
|
over an unordered iterable (e.g. a dict) will still \ have an arbitrary \ |
|
84
|
|
|
and unpredictable order that could cause problems with \ |
|
85
|
|
|
reproducibility).""" |
|
86
|
|
|
|
|
87
|
|
|
|
|
88
|
|
|
class UpdatesAlgorithm(TrainingAlgorithm): |
|
89
|
|
|
"""Base class for algorithms that use Theano functions with updates. |
|
90
|
|
|
|
|
91
|
|
|
Parameters |
|
92
|
|
|
---------- |
|
93
|
|
|
updates : list of tuples or :class:`~collections.OrderedDict` |
|
94
|
|
|
The updates that should be performed. |
|
95
|
|
|
theano_func_kwargs : dict, optional |
|
96
|
|
|
A passthrough to `theano.function` for additional arguments. |
|
97
|
|
|
Useful for passing `profile` or `mode` arguments to the theano |
|
98
|
|
|
function that will be compiled for the algorithm. |
|
99
|
|
|
on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn' |
|
100
|
|
|
Controls behavior when not all sources in a batch are used |
|
101
|
|
|
(i.e. there is no variable with a matching name in the inputs |
|
102
|
|
|
of the computational graph of the updates). |
|
103
|
|
|
|
|
104
|
|
|
Attributes |
|
105
|
|
|
---------- |
|
106
|
|
|
updates : list of :class:`~tensor.TensorSharedVariable` updates |
|
107
|
|
|
Updates to be done for every batch. It is required that the |
|
108
|
|
|
updates are done using the old values of optimized parameters. |
|
109
|
|
|
|
|
110
|
|
|
""" |
|
111
|
|
|
def __init__(self, updates=None, theano_func_kwargs=None, |
|
112
|
|
|
on_unused_sources='raise', **kwargs): |
|
113
|
|
|
self.updates = [] if updates is None else updates |
|
114
|
|
|
self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs |
|
115
|
|
|
is not None else dict()) |
|
116
|
|
|
self.on_unused_sources = on_unused_sources |
|
117
|
|
|
super(UpdatesAlgorithm, self).__init__(**kwargs) |
|
118
|
|
|
|
|
119
|
|
|
def initialize(self): |
|
120
|
|
|
logger.info("Initializing the training algorithm") |
|
121
|
|
|
update_values = [new_value for _, new_value in self.updates] |
|
122
|
|
|
logger.debug("Inferring graph inputs...") |
|
123
|
|
|
self.inputs = ComputationGraph(update_values).inputs |
|
124
|
|
|
logger.debug("Compiling training function...") |
|
125
|
|
|
self._function = theano.function( |
|
126
|
|
|
self.inputs, [], updates=self.updates, **self.theano_func_kwargs) |
|
127
|
|
|
logger.info("The training algorithm is initialized") |
|
128
|
|
|
|
|
129
|
|
|
@property |
|
130
|
|
|
def updates(self): |
|
131
|
|
|
return self._updates |
|
132
|
|
|
|
|
133
|
|
|
@updates.setter |
|
134
|
|
|
def updates(self, value): |
|
135
|
|
|
self._updates = value |
|
136
|
|
|
|
|
137
|
|
|
def add_updates(self, updates): |
|
138
|
|
|
"""Add updates to the training process. |
|
139
|
|
|
|
|
140
|
|
|
The updates will be done _before_ the parameters are changed. |
|
141
|
|
|
|
|
142
|
|
|
Parameters |
|
143
|
|
|
---------- |
|
144
|
|
|
updates : list of tuples or :class:`~collections.OrderedDict` |
|
145
|
|
|
The updates to add. |
|
146
|
|
|
|
|
147
|
|
|
""" |
|
148
|
|
|
if isinstance(updates, OrderedDict): |
|
149
|
|
|
updates = list(updates.items()) |
|
150
|
|
|
if not isinstance(updates, list): |
|
151
|
|
|
raise ValueError |
|
152
|
|
|
self.updates.extend(updates) |
|
153
|
|
|
|
|
154
|
|
|
def _validate_source_names(self, batch): |
|
155
|
|
|
in_names = [v.name for v in self.inputs] |
|
156
|
|
|
|
|
157
|
|
|
if not set(in_names).issubset(set(batch.keys())): |
|
158
|
|
|
raise ValueError("Didn't find all sources: " + |
|
159
|
|
|
source_missing_error.format( |
|
160
|
|
|
sources=batch.keys(), |
|
161
|
|
|
variables=in_names)) |
|
162
|
|
|
if not set(batch.keys()).issubset(set(in_names)): |
|
163
|
|
|
if self.on_unused_sources == 'ignore': |
|
164
|
|
|
pass |
|
165
|
|
|
elif self.on_unused_sources == 'warn': |
|
166
|
|
|
if not hasattr(self, '_unused_source_warned'): |
|
167
|
|
|
logger.warn(variable_mismatch_error.format( |
|
168
|
|
|
sources=batch.keys(), |
|
169
|
|
|
variables=in_names)) |
|
170
|
|
|
self._unused_source_warned = True |
|
171
|
|
|
elif self.on_unused_sources == 'raise': |
|
172
|
|
|
raise ValueError( |
|
173
|
|
|
"mismatch of variable names and data sources" + |
|
174
|
|
|
variable_mismatch_error.format( |
|
175
|
|
|
sources=batch.keys(), |
|
176
|
|
|
variables=in_names)) |
|
177
|
|
|
else: |
|
178
|
|
|
raise ValueError("Wrong value of on_unused_sources: {}." |
|
179
|
|
|
.format(self.on_unused_sources)) |
|
180
|
|
|
|
|
181
|
|
|
def process_batch(self, batch): |
|
182
|
|
|
self._validate_source_names(batch) |
|
183
|
|
|
ordered_batch = [batch[v.name] for v in self.inputs] |
|
184
|
|
|
self._function(*ordered_batch) |
|
185
|
|
|
|
|
186
|
|
|
|
|
187
|
|
|
class GradientDescent(UpdatesAlgorithm): |
|
188
|
|
|
"""A base class for all gradient descent algorithms. |
|
189
|
|
|
|
|
190
|
|
|
By "gradient descent" we mean a training algorithm of the following |
|
191
|
|
|
form: |
|
192
|
|
|
|
|
193
|
|
|
.. code-block:: python |
|
194
|
|
|
|
|
195
|
|
|
for batch in data: |
|
196
|
|
|
steps = step_rule.compute_steps(parameters, |
|
197
|
|
|
gradients_wr_parameters) |
|
198
|
|
|
for parameter in parameters: |
|
199
|
|
|
parameter -= steps[parameter] |
|
200
|
|
|
|
|
201
|
|
|
Note, that the step is *subtracted, not added*! This is done in order |
|
202
|
|
|
to make step rule chaining possible. |
|
203
|
|
|
|
|
204
|
|
|
Parameters |
|
205
|
|
|
---------- |
|
206
|
|
|
cost : :class:`~tensor.TensorVariable`, optional |
|
207
|
|
|
The objective to be minimized. Unused if `gradients` is specified. |
|
208
|
|
|
parameters : list of :class:`~tensor.TensorSharedVariable`, optional |
|
209
|
|
|
The parameters to be tuned. If not provided, inferred from the |
|
210
|
|
|
keys of `gradients` (in which case `gradients` *must* be an |
|
211
|
|
|
`OrderedDict`). |
|
212
|
|
|
step_rule : instance of :class:`StepRule`, optional |
|
213
|
|
|
An object encapsulating most of the algorithm's logic. Its |
|
214
|
|
|
`compute_steps` method is called to get Theano expression for |
|
215
|
|
|
steps. Note, that the step rule might have a state, e.g. to |
|
216
|
|
|
remember a weighted sum of gradients from previous steps like it is |
|
217
|
|
|
done in gradient descent with momentum. If ``None``, an instance of |
|
218
|
|
|
:class:`Scale` is created. |
|
219
|
|
|
gradients : OrderedDict, optional |
|
220
|
|
|
A dictionary mapping a parameter to an expression for the cost's |
|
221
|
|
|
gradient with respect to the parameter. If ``None``, the gradient |
|
222
|
|
|
are taken automatically using :func:`theano.gradient.grad`. |
|
223
|
|
|
known_grads : dict, optional |
|
224
|
|
|
A passthrough to `theano.tensor.grad`'s `known_grads` argument. |
|
225
|
|
|
Useful when you know the [approximate] gradients of some |
|
226
|
|
|
sub-expressions and would like Theano to use that information |
|
227
|
|
|
to compute parameter gradients. Only makes sense when `gradients` |
|
228
|
|
|
is `None`. |
|
229
|
|
|
consider_constant : list, optional |
|
230
|
|
|
A passthrough to `theano.tensor.grad`'s `consider_constant` |
|
231
|
|
|
argument. A list of expressions through which gradients will not |
|
232
|
|
|
be backpropagated. Only makes sense when `gradients` is `None`. |
|
233
|
|
|
|
|
234
|
|
|
Attributes |
|
235
|
|
|
---------- |
|
236
|
|
|
gradients : OrderedDict |
|
237
|
|
|
The gradient dictionary. |
|
238
|
|
|
step_rule : instance of :class:`StepRule` |
|
239
|
|
|
The step rule. |
|
240
|
|
|
|
|
241
|
|
|
Notes |
|
242
|
|
|
----- |
|
243
|
|
|
Changing `updates` attribute or calling `add_updates` after |
|
244
|
|
|
the `initialize` method is called will have no effect. |
|
245
|
|
|
|
|
246
|
|
|
`gradients` must be an `OrderedDict` if `parameters` is unspecified |
|
247
|
|
|
because ordinary dictionaries have an unpredictable iteration |
|
248
|
|
|
order due to hash randomization (which is enabled by default since |
|
249
|
|
|
versions 2.7.3 and 3.2.3 of Python). This source of variability, |
|
250
|
|
|
when combined with Theano's heuristic graph optimizations, can cause |
|
251
|
|
|
serious reproducibility issues. |
|
252
|
|
|
|
|
253
|
|
|
.. todo:: |
|
254
|
|
|
|
|
255
|
|
|
Some shared variables are not parameters (e.g. those created by |
|
256
|
|
|
random streams). |
|
257
|
|
|
|
|
258
|
|
|
.. todo:: |
|
259
|
|
|
|
|
260
|
|
|
Due to a rather premature status of the :class:`ComputationGraph` |
|
261
|
|
|
class the parameter used only inside scans are not fetched |
|
262
|
|
|
currently. |
|
263
|
|
|
|
|
264
|
|
|
""" |
|
265
|
|
|
def __init__(self, cost=None, parameters=None, step_rule=None, |
|
266
|
|
|
gradients=None, known_grads=None, consider_constant=None, |
|
267
|
|
|
**kwargs): |
|
268
|
|
|
# Set initial values for cost, parameters, gradients. |
|
269
|
|
|
self.cost = cost |
|
270
|
|
|
self.parameters = parameters |
|
271
|
|
|
self.gradients = gradients |
|
272
|
|
|
|
|
273
|
|
|
# If we don't have gradients, we'll need to infer them from the |
|
274
|
|
|
# cost and the parameters, both of which must not be None. |
|
275
|
|
|
if not self.gradients: |
|
276
|
|
|
self.gradients = self._compute_gradients(known_grads, |
|
277
|
|
|
consider_constant) |
|
278
|
|
|
else: |
|
279
|
|
|
if cost is not None: |
|
280
|
|
|
logger.warning(('{}: gradients already specified directly; ' |
|
281
|
|
|
'cost is unused.' |
|
282
|
|
|
.format(self.__class__.__name__))) |
|
283
|
|
|
if self.parameters is None and isinstance(gradients, OrderedDict): |
|
284
|
|
|
# If the dictionary is ordered, it's safe to use the keys |
|
285
|
|
|
# as they have a deterministic order. |
|
286
|
|
|
self.parameters = list(self.gradients.keys()) |
|
287
|
|
|
elif self.parameters is not None: |
|
288
|
|
|
# If parameters and gradients.keys() don't match we can |
|
289
|
|
|
# try to recover if gradients is ordered. |
|
290
|
|
|
if set(self.parameters) != set(self.gradients.keys()): |
|
291
|
|
|
logger.warn("Specified parameters list does not match " |
|
292
|
|
|
"keys in provided gradient dictionary; " |
|
293
|
|
|
"using parameters inferred from gradients") |
|
294
|
|
|
if not isinstance(self.gradients, OrderedDict): |
|
295
|
|
|
raise ValueError(determinism_error) |
|
296
|
|
|
self.parameters = list(self.gradients.keys()) |
|
297
|
|
|
else: |
|
298
|
|
|
# self.parameters is not None, and gradients isn't |
|
299
|
|
|
# an OrderedDict. We can't do anything safe. |
|
300
|
|
|
raise ValueError(determinism_error) |
|
301
|
|
|
if known_grads: |
|
302
|
|
|
raise ValueError("known_grads has no effect when gradients " |
|
303
|
|
|
"are passed in") |
|
304
|
|
|
if consider_constant is not None: |
|
305
|
|
|
raise ValueError("consider_constant has no effect when " |
|
306
|
|
|
"gradients are passed in") |
|
307
|
|
|
|
|
308
|
|
|
# The order in which the different gradient terms appears |
|
309
|
|
|
# here matters, as floating point addition is non-commutative (and |
|
310
|
|
|
# Theano's graph optimizations are not order-independent). |
|
311
|
|
|
# This is why we do not use .values(). |
|
312
|
|
|
gradient_values = [self.gradients[p] for p in self.parameters] |
|
313
|
|
|
self.total_gradient_norm = (l2_norm(gradient_values) |
|
314
|
|
|
.copy(name="total_gradient_norm")) |
|
315
|
|
|
|
|
316
|
|
|
self.step_rule = step_rule if step_rule else Scale() |
|
317
|
|
|
logger.debug("Computing parameter steps...") |
|
318
|
|
|
self.steps, self.step_rule_updates = ( |
|
319
|
|
|
self.step_rule.compute_steps(self.gradients)) |
|
320
|
|
|
|
|
321
|
|
|
# Same as gradient_values above: the order may influence a |
|
322
|
|
|
# bunch of things, so enforce a consistent one (don't use |
|
323
|
|
|
# .values()). |
|
324
|
|
|
step_values = [self.steps[p] for p in self.parameters] |
|
325
|
|
|
self.total_step_norm = (l2_norm(step_values) |
|
326
|
|
|
.copy(name="total_step_norm")) |
|
327
|
|
|
|
|
328
|
|
|
# Once again, iterating on gradients may not be deterministically |
|
329
|
|
|
# ordered if it is not an OrderedDict. We add the updates here in |
|
330
|
|
|
# the order specified in self.parameters. Keep it this way to |
|
331
|
|
|
# maintain reproducibility. |
|
332
|
|
|
kwargs.setdefault('updates', []).extend( |
|
333
|
|
|
itertools.chain(((parameter, parameter - self.steps[parameter]) |
|
334
|
|
|
for parameter in self.parameters), |
|
335
|
|
|
self.step_rule_updates) |
|
336
|
|
|
) |
|
337
|
|
|
super(GradientDescent, self).__init__(**kwargs) |
|
338
|
|
|
|
|
339
|
|
|
def _compute_gradients(self, known_grads, consider_constant): |
|
340
|
|
|
if self.cost is None: |
|
341
|
|
|
raise ValueError("can't infer gradients; no cost specified") |
|
342
|
|
|
elif self.parameters is None or len(self.parameters) == 0: |
|
343
|
|
|
raise ValueError("can't infer gradients; no parameters " |
|
344
|
|
|
"specified") |
|
345
|
|
|
# While this strictly speaking could be a dict and not an |
|
346
|
|
|
# OrderedDict (because we iterate over it in the order of |
|
347
|
|
|
# self.parameters), this guards a little bit against |
|
348
|
|
|
# nondeterminism introduced by future refactoring. |
|
349
|
|
|
logger.info("Taking the cost gradient") |
|
350
|
|
|
gradients = OrderedDict( |
|
351
|
|
|
equizip(self.parameters, tensor.grad( |
|
352
|
|
|
self.cost, self.parameters, |
|
353
|
|
|
known_grads=known_grads, |
|
354
|
|
|
consider_constant=consider_constant))) |
|
355
|
|
|
logger.info("The cost gradient computation graph is built") |
|
356
|
|
|
return gradients |
|
357
|
|
|
|
|
358
|
|
|
|
|
359
|
|
|
@add_metaclass(ABCMeta) |
|
360
|
|
|
class StepRule(object): |
|
361
|
|
|
"""A rule to compute steps for a gradient descent algorithm.""" |
|
362
|
|
|
def compute_step(self, parameter, previous_step): |
|
363
|
|
|
"""Build a Theano expression for the step for a parameter. |
|
364
|
|
|
|
|
365
|
|
|
This method is called by default implementation of |
|
366
|
|
|
:meth:`compute_steps`, it relieves from writing a loop each time. |
|
367
|
|
|
|
|
368
|
|
|
Parameters |
|
369
|
|
|
---------- |
|
370
|
|
|
parameter : :class:`~tensor.TensorSharedVariable` |
|
371
|
|
|
The parameter. |
|
372
|
|
|
previous_step : :class:`~tensor.TensorVariable` |
|
373
|
|
|
Some quantity related to the gradient of the cost with respect |
|
374
|
|
|
to the parameter, either the gradient itself or a step in a |
|
375
|
|
|
related direction. |
|
376
|
|
|
|
|
377
|
|
|
Returns |
|
378
|
|
|
------- |
|
379
|
|
|
step : :class:`~theano.Variable` |
|
380
|
|
|
Theano variable for the step to take. |
|
381
|
|
|
updates : list |
|
382
|
|
|
A list of tuples representing updates to be performed. This |
|
383
|
|
|
is useful for stateful rules such as :class:`Momentum` which |
|
384
|
|
|
need to update shared variables after itetations. |
|
385
|
|
|
|
|
386
|
|
|
""" |
|
387
|
|
|
raise NotImplementedError |
|
388
|
|
|
|
|
389
|
|
|
def compute_steps(self, previous_steps): |
|
390
|
|
|
"""Build a Theano expression for steps for all parameters. |
|
391
|
|
|
|
|
392
|
|
|
Override this method if you want to process the steps |
|
393
|
|
|
with respect to all parameters as a whole, not parameter-wise. |
|
394
|
|
|
|
|
395
|
|
|
Parameters |
|
396
|
|
|
---------- |
|
397
|
|
|
previous_steps : OrderedDict |
|
398
|
|
|
An :class:`~OrderedDict` of |
|
399
|
|
|
(:class:`~tensor.TensorSharedVariable` |
|
400
|
|
|
:class:`~tensor.TensorVariable`) pairs. The keys are the |
|
401
|
|
|
parameters being trained, the values are the expressions for |
|
402
|
|
|
quantities related to gradients of the cost with respect to |
|
403
|
|
|
the parameters, either the gradients themselves or steps in |
|
404
|
|
|
related directions. |
|
405
|
|
|
|
|
406
|
|
|
Returns |
|
407
|
|
|
------- |
|
408
|
|
|
steps : OrderedDict |
|
409
|
|
|
A dictionary of the proposed steps in the same form as |
|
410
|
|
|
`previous_steps`. |
|
411
|
|
|
updates : list |
|
412
|
|
|
A list of tuples representing updates to be performed. |
|
413
|
|
|
|
|
414
|
|
|
""" |
|
415
|
|
|
parameter_wise = [self.compute_step(parameter, |
|
416
|
|
|
previous_steps[parameter]) |
|
417
|
|
|
for parameter in previous_steps] |
|
418
|
|
|
steps, updates = equizip(*parameter_wise) |
|
419
|
|
|
steps = OrderedDict((parameter, step) for parameter, step |
|
420
|
|
|
in equizip(previous_steps.keys(), steps)) |
|
421
|
|
|
updates = list(itertools.chain(*updates)) |
|
422
|
|
|
return steps, updates |
|
423
|
|
|
|
|
424
|
|
|
|
|
425
|
|
|
class CompositeRule(StepRule): |
|
426
|
|
|
"""Chains several step rules. |
|
427
|
|
|
|
|
428
|
|
|
Parameters |
|
429
|
|
|
---------- |
|
430
|
|
|
components : list of :class:`StepRule` |
|
431
|
|
|
The learning rules to be chained. The rules will be applied in the |
|
432
|
|
|
order as given. |
|
433
|
|
|
|
|
434
|
|
|
""" |
|
435
|
|
|
def __init__(self, components): |
|
436
|
|
|
self.components = components |
|
437
|
|
|
|
|
438
|
|
|
def compute_steps(self, previous_steps): |
|
439
|
|
|
steps = previous_steps |
|
440
|
|
|
updates = [] |
|
441
|
|
|
for rule in self.components: |
|
442
|
|
|
steps, more_updates = rule.compute_steps(steps) |
|
443
|
|
|
updates += more_updates |
|
444
|
|
|
return steps, updates |
|
445
|
|
|
|
|
446
|
|
|
|
|
447
|
|
|
class Scale(StepRule): |
|
448
|
|
|
"""A step in the direction proportional to the previous step. |
|
449
|
|
|
|
|
450
|
|
|
If used in :class:`GradientDescent` alone, this step rule implements |
|
451
|
|
|
steepest descent. |
|
452
|
|
|
|
|
453
|
|
|
Parameters |
|
454
|
|
|
---------- |
|
455
|
|
|
learning_rate : float |
|
456
|
|
|
The learning rate by which the previous step is multiplied to |
|
457
|
|
|
produce the step. |
|
458
|
|
|
|
|
459
|
|
|
Attributes |
|
460
|
|
|
---------- |
|
461
|
|
|
learning_rate : :class:`~tensor.TensorSharedVariable` |
|
462
|
|
|
The shared variable storing the learning rate used. |
|
463
|
|
|
|
|
464
|
|
|
""" |
|
465
|
|
|
def __init__(self, learning_rate=1.0): |
|
466
|
|
|
self.learning_rate = shared_floatx(learning_rate, "learning_rate") |
|
467
|
|
|
add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER) |
|
468
|
|
|
|
|
469
|
|
|
def compute_step(self, parameter, previous_step): |
|
470
|
|
|
return self.learning_rate * previous_step, [] |
|
471
|
|
|
|
|
472
|
|
|
|
|
473
|
|
|
class BasicMomentum(StepRule): |
|
474
|
|
|
"""Accumulates step with exponential discount. |
|
475
|
|
|
|
|
476
|
|
|
Parameters |
|
477
|
|
|
---------- |
|
478
|
|
|
momentum : float, optional |
|
479
|
|
|
The momentum coefficient. Defaults to 0. |
|
480
|
|
|
|
|
481
|
|
|
Notes |
|
482
|
|
|
----- |
|
483
|
|
|
This step rule is intended to be used in conjunction with another |
|
484
|
|
|
step rule, _e.g._ :class:`Scale`. For an all-batteries-included |
|
485
|
|
|
experience, look at :class:`Momentum`. |
|
486
|
|
|
|
|
487
|
|
|
""" |
|
488
|
|
|
def __init__(self, momentum=0.): |
|
489
|
|
|
self.momentum = shared_floatx(momentum, "momentum") |
|
490
|
|
|
add_role(self.momentum, ALGORITHM_HYPERPARAMETER) |
|
491
|
|
|
|
|
492
|
|
|
def compute_step(self, parameter, previous_step): |
|
493
|
|
|
velocity = _create_algorithm_buffer_for(parameter, "velocity") |
|
494
|
|
|
step = self.momentum * velocity + previous_step |
|
495
|
|
|
updates = [(velocity, step)] |
|
496
|
|
|
return step, updates |
|
497
|
|
|
|
|
498
|
|
|
|
|
499
|
|
|
class Momentum(CompositeRule): |
|
500
|
|
|
"""Accumulates step with exponential discount. |
|
501
|
|
|
|
|
502
|
|
|
Combines :class:`BasicMomentum` and :class:`Scale` to form the |
|
503
|
|
|
usual momentum step rule. |
|
504
|
|
|
|
|
505
|
|
|
Parameters |
|
506
|
|
|
---------- |
|
507
|
|
|
learning_rate : float, optional |
|
508
|
|
|
The learning rate by which the previous step scaled. Defaults to 1. |
|
509
|
|
|
momentum : float, optional |
|
510
|
|
|
The momentum coefficient. Defaults to 0. |
|
511
|
|
|
|
|
512
|
|
|
Attributes |
|
513
|
|
|
---------- |
|
514
|
|
|
learning_rate : :class:`~tensor.SharedVariable` |
|
515
|
|
|
A variable for learning rate. |
|
516
|
|
|
momentum : :class:`~tensor.SharedVariable` |
|
517
|
|
|
A variable for momentum. |
|
518
|
|
|
|
|
519
|
|
|
See Also |
|
520
|
|
|
-------- |
|
521
|
|
|
:class:`SharedVariableModifier` |
|
522
|
|
View Code Duplication |
|
|
|
|
|
|
|
523
|
|
|
""" |
|
524
|
|
|
def __init__(self, learning_rate=1.0, momentum=0.): |
|
525
|
|
|
scale = Scale(learning_rate=learning_rate) |
|
526
|
|
|
basic_momentum = BasicMomentum(momentum=momentum) |
|
527
|
|
|
self.learning_rate = scale.learning_rate |
|
528
|
|
|
self.momentum = basic_momentum.momentum |
|
529
|
|
|
self.components = [scale, basic_momentum] |
|
530
|
|
|
|
|
531
|
|
|
|
|
532
|
|
|
class AdaDelta(StepRule): |
|
533
|
|
|
"""Adapts the step size over time using only first order information. |
|
534
|
|
|
|
|
535
|
|
|
Parameters |
|
536
|
|
|
---------- |
|
537
|
|
|
decay_rate : float, optional |
|
538
|
|
|
Decay rate in [0, 1]. Defaults to 0.95. |
|
539
|
|
|
epsilon : float, optional |
|
540
|
|
|
Stabilizing constant for RMS. Defaults to 1e-6. |
|
541
|
|
|
|
|
542
|
|
|
Notes |
|
543
|
|
|
----- |
|
544
|
|
|
For more information, see [ADADELTA]_. |
|
545
|
|
|
|
|
546
|
|
|
.. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning |
|
547
|
|
|
Rate Method*, arXiv:1212.5701. |
|
548
|
|
|
|
|
549
|
|
|
""" |
|
550
|
|
|
def __init__(self, decay_rate=0.95, epsilon=1e-6): |
|
551
|
|
|
if not 0.0 <= decay_rate <= 1.0: |
|
552
|
|
|
raise ValueError("decay rate needs to be in [0, 1]") |
|
553
|
|
|
self.decay_rate = shared_floatx(decay_rate, "decay_rate") |
|
554
|
|
|
add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER) |
|
555
|
|
|
self.epsilon = shared_floatx(epsilon, "epsilon") |
|
556
|
|
|
add_role(self.epsilon, ALGORITHM_HYPERPARAMETER) |
|
557
|
|
|
|
|
558
|
|
|
def compute_step(self, parameter, previous_step): |
|
559
|
|
|
mean_square_step_tm1 = _create_algorithm_buffer_for( |
|
560
|
|
|
parameter, "mean_square_step_tm1") |
|
561
|
|
|
mean_square_delta_x_tm1 = _create_algorithm_buffer_for( |
|
562
|
|
|
parameter, "mean_square_delta_x_tm1") |
|
563
|
|
|
|
|
564
|
|
|
mean_square_step_t = ( |
|
565
|
|
|
self.decay_rate * mean_square_step_tm1 + |
|
566
|
|
|
(1 - self.decay_rate) * tensor.sqr(previous_step) |
|
567
|
|
|
) |
|
568
|
|
|
|
|
569
|
|
|
rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon) |
|
570
|
|
|
rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon) |
|
571
|
|
|
delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step |
|
572
|
|
|
|
|
573
|
|
|
mean_square_delta_x_t = ( |
|
574
|
|
|
self.decay_rate * mean_square_delta_x_tm1 + |
|
575
|
|
|
(1 - self.decay_rate) * tensor.sqr(delta_x_t) |
|
576
|
|
|
) |
|
577
|
|
|
|
|
578
|
|
|
step = delta_x_t |
|
579
|
|
|
updates = [(mean_square_step_tm1, mean_square_step_t), |
|
580
|
|
|
(mean_square_delta_x_tm1, mean_square_delta_x_t)] |
|
581
|
|
|
return step, updates |
|
582
|
|
|
|
|
583
|
|
|
|
|
584
|
|
|
class BasicRMSProp(StepRule): |
|
585
|
|
|
"""Scales the step size by a running average of the recent step norms. |
|
586
|
|
|
|
|
587
|
|
|
Parameters |
|
588
|
|
|
---------- |
|
589
|
|
|
decay_rate : float, optional |
|
590
|
|
|
How fast the running average decays, value in [0, 1] |
|
591
|
|
|
(lower is faster). Defaults to 0.9. |
|
592
|
|
|
max_scaling : float, optional |
|
593
|
|
|
Maximum scaling of the step size, in case the running average is |
|
594
|
|
|
really small. Needs to be greater than 0. Defaults to 1e5. |
|
595
|
|
|
|
|
596
|
|
|
Notes |
|
597
|
|
|
----- |
|
598
|
|
|
This step rule is intended to be used in conjunction with another |
|
599
|
|
|
step rule, _e.g._ :class:`Scale`. For an all-batteries-included |
|
600
|
|
|
experience, look at :class:`RMSProp`. |
|
601
|
|
|
|
|
602
|
|
|
In general, this step rule should be used _before_ other step rules, |
|
603
|
|
|
because it has normalization properties that may undo their work. |
|
604
|
|
|
For instance, it should be applied first when used in conjunction |
|
605
|
|
|
with :class:`Scale`. |
|
606
|
|
|
|
|
607
|
|
|
For more information, see [Hint2014]_. |
|
608
|
|
|
|
|
609
|
|
|
""" |
|
610
|
|
|
def __init__(self, decay_rate=0.9, max_scaling=1e5): |
|
611
|
|
|
if not 0.0 <= decay_rate <= 1.0: |
|
612
|
|
|
raise ValueError("decay rate needs to be in [0, 1]") |
|
613
|
|
|
if max_scaling <= 0: |
|
614
|
|
|
raise ValueError("max. scaling needs to be greater than 0") |
|
615
|
|
|
self.decay_rate = shared_floatx(decay_rate, "decay_rate") |
|
616
|
|
|
add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER) |
|
617
|
|
|
self.epsilon = 1. / max_scaling |
|
618
|
|
|
|
|
619
|
|
|
def compute_step(self, parameter, previous_step): |
|
620
|
|
|
mean_square_step_tm1 = _create_algorithm_buffer_for( |
|
621
|
|
|
parameter, "mean_square_step_tm1") |
|
622
|
|
|
mean_square_step_t = ( |
|
623
|
|
|
self.decay_rate * mean_square_step_tm1 + |
|
624
|
|
|
(1 - self.decay_rate) * tensor.sqr(previous_step)) |
|
625
|
|
|
rms_step_t = tensor.maximum( |
|
626
|
|
|
tensor.sqrt(mean_square_step_t), self.epsilon) |
|
627
|
|
|
step = previous_step / rms_step_t |
|
628
|
|
|
updates = [(mean_square_step_tm1, mean_square_step_t)] |
|
629
|
|
|
return step, updates |
|
630
|
|
|
|
|
631
|
|
|
|
|
632
|
|
|
class RMSProp(CompositeRule): |
|
633
|
|
|
"""Scales the step size by a running average of the recent step norms. |
|
634
|
|
|
|
|
635
|
|
|
Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule |
|
636
|
|
|
described in [Hint2014]_. |
|
637
|
|
|
|
|
638
|
|
|
.. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*, |
|
639
|
|
|
lecture 6a, |
|
640
|
|
|
http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf |
|
641
|
|
|
|
|
642
|
|
|
Parameters |
|
643
|
|
|
---------- |
|
644
|
|
|
learning_rate : float, optional |
|
645
|
|
|
The learning rate by which the previous step scaled. Defaults to 1. |
|
646
|
|
|
decay_rate : float, optional |
|
647
|
|
|
How fast the running average decays (lower is faster). |
|
648
|
|
|
Defaults to 0.9. |
|
649
|
|
|
max_scaling : float, optional |
|
650
|
|
|
Maximum scaling of the step size, in case the running average is |
|
651
|
|
|
really small. Defaults to 1e5. |
|
652
|
|
|
|
|
653
|
|
|
Attributes |
|
654
|
|
|
---------- |
|
655
|
|
|
learning_rate : :class:`~tensor.SharedVariable` |
|
656
|
|
|
A variable for learning rate. |
|
657
|
|
|
decay_rate : :class:`~tensor.SharedVariable` |
|
658
|
|
|
A variable for decay rate. |
|
659
|
|
|
|
|
660
|
|
|
See Also |
|
661
|
|
|
-------- |
|
662
|
|
|
:class:`SharedVariableModifier` |
|
663
|
|
|
|
|
664
|
|
|
""" |
|
665
|
|
|
def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5): |
|
666
|
|
|
basic_rms_prop = BasicRMSProp(decay_rate=decay_rate, |
|
667
|
|
|
max_scaling=max_scaling) |
|
668
|
|
|
scale = Scale(learning_rate=learning_rate) |
|
669
|
|
|
self.learning_rate = scale.learning_rate |
|
670
|
|
|
self.decay_rate = basic_rms_prop.decay_rate |
|
671
|
|
|
self.components = [basic_rms_prop, scale] |
|
672
|
|
|
|
|
673
|
|
|
|
|
674
|
|
|
class StepClipping(StepRule): |
|
675
|
|
|
"""Rescales an entire step if its L2 norm exceeds a threshold. |
|
676
|
|
|
|
|
677
|
|
|
When the previous steps are the gradients, this step rule performs |
|
678
|
|
|
gradient clipping. |
|
679
|
|
|
|
|
680
|
|
|
Parameters |
|
681
|
|
|
---------- |
|
682
|
|
|
threshold : float, optional |
|
683
|
|
|
The maximum permitted L2 norm for the step. The step |
|
684
|
|
|
will be rescaled to be not higher than this quanity. |
|
685
|
|
|
If ``None``, no rescaling will be applied. |
|
686
|
|
|
|
|
687
|
|
|
Attributes |
|
688
|
|
|
---------- |
|
689
|
|
|
threshold : :class:`.tensor.TensorSharedVariable` |
|
690
|
|
|
The shared variable storing the clipping threshold used. |
|
691
|
|
|
|
|
692
|
|
|
""" |
|
693
|
|
|
def __init__(self, threshold=None): |
|
694
|
|
|
if threshold: |
|
695
|
|
|
self.threshold = shared_floatx(threshold, "threshold") |
|
696
|
|
|
add_role(self.threshold, ALGORITHM_HYPERPARAMETER) |
|
697
|
|
|
|
|
698
|
|
|
def compute_steps(self, previous_steps): |
|
699
|
|
|
if not hasattr(self, 'threshold'): |
|
700
|
|
|
return previous_steps |
|
701
|
|
|
norm = l2_norm(previous_steps.values()) |
|
702
|
|
|
multiplier = tensor.switch(norm < self.threshold, |
|
703
|
|
|
1, self.threshold / norm) |
|
704
|
|
|
steps = OrderedDict( |
|
705
|
|
|
(parameter, step * multiplier) |
|
706
|
|
|
for parameter, step in previous_steps.items()) |
|
707
|
|
|
return steps, [] |
|
708
|
|
|
|
|
709
|
|
|
|
|
710
|
|
|
class VariableClipping(StepRule): |
|
711
|
|
|
"""Clip the maximum norm of individual variables along certain axes. |
|
712
|
|
|
|
|
713
|
|
|
This :class:`StepRule` can be used to implement L2 norm constraints on |
|
714
|
|
|
e.g. the weight vectors of individual hidden units, convolutional |
|
715
|
|
|
filters or entire weight tensors. Combine with :class:`Restrict` |
|
716
|
|
|
(and possibly :class:`CompositeRule`), to apply such constraints only |
|
717
|
|
|
to certain variables and/or apply different norm constraints to |
|
718
|
|
|
different variables. |
|
719
|
|
|
|
|
720
|
|
|
Parameters |
|
721
|
|
|
---------- |
|
722
|
|
|
threshold : float |
|
723
|
|
|
Maximum norm for a given (portion of a) tensor. |
|
724
|
|
|
axis : int or iterable, optional |
|
725
|
|
View Code Duplication |
An integer single axis, or an iterable collection of integer |
|
|
|
|
|
|
726
|
|
|
axes over which to sum in order to calculate the L2 norm. If |
|
727
|
|
|
`None` (the default), the norm is computed over all elements |
|
728
|
|
|
of the tensor. |
|
729
|
|
|
|
|
730
|
|
|
Notes |
|
731
|
|
|
----- |
|
732
|
|
|
Because of the way the :class:`StepRule` API works, this particular |
|
733
|
|
|
rule implements norm clipping of the value *after* update in the |
|
734
|
|
|
following way: it computes ``parameter - previous_step``, scales it |
|
735
|
|
|
to have (possibly axes-wise) norm(s) of at most `threshold`, |
|
736
|
|
|
then subtracts *that* value from `parameter` to yield an 'equivalent |
|
737
|
|
|
step' that respects the desired norm constraints. This procedure |
|
738
|
|
|
implicitly assumes one is doing simple (stochastic) gradient descent, |
|
739
|
|
|
and so steps computed by this step rule may not make sense for use |
|
740
|
|
|
in other contexts. |
|
741
|
|
|
|
|
742
|
|
|
Investigations into max-norm regularization date from [Srebro2005]_. |
|
743
|
|
|
The first appearance of this technique as a regularization method |
|
744
|
|
|
for the weight vectors of individual hidden units in feed-forward |
|
745
|
|
|
neural networks may be [Hinton2012]_. |
|
746
|
|
|
|
|
747
|
|
|
.. [Srebro2005] Nathan Srebro and Adi Shraibman. |
|
748
|
|
|
"Rank, Trace-Norm and Max-Norm". *18th Annual Conference |
|
749
|
|
|
on Learning Theory (COLT)*, June 2005. |
|
750
|
|
|
|
|
751
|
|
|
.. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava, |
|
752
|
|
|
Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov. |
|
753
|
|
|
"Improving neural networks by preventing co-adaptation of |
|
754
|
|
|
feature detectors". arXiv:1207.0580. |
|
755
|
|
|
|
|
756
|
|
|
""" |
|
757
|
|
|
def __init__(self, threshold, axis=None): |
|
758
|
|
|
axis = pack(axis) if axis is not None else () |
|
759
|
|
|
self.axis = set(axis) |
|
760
|
|
|
self.threshold = shared_floatx(threshold, "threshold") |
|
761
|
|
|
add_role(self.threshold, ALGORITHM_HYPERPARAMETER) |
|
762
|
|
|
if len(axis) != len(self.axis): |
|
763
|
|
|
raise ValueError("axis must be unique") |
|
764
|
|
|
|
|
765
|
|
|
def compute_step(self, parameter, previous_step): |
|
766
|
|
|
if any(ax >= previous_step.ndim for ax in self.axis): |
|
767
|
|
|
raise ValueError("Invalid axis {} for {}, ndim={}".format( |
|
768
|
|
|
self.axis, parameter, previous_step.ndim)) |
|
769
|
|
|
if len(self.axis) == 0: |
|
770
|
|
|
norms = l2_norm([parameter - previous_step]) |
|
771
|
|
|
else: |
|
772
|
|
|
squares = tensor.sqr(parameter - previous_step) |
|
773
|
|
|
norms = tensor.sqrt( |
|
774
|
|
|
reduce(lambda t, a: t.sum(axis=a, keepdims=True), |
|
775
|
|
|
sorted(self.axis), squares)) |
|
776
|
|
|
# We want a step s* that is the same as scaling |
|
777
|
|
|
# (parameter - previous_step) by threshold / norm |
|
778
|
|
|
# when threshold < norm. |
|
779
|
|
|
shrinking_step = (parameter - |
|
780
|
|
|
(self.threshold / norms) * |
|
781
|
|
|
(parameter - previous_step)) |
|
782
|
|
|
return tensor.switch(norms > self.threshold, |
|
783
|
|
|
shrinking_step, |
|
784
|
|
|
previous_step), () |
|
785
|
|
|
|
|
786
|
|
|
|
|
787
|
|
|
class AdaGrad(StepRule): |
|
788
|
|
|
"""Implements the AdaGrad learning rule. |
|
789
|
|
|
|
|
790
|
|
|
Parameters |
|
791
|
|
|
---------- |
|
792
|
|
|
learning_rate : float, optional |
|
793
|
|
|
Step size. |
|
794
|
|
|
Default value is set to 0.0002. |
|
795
|
|
|
epsilon : float, optional |
|
796
|
|
|
Stabilizing constant for one over root of sum of squares. |
|
797
|
|
|
Defaults to 1e-6. |
|
798
|
|
|
|
|
799
|
|
|
Notes |
|
800
|
|
|
----- |
|
801
|
|
|
For more information, see [ADAGRAD]_. |
|
802
|
|
|
|
|
803
|
|
|
.. [ADADGRAD] Duchi J, Hazan E, Singer Y., |
|
804
|
|
|
*Adaptive subgradient methods for online learning and |
|
805
|
|
|
stochastic optimization*, |
|
806
|
|
|
http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf |
|
807
|
|
|
|
|
808
|
|
|
""" |
|
809
|
|
|
def __init__(self, learning_rate=0.002, epsilon=1e-6): |
|
810
|
|
|
self.learning_rate = shared_floatx(learning_rate, "learning_rate") |
|
811
|
|
|
self.epsilon = shared_floatx(epsilon, "epsilon") |
|
812
|
|
|
add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER) |
|
813
|
|
|
add_role(self.epsilon, ALGORITHM_HYPERPARAMETER) |
|
814
|
|
|
|
|
815
|
|
|
def compute_step(self, parameter, previous_step): |
|
816
|
|
|
name = 'adagrad_sqs' |
|
817
|
|
|
if parameter.name: |
|
818
|
|
|
name += '_' + parameter.name |
|
819
|
|
|
ssq = _create_algorithm_buffer_for(parameter, name=name) |
|
820
|
|
|
|
|
821
|
|
|
ssq_t = (tensor.sqr(previous_step) + ssq) |
|
822
|
|
|
step = (self.learning_rate * previous_step / |
|
823
|
|
|
(tensor.sqrt(ssq_t) + self.epsilon)) |
|
824
|
|
|
|
|
825
|
|
|
updates = [(ssq, ssq_t)] |
|
826
|
|
|
|
|
827
|
|
|
return step, updates |
|
828
|
|
|
|
|
829
|
|
|
|
|
830
|
|
|
class Adam(StepRule): |
|
831
|
|
|
"""Adam optimizer as described in [King2014]_. |
|
832
|
|
|
|
|
833
|
|
|
.. [King2014] Diederik Kingma, Jimmy Ba, |
|
834
|
|
|
*Adam: A Method for Stochastic Optimization*, |
|
835
|
|
|
http://arxiv.org/abs/1412.6980 |
|
836
|
|
|
|
|
837
|
|
|
Parameters |
|
838
|
|
|
---------- |
|
839
|
|
|
learning_rate : float, optional |
|
840
|
|
|
Step size. |
|
841
|
|
|
Default value is set to 0.002. |
|
842
|
|
|
beta1 : float, optional |
|
843
|
|
|
Exponential decay rate for the first moment estimates. |
|
844
|
|
|
Default value is set to 0.1. |
|
845
|
|
|
beta2 : float, optional |
|
846
|
|
|
Exponential decay rate for the second moment estimates. |
|
847
|
|
|
Default value is set to 0.001. |
|
848
|
|
|
epsilon : float, optional |
|
849
|
|
|
Default value is set to 1e-8. |
|
850
|
|
|
decay_factor : float, optional |
|
851
|
|
|
Default value is set to 1 - 1e-8. |
|
852
|
|
|
|
|
853
|
|
|
""" |
|
854
|
|
|
def __init__(self, learning_rate=0.002, |
|
855
|
|
|
beta1=0.1, beta2=0.001, epsilon=1e-8, |
|
856
|
|
|
decay_factor=(1 - 1e-8)): |
|
857
|
|
|
self.learning_rate = shared_floatx(learning_rate, "learning_rate") |
|
858
|
|
|
self.beta1 = shared_floatx(beta1, "beta1") |
|
859
|
|
|
self.beta2 = shared_floatx(beta2, "beta2") |
|
860
|
|
|
self.epsilon = shared_floatx(epsilon, "epsilon") |
|
861
|
|
|
self.decay_factor = shared_floatx(decay_factor, "decay_factor") |
|
862
|
|
|
for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon, |
|
863
|
|
|
self.decay_factor]: |
|
864
|
|
|
add_role(param, ALGORITHM_HYPERPARAMETER) |
|
865
|
|
|
|
|
866
|
|
|
def compute_step(self, parameter, previous_step): |
|
867
|
|
|
mean = _create_algorithm_buffer_for(parameter, 'mean') |
|
868
|
|
|
variance = _create_algorithm_buffer_for(parameter, 'variance') |
|
869
|
|
|
time = shared_floatx(0., 'time') |
|
870
|
|
|
add_role(time, ALGORITHM_BUFFER) |
|
871
|
|
|
|
|
872
|
|
|
t1 = time + 1 |
|
873
|
|
|
learning_rate = (self.learning_rate * |
|
874
|
|
|
tensor.sqrt((1. - (1. - self.beta2)**t1)) / |
|
875
|
|
|
(1. - (1. - self.beta1)**t1)) |
|
876
|
|
|
beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1) |
|
877
|
|
|
mean_t = beta_1t * previous_step + (1. - beta_1t) * mean |
|
878
|
|
|
variance_t = (self.beta2 * tensor.sqr(previous_step) + |
|
879
|
|
|
(1. - self.beta2) * variance) |
|
880
|
|
|
step = (learning_rate * mean_t / |
|
881
|
|
|
(tensor.sqrt(variance_t) + self.epsilon)) |
|
882
|
|
|
|
|
883
|
|
|
updates = [(mean, mean_t), |
|
884
|
|
|
(variance, variance_t), |
|
885
|
|
|
(time, t1)] |
|
886
|
|
|
|
|
887
|
|
|
return step, updates |
|
888
|
|
|
|
|
889
|
|
|
|
|
890
|
|
|
class RemoveNotFinite(StepRule): |
|
891
|
|
|
"""A step rule that skips steps with non-finite elements. |
|
892
|
|
|
|
|
893
|
|
|
Replaces a step (the parameter update of a single shared variable) |
|
894
|
|
|
which contains non-finite elements (such as ``inf`` or ``NaN``) with a |
|
895
|
|
|
step rescaling the parameters. |
|
896
|
|
|
|
|
897
|
|
|
Parameters |
|
898
|
|
|
---------- |
|
899
|
|
|
scaler : float, optional |
|
900
|
|
|
The scaling applied to the parameter in case the step contains |
|
901
|
|
|
non-finite elements. Defaults to 1, which means that parameters |
|
902
|
|
|
will not be changed. |
|
903
|
|
|
|
|
904
|
|
|
Notes |
|
905
|
|
|
----- |
|
906
|
|
|
This rule should be applied last! |
|
907
|
|
|
|
|
908
|
|
|
This trick was originally used in the GroundHog_ framework. |
|
909
|
|
|
|
|
910
|
|
|
.. _GroundHog: https://github.com/lisa-groundhog/GroundHog |
|
911
|
|
|
|
|
912
|
|
|
""" |
|
913
|
|
|
def __init__(self, scaler=1): |
|
914
|
|
|
self.scaler = scaler |
|
915
|
|
|
|
|
916
|
|
|
def compute_step(self, parameter, previous_step): |
|
917
|
|
|
step_sum = tensor.sum(previous_step) |
|
918
|
|
|
not_finite = (tensor.isnan(step_sum) + |
|
919
|
|
|
tensor.isinf(step_sum)) |
|
920
|
|
|
step = tensor.switch( |
|
921
|
|
|
not_finite > 0, (1 - self.scaler) * parameter, previous_step) |
|
922
|
|
|
return step, [] |
|
923
|
|
|
|
|
924
|
|
|
|
|
925
|
|
|
class Restrict(StepRule): |
|
926
|
|
|
"""Applies a given :class:`StepRule` only to certain variables. |
|
927
|
|
|
|
|
928
|
|
|
Example applications include clipping steps on only certain parameters, |
|
929
|
|
|
or scaling a certain kind of parameter's updates (e.g. adding an |
|
930
|
|
|
additional scalar multiplier to the steps taken on convolutional |
|
931
|
|
|
filters). |
|
932
|
|
|
|
|
933
|
|
|
Parameters |
|
934
|
|
|
---------- |
|
935
|
|
|
step_rule : :class:`StepRule` |
|
936
|
|
|
The :class:`StepRule` to be applied on the given variables. |
|
937
|
|
|
variables : iterable |
|
938
|
|
|
A collection of Theano variables on which to apply `step_rule`. |
|
939
|
|
|
Variables not appearing in this collection will not have |
|
940
|
|
|
`step_rule` applied to them. |
|
941
|
|
|
|
|
942
|
|
|
""" |
|
943
|
|
|
def __init__(self, step_rule, variables): |
|
944
|
|
|
self.step_rule = step_rule |
|
945
|
|
|
self.variables = frozenset(variables) |
|
946
|
|
|
|
|
947
|
|
|
def compute_steps(self, previous_steps): |
|
948
|
|
|
filtered_previous_steps = dict_subset(previous_steps, self.variables) |
|
949
|
|
|
steps, updates = self.step_rule.compute_steps(filtered_previous_steps) |
|
950
|
|
|
actual = OrderedDict((parameter, steps[parameter]) |
|
951
|
|
|
if parameter in steps |
|
952
|
|
|
else (parameter, previous_steps[parameter]) |
|
953
|
|
|
for parameter in previous_steps) |
|
954
|
|
|
return actual, updates |
|
955
|
|
|
|
Escape sequences in Python are generally interpreted according to rules similar to standard C. Only if strings are prefixed with
rorRare they interpreted as regular expressions.The escape sequence that was used indicates that you might have intended to write a regular expression.
Learn more about the available escape sequences. in the Python documentation.