|
1
|
|
|
"""Training algorithms.""" |
|
2
|
|
|
import logging |
|
3
|
|
|
import itertools |
|
4
|
|
|
from abc import ABCMeta, abstractmethod |
|
5
|
|
|
from collections import OrderedDict |
|
6
|
|
|
from collections import Mapping |
|
7
|
|
|
from six.moves import reduce |
|
8
|
|
|
|
|
9
|
|
|
from picklable_itertools.extras import equizip |
|
10
|
|
|
|
|
11
|
|
|
import theano |
|
12
|
|
|
from six import add_metaclass |
|
13
|
|
|
from theano import tensor |
|
14
|
|
|
|
|
15
|
|
|
from blocks.graph import ComputationGraph |
|
16
|
|
|
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER |
|
17
|
|
|
from blocks.theano_expressions import l2_norm |
|
18
|
|
|
from blocks.utils import (dict_subset, pack, shared_floatx, |
|
19
|
|
|
shared_floatx_zeros_matching) |
|
20
|
|
|
|
|
21
|
|
|
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)
|
22
|
|
|
|
|
23
|
|
|
|
|
24
|
|
|
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Allocate a zero-initialized shared buffer shaped like ``param``.

    The buffer is tagged with the parameter it belongs to and given the
    ALGORITHM_BUFFER role so it can later be located in the graph.
    Extra positional/keyword arguments are forwarded to
    :func:`shared_floatx_zeros_matching` (e.g. a buffer name).
    """
    buffer_ = shared_floatx_zeros_matching(param, *args, **kwargs)
    buffer_.tag.for_parameter = param
    add_role(buffer_, ALGORITHM_BUFFER)
    return buffer_
|
29
|
|
|
|
|
30
|
|
|
|
|
31
|
|
|
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Base class for training algorithms.

    A training algorithm object has a simple life-cycle.
    First it is initialized by calling its :meth:`initialize` method.
    At this stage, for instance, Theano functions can be compiled.
    After that the :meth:`process_batch` method is repeatedly
    called with a batch of training data as a parameter.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""
        pass

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
        pass
|
58
|
|
|
|
|
59
|
|
|
|
|
60
|
|
|
# Message used when batch sources only partially match the input variables
# of the update graph (see UpdatesAlgorithm._validate_source_names).
variable_mismatch_error = """

Blocks tried to match the sources ({sources}) of the training dataset to \
the names of the Theano variables ({variables}), but failed to do so. \
If you want to train on a subset of the sources that your dataset provides, \
pass the `sources` keyword argument to its constructor, use the \
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
or on_unused_sources='ignore' to the GradientDescent algorithm."""
|
68
|
|
|
|
|
69
|
|
|
# Message used when some graph input variable has no matching batch source
# (see UpdatesAlgorithm._validate_source_names).
source_missing_error = """

Blocks didn't find all the sources ({sources}) of the training dataset \
that match the names of the Theano variables ({variables})."""
|
73
|
|
|
|
|
74
|
|
|
|
|
75
|
|
|
# Message used when the parameter order cannot be inferred reproducibly
# (plain dict of gradients with no explicit parameter list).
determinism_error = """Cannot infer parameter list in a fixed order.

Because dictionaries are unordered (and Python uses randomized hashing, \
which can change the iteration order over the same dictionary from one \
interpreter session to the next), Blocks cannot infer the parameters list \
from a plain dictionary of gradients in an order that is reproducible \
across interpreter sessions; please either specify the parameters \
explicitly or pass gradients as an OrderedDict (though exercise care in \
constructing that OrderedDict, as an OrderedDict created by iterating \
over an unordered iterable (e.g. a dict) will still have an arbitrary \
and unpredictable order that could cause problems with \
reproducibility)."""
|
87
|
|
|
|
|
88
|
|
|
|
|
89
|
|
|
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources in a batch are used
        (i.e. there is no variable with a matching name in the inputs
        of the computational graph of the updates).

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 on_unused_sources='raise', **kwargs):
        self.updates = [] if updates is None else updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        self.on_unused_sources = on_unused_sources
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Infer the graph inputs and compile the training function."""
        logger.info("Initializing the training algorithm")
        update_values = [new_value for _, new_value in self.updates]
        logger.debug("Inferring graph inputs...")
        self.inputs = ComputationGraph(update_values).inputs
        logger.debug("Compiling training function...")
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    @property
    def updates(self):
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an `OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            # Fix: previously raised a bare ValueError with no message,
            # which gave the caller no hint about what was wrong.
            raise ValueError("updates must be a list of (variable, "
                             "expression) pairs or an OrderedDict, got "
                             "{}".format(type(updates)))
        self.updates.extend(updates)

    def _validate_source_names(self, batch):
        # Check that the batch sources cover all graph inputs, and apply
        # the `on_unused_sources` policy when there are extra sources.
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                if not hasattr(self, '_unused_source_warned'):
                    # Fix: logger.warn is a deprecated alias of
                    # logger.warning.
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                    # Warn only once per algorithm instance.
                    self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Run the compiled function on one batch of training data."""
        self._validate_source_names(batch)
        # Order the batch data to match the order of the graph inputs.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
|
191
|
|
|
|
|
192
|
|
|
|
|
193
|
|
|
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block:: python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized. Unused if `gradients` is specified.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients` (in which case `gradients` *must* be an
        `OrderedDict`).
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps. Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : OrderedDict or list of 2-tuples, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter, or equivalently, a list of
        (parameter, gradient) tuples. If ``None``, the gradients
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument. A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    If a cost and parameters are provided, gradients are taken immediately
    upon construction, and changes to these attributes after construction
    will have no effect.

    `gradients` must be an `OrderedDict` if `parameters` is unspecified
    because ordinary dictionaries have an unpredictable iteration
    order due to hash randomization (which is enabled by default since
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
    when combined with Theano's heuristic graph optimizations, can cause
    serious reproducibility issues.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
        # as we don't want to convert dict -> OrderedDict and give it
        # an arbitrary, non-deterministic order.
        if gradients is not None and not isinstance(gradients, Mapping):
            gradients = OrderedDict(gradients)
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                # Fix: logger.warn is a deprecated alias of logger.warning.
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    logger.warning("Specified parameters list does not "
                                   "match keys in provided gradient "
                                   "dictionary; using parameters inferred "
                                   "from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # Fix (comment was wrong): self.parameters is None, and
                # gradients isn't an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-commutative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        kwargs.setdefault('updates', []).extend(
            itertools.chain(((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )
        super(GradientDescent, self).__init__(**kwargs)

    def _compute_gradients(self, known_grads, consider_constant):
        # Derive the gradient dictionary from the cost; requires both
        # the cost and a non-empty parameter list.
        if self.cost is None:
            raise ValueError("can't infer gradients; no cost specified")
        elif self.parameters is None or len(self.parameters) == 0:
            raise ValueError("can't infer gradients; no parameters "
                             "specified")
        # While this strictly speaking could be a dict and not an
        # OrderedDict (because we iterate over it in the order of
        # self.parameters), this guards a little bit against
        # nondeterminism introduced by future refactoring.
        logger.info("Taking the cost gradient")
        gradients = OrderedDict(
            equizip(self.parameters, tensor.grad(
                self.cost, self.parameters,
                known_grads=known_grads,
                consider_constant=consider_constant)))
        logger.info("The cost gradient computation graph is built")
        return gradients
|
362
|
|
|
|
|
363
|
|
|
|
|
364
|
|
|
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # Apply compute_step to each parameter, then unzip the resulting
        # (step, updates) pairs; equizip checks the lengths agree.
        parameter_wise = [self.compute_step(parameter,
                                            previous_steps[parameter])
                          for parameter in previous_steps]
        steps, updates = equizip(*parameter_wise)
        steps = OrderedDict((parameter, step) for parameter, step
                            in equizip(previous_steps.keys(), steps))
        # Flatten the per-parameter update lists into a single list.
        updates = list(itertools.chain(*updates))
        return steps, updates
|
428
|
|
|
|
|
429
|
|
|
|
|
430
|
|
|
class CompositeRule(StepRule):
    """Apply several step rules one after another.

    Each component receives the steps produced by the previous rule,
    so the rules compose in the order they are listed.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        # Thread the steps through every component, collecting the
        # updates each rule contributes along the way.
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            current_steps, extra_updates = component.compute_steps(
                current_steps)
            collected_updates.extend(extra_updates)
        return current_steps, collected_updates
|
450
|
|
|
|
|
451
|
|
|
|
|
452
|
|
|
class Scale(StepRule):
    """Scale the previous step by a constant learning rate.

    Used on its own inside :class:`GradientDescent`, this rule yields
    plain steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Stateless rule: simply rescale the incoming step; no updates.
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
|
476
|
|
|
|
|
477
|
|
|
|
|
478
|
|
|
class BasicMomentum(StepRule):
    """Accumulate steps in an exponentially discounted running sum.

    Parameters
    ----------
    momentum : float, optional
        The momentum (discount) coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        self.momentum = shared_floatx(momentum, "momentum")
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # The velocity buffer holds the discounted sum of past steps.
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        new_velocity = self.momentum * velocity + previous_step
        return new_velocity, [(velocity, new_velocity)]
|
502
|
|
|
|
|
503
|
|
|
|
|
504
|
|
|
class Momentum(CompositeRule):
    """The usual momentum step rule.

    Chains :class:`Scale` followed by :class:`BasicMomentum`, i.e. the
    step is first scaled by the learning rate and then accumulated with
    exponential discount.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled.
        Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # Expose the component hyperparameters on this object so they
        # can be read or modified during training.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
        self.components = [scaling_rule, momentum_rule]
|
535
|
|
|
|
|
536
|
|
|
|
|
537
|
|
|
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Per-parameter state: running averages of the squared incoming
        # steps and of the squared produced deltas ("tm1" = previous
        # iteration's value).
        sq_step_avg = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        sq_delta_avg = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Exponentially decayed average of squared steps.
        new_sq_step_avg = (
            self.decay_rate * sq_step_avg +
            (1 - self.decay_rate) * tensor.sqr(previous_step)
        )

        # Scale the step by the ratio of the two RMS values
        # (epsilon stabilizes both the numerator and the denominator).
        rms_delta = tensor.sqrt(sq_delta_avg + self.epsilon)
        rms_step = tensor.sqrt(new_sq_step_avg + self.epsilon)
        delta = rms_delta / rms_step * previous_step

        # Exponentially decayed average of the squared produced deltas.
        new_sq_delta_avg = (
            self.decay_rate * sq_delta_avg +
            (1 - self.decay_rate) * tensor.sqr(delta)
        )

        return delta, [(sq_step_avg, new_sq_step_avg),
                       (sq_delta_avg, new_sq_delta_avg)]
|
587
|
|
|
|
|
588
|
|
|
|
|
589
|
|
|
class BasicRMSProp(StepRule):
    """Divide each step by a running RMS of the recent step magnitudes.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster). Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Lower bound on the RMS denominator, i.e. 1/max_scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        # Running average of the squared steps for this parameter.
        running_avg = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        new_avg = (
            self.decay_rate * running_avg +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        # Clamp the denominator from below so the scaling never exceeds
        # max_scaling.
        denominator = tensor.maximum(tensor.sqrt(new_avg), self.epsilon)
        return previous_step / denominator, [(running_avg, new_avg)]
|
635
|
|
|
|
|
636
|
|
|
|
|
637
|
|
|
class RMSProp(CompositeRule):
    """The RMSProp step rule.

    Chains :class:`BasicRMSProp` followed by :class:`Scale` to form the
    step rule described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled.
        Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalization_rule = BasicRMSProp(decay_rate=decay_rate,
                                          max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # Expose the component hyperparameters on this object.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalization_rule.decay_rate
        # RMS normalization must come first (see BasicRMSProp notes).
        self.components = [normalization_rule, scaling_rule]
|
677
|
|
|
|
|
678
|
|
|
|
|
679
|
|
|
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used.

    """
    def __init__(self, threshold=None):
        if threshold is not None:
            threshold = shared_floatx(threshold, "threshold")
            add_role(threshold, ALGORITHM_HYPERPARAMETER)
        self.threshold = threshold

    def compute_steps(self, previous_steps):
        # Without a threshold this rule is the identity.
        if self.threshold is None:
            return previous_steps, []
        # Joint L2 norm over all steps; if it exceeds the threshold,
        # every step is scaled down by the same factor.
        norm = l2_norm(previous_steps.values())
        scale_factor = tensor.switch(norm < self.threshold,
                                     1, self.threshold / norm)
        clipped = OrderedDict(
            (param, step * scale_factor)
            for param, step in previous_steps.items())
        return clipped, []
|
715
|
|
|
|
|
716
|
|
|
|
|
717
|
|
|
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Raises
    ------
    ValueError
        If `axis` contains duplicate entries.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axis = pack(axis) if axis is not None else ()
        self.axis = set(axis)
        # Validate *before* allocating the shared variable so that a bad
        # `axis` argument does not leave behind a dangling shared
        # variable already tagged as an algorithm hyperparameter.
        if len(axis) != len(self.axis):
            raise ValueError("axis must be unique")
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        if any(ax >= previous_step.ndim for ax in self.axis):
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
                self.axis, parameter, previous_step.ndim))
        if len(self.axis) == 0:
            # No axes given: one norm over the whole tensor.
            norms = l2_norm([parameter - previous_step])
        else:
            # Axes-wise norms; keepdims=True so the result broadcasts
            # against the parameter in the expressions below.
            squares = tensor.sqr(parameter - previous_step)
            norms = tensor.sqrt(
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
                       sorted(self.axis), squares))
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) *
                          (parameter - previous_step))
        # Return an empty list (not a tuple) of updates, for consistency
        # with the other step rules in this module.
        return tensor.switch(norms > self.threshold,
                             shrinking_step,
                             previous_step), []
|
792
|
|
|
|
|
793
|
|
|
|
|
794
|
|
|
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # One per-parameter accumulator of squared steps; include the
        # parameter's name in the buffer name (when it has one) to ease
        # inspection of the graph.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # Accumulate the squared step, then scale the step by one over
        # the root of the accumulated sum (epsilon stabilizes the
        # division when the accumulator is near zero).
        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
|
835
|
|
|
|
|
836
|
|
|
|
|
837
|
|
|
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.9.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.999.
    epsilon : float, optional
        Stabilizing constant for the division by the root of the second
        moment estimate. Default value is set to 1e-8.
    decay_factor : float, optional
        Multiplicative per-timestep decay applied to ``beta1``
        (1, the default, disables the decay).

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.9, beta2=0.999, epsilon=1e-8,
                 decay_factor=1):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.beta1 = shared_floatx(beta1, "beta1")
        self.beta2 = shared_floatx(beta2, "beta2")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
        for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon,
                      self.decay_factor]:
            add_role(param, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Per-parameter state: running estimates of the first and second
        # moments of the step, plus a scalar step counter.
        mean = _create_algorithm_buffer_for(parameter, 'mean')
        variance = _create_algorithm_buffer_for(parameter, 'variance')
        time = shared_floatx(0., 'time')
        add_role(time, ALGORITHM_BUFFER)

        t1 = time + 1
        # beta1 decayed geometrically over time; equals beta1 when
        # decay_factor == 1 (the default).
        beta_1_decayed = self.beta1 * self.decay_factor ** (t1 - 1)
        # Bias correction of the moment estimates, folded into the
        # learning rate. NOTE(review): the first-moment correction uses
        # beta_1_decayed**t1 rather than the paper's plain beta1**t1;
        # these differ when decay_factor != 1 -- confirm intended.
        learning_rate = (self.learning_rate *
                         tensor.sqrt(1. - self.beta2**t1) /
                         (1. - beta_1_decayed**t1))
        # Exponential moving averages of the step and its square.
        mean_t = beta_1_decayed * mean + (1. - beta_1_decayed) * previous_step
        variance_t = (self.beta2 * variance +
                      (1. - self.beta2) * tensor.sqr(previous_step))
        step = (learning_rate * mean_t /
                (tensor.sqrt(variance_t) + self.epsilon))

        updates = [(mean, mean_t),
                   (variance, variance_t),
                   (time, t1)]

        return step, updates
|
895
|
|
|
|
|
896
|
|
|
|
|
897
|
|
|
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    Replaces a step (the parameter update of a single shared variable)
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
    step rescaling the parameters.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        # Summing first means a single isnan/isinf check on a scalar
        # detects any non-finite element anywhere in the step.
        total = tensor.sum(previous_step)
        not_finite = (tensor.isnan(total) +
                      tensor.isinf(total))
        # On a bad step, return (1 - scaler) * parameter instead, so the
        # resulting update rescales the parameter by `scaler`
        # (scaler == 1 leaves it unchanged).
        replacement = (1 - self.scaler) * parameter
        return tensor.switch(not_finite > 0, replacement, previous_step), []
|
930
|
|
|
|
|
931
|
|
|
|
|
932
|
|
|
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Example applications include clipping steps on only certain parameters,
    or scaling a certain kind of parameter's updates (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        # Run the wrapped rule on the restricted subset only.
        restricted = dict_subset(previous_steps, self.variables)
        modified, updates = self.step_rule.compute_steps(restricted)
        # Merge back: take the modified step where one was produced and
        # the original step elsewhere, preserving parameter order.
        combined = OrderedDict(
            (parameter, modified.get(parameter, previous_steps[parameter]))
            for parameter in previous_steps)
        return combined, updates
|
962
|
|
|
|