1
|
|
|
"""Training algorithms.""" |
|
|
|
|
2
|
|
|
import logging |
3
|
|
|
import itertools |
4
|
|
|
from abc import ABCMeta, abstractmethod |
5
|
|
|
from collections import OrderedDict |
6
|
|
|
from collections import Mapping |
7
|
|
|
from six.moves import reduce |
8
|
|
|
|
9
|
|
|
from picklable_itertools.extras import equizip |
10
|
|
|
|
11
|
|
|
import theano |
12
|
|
|
from six import add_metaclass |
13
|
|
|
from theano import tensor |
14
|
|
|
|
15
|
|
|
from blocks.graph import ComputationGraph |
16
|
|
|
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER |
17
|
|
|
from blocks.theano_expressions import l2_norm |
18
|
|
|
from blocks.utils import (dict_subset, pack, shared_floatx, |
19
|
|
|
shared_floatx_zeros_matching) |
20
|
|
|
|
21
|
|
|
logger = logging.getLogger(__name__) |
22
|
|
|
|
23
|
|
|
|
24
|
|
|
def _create_algorithm_buffer_for(param, *args, **kwargs): |
25
|
|
|
buf = shared_floatx_zeros_matching(param, *args, **kwargs) |
26
|
|
|
buf.tag.for_parameter = param |
27
|
|
|
add_role(buf, ALGORITHM_BUFFER) |
28
|
|
|
return buf |
29
|
|
|
|
30
|
|
|
|
31
|
|
|
@add_metaclass(ABCMeta) |
32
|
|
|
class TrainingAlgorithm(object): |
33
|
|
|
"""Base class for training algorithms. |
34
|
|
|
|
35
|
|
|
A training algorithm object has a simple life-cycle. |
36
|
|
|
First it is initialized by calling its :meth:`initialize` method. |
37
|
|
|
At this stage, for instance, Theano functions can be compiled. |
38
|
|
|
After that the :meth:`process_batch` method is repeatedly |
39
|
|
|
called with a batch of training data as a parameter. |
40
|
|
|
|
41
|
|
|
""" |
42
|
|
|
@abstractmethod |
43
|
|
|
def initialize(self, **kwargs): |
44
|
|
|
"""Initialize the training algorithm.""" |
45
|
|
|
pass |
46
|
|
|
|
47
|
|
|
@abstractmethod |
48
|
|
|
def process_batch(self, batch): |
49
|
|
|
"""Process a batch of training data. |
50
|
|
|
|
51
|
|
|
Attributes |
52
|
|
|
---------- |
53
|
|
|
batch : dict |
54
|
|
|
A dictionary of (source name, data) pairs. |
55
|
|
|
|
56
|
|
|
""" |
57
|
|
|
pass |
58
|
|
|
|
59
|
|
|
|
60
|
|
|
variable_mismatch_error = """ |
61
|
|
|
|
62
|
|
|
Blocks tried to match the sources ({sources}) of the training dataset to \ |
63
|
|
|
the names of the Theano variables ({variables}), but failed to do so. \ |
64
|
|
|
If you want to train on a subset of the sources that your dataset provides, \ |
65
|
|
|
pass the `sources` keyword argument to its constructor, use the \ |
66
|
|
|
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \ |
67
|
|
|
or on_unused_sources='ignore' to the GradientDescent algorithm.""" |
68
|
|
|
|
69
|
|
|
source_missing_error = """ |
70
|
|
|
|
71
|
|
|
Blocks didn't find all the sources ({sources}) of the training dataset \ |
72
|
|
|
that match the names of the Theano variables ({variables}).""" |
73
|
|
|
|
74
|
|
|
|
75
|
|
|
determinism_error = """Cannot infer parameter list in a fixed order. |
76
|
|
|
|
77
|
|
|
Because dictionaries are unordered (and Python uses randomized hashing, \ |
78
|
|
|
which can change the iteration order over the same dictionary from one \ |
79
|
|
|
interpreter session to the next), Blocks cannot infer the parameters list \ |
80
|
|
|
from a plain dictionary of gradients in an order that is reproducible \ |
81
|
|
|
across interpreter sessions; please either specify the parameters \ |
82
|
|
|
explicitly or pass gradients as an OrderedDict (though exercise care in \ |
83
|
|
|
constructing that OrderedDict, as an OrderedDict created by iterating \ |
84
|
|
|
over an unordered iterable (e.g. a dict) will still have an arbitrary \ |
85
|
|
|
and unpredictable order that could cause problems with \ |
86
|
|
|
reproducibility).""" |
87
|
|
|
|
88
|
|
|
|
89
|
|
|
class UpdatesAlgorithm(TrainingAlgorithm): |
90
|
|
|
"""Base class for algorithms that use Theano functions with updates. |
91
|
|
|
|
92
|
|
|
Parameters |
93
|
|
|
---------- |
94
|
|
|
updates : list of tuples or :class:`~collections.OrderedDict` |
95
|
|
|
The updates that should be performed. |
96
|
|
|
theano_func_kwargs : dict, optional |
97
|
|
|
A passthrough to `theano.function` for additional arguments. |
98
|
|
|
Useful for passing `profile` or `mode` arguments to the theano |
99
|
|
|
function that will be compiled for the algorithm. |
100
|
|
|
on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn' |
101
|
|
|
Controls behavior when not all sources in a batch are used |
102
|
|
|
(i.e. there is no variable with a matching name in the inputs |
103
|
|
|
of the computational graph of the updates). |
104
|
|
|
|
105
|
|
|
Attributes |
106
|
|
|
---------- |
107
|
|
|
updates : list of :class:`~tensor.TensorSharedVariable` updates |
108
|
|
|
Updates to be done for every batch. It is required that the |
109
|
|
|
updates are done using the old values of optimized parameters. |
110
|
|
|
|
111
|
|
|
Notes |
112
|
|
|
----- |
113
|
|
|
Changing `updates` attribute or calling `add_updates` after |
114
|
|
|
the `initialize` method is called will have no effect. |
115
|
|
|
|
116
|
|
|
""" |
117
|
|
|
def __init__(self, updates=None, theano_func_kwargs=None, |
118
|
|
|
on_unused_sources='raise', **kwargs): |
119
|
|
|
self.updates = [] if updates is None else updates |
120
|
|
|
self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs |
121
|
|
|
is not None else dict()) |
122
|
|
|
self.on_unused_sources = on_unused_sources |
123
|
|
|
super(UpdatesAlgorithm, self).__init__(**kwargs) |
124
|
|
|
|
125
|
|
|
def initialize(self): |
126
|
|
|
logger.info("Initializing the training algorithm") |
127
|
|
|
update_values = [new_value for _, new_value in self.updates] |
128
|
|
|
logger.debug("Inferring graph inputs...") |
129
|
|
|
self.inputs = ComputationGraph(update_values).inputs |
130
|
|
|
logger.debug("Compiling training function...") |
131
|
|
|
self._function = theano.function( |
132
|
|
|
self.inputs, [], updates=self.updates, **self.theano_func_kwargs) |
133
|
|
|
logger.info("The training algorithm is initialized") |
134
|
|
|
|
135
|
|
|
@property |
136
|
|
|
def updates(self): |
137
|
|
|
return self._updates |
138
|
|
|
|
139
|
|
|
@updates.setter |
140
|
|
|
def updates(self, value): |
141
|
|
|
self._updates = value |
142
|
|
|
|
143
|
|
|
def add_updates(self, updates): |
144
|
|
|
"""Add updates to the training process. |
145
|
|
|
|
146
|
|
|
The updates will be done _before_ the parameters are changed. |
147
|
|
|
|
148
|
|
|
Parameters |
149
|
|
|
---------- |
150
|
|
|
updates : list of tuples or :class:`~collections.OrderedDict` |
151
|
|
|
The updates to add. |
152
|
|
|
|
153
|
|
|
""" |
154
|
|
|
if isinstance(updates, OrderedDict): |
155
|
|
|
updates = list(updates.items()) |
156
|
|
|
if not isinstance(updates, list): |
157
|
|
|
raise ValueError |
158
|
|
|
self.updates.extend(updates) |
159
|
|
|
|
160
|
|
|
def _validate_source_names(self, batch): |
161
|
|
|
in_names = [v.name for v in self.inputs] |
162
|
|
|
|
163
|
|
|
if not set(in_names).issubset(set(batch.keys())): |
164
|
|
|
raise ValueError("Didn't find all sources: " + |
165
|
|
|
source_missing_error.format( |
166
|
|
|
sources=batch.keys(), |
167
|
|
|
variables=in_names)) |
168
|
|
|
if not set(batch.keys()).issubset(set(in_names)): |
169
|
|
|
if self.on_unused_sources == 'ignore': |
170
|
|
|
pass |
171
|
|
|
elif self.on_unused_sources == 'warn': |
172
|
|
|
if not hasattr(self, '_unused_source_warned'): |
173
|
|
|
logger.warn(variable_mismatch_error.format( |
174
|
|
|
sources=batch.keys(), |
175
|
|
|
variables=in_names)) |
176
|
|
|
self._unused_source_warned = True |
177
|
|
|
elif self.on_unused_sources == 'raise': |
178
|
|
|
raise ValueError( |
179
|
|
|
"mismatch of variable names and data sources" + |
180
|
|
|
variable_mismatch_error.format( |
181
|
|
|
sources=batch.keys(), |
182
|
|
|
variables=in_names)) |
183
|
|
|
else: |
184
|
|
|
raise ValueError("Wrong value of on_unused_sources: {}." |
185
|
|
|
.format(self.on_unused_sources)) |
186
|
|
|
|
187
|
|
|
def process_batch(self, batch): |
188
|
|
|
self._validate_source_names(batch) |
189
|
|
|
ordered_batch = [batch[v.name] for v in self.inputs] |
190
|
|
|
self._function(*ordered_batch) |
191
|
|
|
|
192
|
|
|
|
193
|
|
|
class GradientDescent(UpdatesAlgorithm): |
194
|
|
|
"""A base class for all gradient descent algorithms. |
195
|
|
|
|
196
|
|
|
By "gradient descent" we mean a training algorithm of the following |
197
|
|
|
form: |
198
|
|
|
|
199
|
|
|
.. code-block:: python |
200
|
|
|
|
201
|
|
|
for batch in data: |
202
|
|
|
steps = step_rule.compute_steps(parameters, |
203
|
|
|
gradients_wr_parameters) |
204
|
|
|
for parameter in parameters: |
205
|
|
|
parameter -= steps[parameter] |
206
|
|
|
|
207
|
|
|
Note, that the step is *subtracted, not added*! This is done in order |
208
|
|
|
to make step rule chaining possible. |
209
|
|
|
|
210
|
|
|
Parameters |
211
|
|
|
---------- |
212
|
|
|
cost : :class:`~tensor.TensorVariable`, optional |
213
|
|
|
The objective to be minimized. Unused if `gradients` is specified. |
214
|
|
|
parameters : list of :class:`~tensor.TensorSharedVariable`, optional |
215
|
|
|
The parameters to be tuned. If not provided, inferred from the |
216
|
|
|
keys of `gradients` (in which case `gradients` *must* be an |
217
|
|
|
`OrderedDict`). |
218
|
|
|
step_rule : instance of :class:`StepRule`, optional |
219
|
|
|
An object encapsulating most of the algorithm's logic. Its |
220
|
|
|
`compute_steps` method is called to get Theano expression for |
221
|
|
|
steps. Note, that the step rule might have a state, e.g. to |
222
|
|
|
remember a weighted sum of gradients from previous steps like it is |
223
|
|
|
done in gradient descent with momentum. If ``None``, an instance of |
224
|
|
|
:class:`Scale` is created. |
225
|
|
|
gradients : OrderedDict or list of 2-tuples, optional |
226
|
|
|
A dictionary mapping a parameter to an expression for the cost's |
227
|
|
|
gradient with respect to the parameter, or equivalently, a list of |
228
|
|
|
(parameter, gradient) tuples. If ``None``, the gradient |
229
|
|
|
are taken automatically using :func:`theano.gradient.grad`. |
230
|
|
|
known_grads : dict, optional |
231
|
|
|
A passthrough to `theano.tensor.grad`'s `known_grads` argument. |
232
|
|
|
Useful when you know the [approximate] gradients of some |
233
|
|
|
sub-expressions and would like Theano to use that information |
234
|
|
|
to compute parameter gradients. Only makes sense when `gradients` |
235
|
|
|
is `None`. |
236
|
|
|
consider_constant : list, optional |
237
|
|
|
A passthrough to `theano.tensor.grad`'s `consider_constant` |
238
|
|
|
argument. A list of expressions through which gradients will not |
239
|
|
|
be backpropagated. Only makes sense when `gradients` is `None`. |
240
|
|
|
|
241
|
|
|
Attributes |
242
|
|
|
---------- |
243
|
|
|
gradients : OrderedDict |
244
|
|
|
The gradient dictionary. |
245
|
|
|
step_rule : instance of :class:`StepRule` |
246
|
|
|
The step rule. |
247
|
|
|
|
248
|
|
|
Notes |
249
|
|
|
----- |
250
|
|
|
Changing `updates` attribute or calling `add_updates` after |
251
|
|
|
the `initialize` method is called will have no effect. |
252
|
|
|
|
253
|
|
|
If a cost and parameters are provided, gradients are taken immediately |
254
|
|
|
upon construction, and changes to these attributes after construction |
255
|
|
|
will have no effect. |
256
|
|
|
|
257
|
|
|
`gradients` must be an `OrderedDict` if `parameters` is unspecified |
258
|
|
|
because ordinary dictionaries have an unpredictable iteration |
259
|
|
|
order due to hash randomization (which is enabled by default since |
260
|
|
|
versions 2.7.3 and 3.2.3 of Python). This source of variability, |
261
|
|
|
when combined with Theano's heuristic graph optimizations, can cause |
262
|
|
|
serious reproducibility issues. |
263
|
|
|
|
264
|
|
|
""" |
265
|
|
|
def __init__(self, cost=None, parameters=None, step_rule=None, |
266
|
|
|
gradients=None, known_grads=None, consider_constant=None, |
267
|
|
|
**kwargs): |
268
|
|
|
# Set initial values for cost, parameters, gradients. |
269
|
|
|
self.cost = cost |
270
|
|
|
self.parameters = parameters |
271
|
|
|
# Coerce lists of tuples to OrderedDict. Do not coerce Mappings, |
272
|
|
|
# as we don't want to convert dict -> OrderedDict and give it |
273
|
|
|
# an arbitrary, non-deterministic order. |
274
|
|
|
if gradients is not None and not isinstance(gradients, Mapping): |
275
|
|
|
gradients = OrderedDict(gradients) |
276
|
|
|
self.gradients = gradients |
277
|
|
|
|
278
|
|
|
# If we don't have gradients, we'll need to infer them from the |
279
|
|
|
# cost and the parameters, both of which must not be None. |
280
|
|
|
if not self.gradients: |
281
|
|
|
self.gradients = self._compute_gradients(known_grads, |
282
|
|
|
consider_constant) |
283
|
|
|
else: |
284
|
|
|
if cost is not None: |
285
|
|
|
logger.warning(('{}: gradients already specified directly; ' |
286
|
|
|
'cost is unused.' |
287
|
|
|
.format(self.__class__.__name__))) |
288
|
|
|
if self.parameters is None and isinstance(gradients, OrderedDict): |
289
|
|
|
# If the dictionary is ordered, it's safe to use the keys |
290
|
|
|
# as they have a deterministic order. |
291
|
|
|
self.parameters = list(self.gradients.keys()) |
292
|
|
|
elif self.parameters is not None: |
293
|
|
|
# If parameters and gradients.keys() don't match we can |
294
|
|
|
# try to recover if gradients is ordered. |
295
|
|
|
if set(self.parameters) != set(self.gradients.keys()): |
296
|
|
|
logger.warn("Specified parameters list does not match " |
297
|
|
|
"keys in provided gradient dictionary; " |
298
|
|
|
"using parameters inferred from gradients") |
299
|
|
|
if not isinstance(self.gradients, OrderedDict): |
300
|
|
|
raise ValueError(determinism_error) |
301
|
|
|
self.parameters = list(self.gradients.keys()) |
302
|
|
|
else: |
303
|
|
|
# self.parameters is not None, and gradients isn't |
304
|
|
|
# an OrderedDict. We can't do anything safe. |
305
|
|
|
raise ValueError(determinism_error) |
306
|
|
|
if known_grads: |
307
|
|
|
raise ValueError("known_grads has no effect when gradients " |
308
|
|
|
"are passed in") |
309
|
|
|
if consider_constant is not None: |
310
|
|
|
raise ValueError("consider_constant has no effect when " |
311
|
|
|
"gradients are passed in") |
312
|
|
|
|
313
|
|
|
# The order in which the different gradient terms appears |
314
|
|
|
# here matters, as floating point addition is non-commutative (and |
315
|
|
|
# Theano's graph optimizations are not order-independent). |
316
|
|
|
# This is why we do not use .values(). |
317
|
|
|
gradient_values = [self.gradients[p] for p in self.parameters] |
318
|
|
|
self.total_gradient_norm = (l2_norm(gradient_values) |
319
|
|
|
.copy(name="total_gradient_norm")) |
320
|
|
|
|
321
|
|
|
self.step_rule = step_rule if step_rule else Scale() |
322
|
|
|
logger.debug("Computing parameter steps...") |
323
|
|
|
self.steps, self.step_rule_updates = ( |
324
|
|
|
self.step_rule.compute_steps(self.gradients)) |
325
|
|
|
|
326
|
|
|
# Same as gradient_values above: the order may influence a |
327
|
|
|
# bunch of things, so enforce a consistent one (don't use |
328
|
|
|
# .values()). |
329
|
|
|
step_values = [self.steps[p] for p in self.parameters] |
330
|
|
|
self.total_step_norm = (l2_norm(step_values) |
331
|
|
|
.copy(name="total_step_norm")) |
332
|
|
|
|
333
|
|
|
# Once again, iterating on gradients may not be deterministically |
334
|
|
|
# ordered if it is not an OrderedDict. We add the updates here in |
335
|
|
|
# the order specified in self.parameters. Keep it this way to |
336
|
|
|
# maintain reproducibility. |
337
|
|
|
kwargs.setdefault('updates', []).extend( |
338
|
|
|
itertools.chain(((parameter, parameter - self.steps[parameter]) |
339
|
|
|
for parameter in self.parameters), |
340
|
|
|
self.step_rule_updates) |
341
|
|
|
) |
342
|
|
|
super(GradientDescent, self).__init__(**kwargs) |
343
|
|
|
|
344
|
|
|
def _compute_gradients(self, known_grads, consider_constant): |
345
|
|
|
if self.cost is None: |
346
|
|
|
raise ValueError("can't infer gradients; no cost specified") |
347
|
|
|
elif self.parameters is None or len(self.parameters) == 0: |
348
|
|
|
raise ValueError("can't infer gradients; no parameters " |
349
|
|
|
"specified") |
350
|
|
|
# While this strictly speaking could be a dict and not an |
351
|
|
|
# OrderedDict (because we iterate over it in the order of |
352
|
|
|
# self.parameters), this guards a little bit against |
353
|
|
|
# nondeterminism introduced by future refactoring. |
354
|
|
|
logger.info("Taking the cost gradient") |
355
|
|
|
gradients = OrderedDict( |
356
|
|
|
equizip(self.parameters, tensor.grad( |
357
|
|
|
self.cost, self.parameters, |
358
|
|
|
known_grads=known_grads, |
359
|
|
|
consider_constant=consider_constant))) |
360
|
|
|
logger.info("The cost gradient computation graph is built") |
361
|
|
|
return gradients |
362
|
|
|
|
363
|
|
|
|
364
|
|
|
@add_metaclass(ABCMeta) |
365
|
|
|
class StepRule(object): |
366
|
|
|
"""A rule to compute steps for a gradient descent algorithm.""" |
367
|
|
|
def compute_step(self, parameter, previous_step): |
368
|
|
|
"""Build a Theano expression for the step for a parameter. |
369
|
|
|
|
370
|
|
|
This method is called by default implementation of |
371
|
|
|
:meth:`compute_steps`, it relieves from writing a loop each time. |
372
|
|
|
|
373
|
|
|
Parameters |
374
|
|
|
---------- |
375
|
|
|
parameter : :class:`~tensor.TensorSharedVariable` |
376
|
|
|
The parameter. |
377
|
|
|
previous_step : :class:`~tensor.TensorVariable` |
378
|
|
|
Some quantity related to the gradient of the cost with respect |
379
|
|
|
to the parameter, either the gradient itself or a step in a |
380
|
|
|
related direction. |
381
|
|
|
|
382
|
|
|
Returns |
383
|
|
|
------- |
384
|
|
|
step : :class:`~theano.Variable` |
385
|
|
|
Theano variable for the step to take. |
386
|
|
|
updates : list |
387
|
|
|
A list of tuples representing updates to be performed. This |
388
|
|
|
is useful for stateful rules such as :class:`Momentum` which |
389
|
|
|
need to update shared variables after itetations. |
390
|
|
|
|
391
|
|
|
""" |
392
|
|
|
raise NotImplementedError |
393
|
|
|
|
394
|
|
|
def compute_steps(self, previous_steps): |
395
|
|
|
"""Build a Theano expression for steps for all parameters. |
396
|
|
|
|
397
|
|
|
Override this method if you want to process the steps |
398
|
|
|
with respect to all parameters as a whole, not parameter-wise. |
399
|
|
|
|
400
|
|
|
Parameters |
401
|
|
|
---------- |
402
|
|
|
previous_steps : OrderedDict |
403
|
|
|
An :class:`~OrderedDict` of |
404
|
|
|
(:class:`~tensor.TensorSharedVariable` |
405
|
|
|
:class:`~tensor.TensorVariable`) pairs. The keys are the |
406
|
|
|
parameters being trained, the values are the expressions for |
407
|
|
|
quantities related to gradients of the cost with respect to |
408
|
|
|
the parameters, either the gradients themselves or steps in |
409
|
|
|
related directions. |
410
|
|
|
|
411
|
|
|
Returns |
412
|
|
|
------- |
413
|
|
|
steps : OrderedDict |
414
|
|
|
A dictionary of the proposed steps in the same form as |
415
|
|
|
`previous_steps`. |
416
|
|
|
updates : list |
417
|
|
|
A list of tuples representing updates to be performed. |
418
|
|
|
|
419
|
|
|
""" |
420
|
|
|
parameter_wise = [self.compute_step(parameter, |
421
|
|
|
previous_steps[parameter]) |
422
|
|
|
for parameter in previous_steps] |
423
|
|
|
steps, updates = equizip(*parameter_wise) |
424
|
|
|
steps = OrderedDict((parameter, step) for parameter, step |
425
|
|
|
in equizip(previous_steps.keys(), steps)) |
426
|
|
|
updates = list(itertools.chain(*updates)) |
427
|
|
|
return steps, updates |
428
|
|
|
|
429
|
|
|
|
430
|
|
|
class CompositeRule(StepRule): |
431
|
|
|
"""Chains several step rules. |
432
|
|
|
|
433
|
|
|
Parameters |
434
|
|
|
---------- |
435
|
|
|
components : list of :class:`StepRule` |
436
|
|
|
The learning rules to be chained. The rules will be applied in the |
437
|
|
|
order as given. |
438
|
|
|
|
439
|
|
|
""" |
440
|
|
|
def __init__(self, components): |
441
|
|
|
self.components = components |
442
|
|
|
|
443
|
|
|
def compute_steps(self, previous_steps): |
444
|
|
|
steps = previous_steps |
445
|
|
|
updates = [] |
446
|
|
|
for rule in self.components: |
447
|
|
|
steps, more_updates = rule.compute_steps(steps) |
448
|
|
|
updates += more_updates |
449
|
|
|
return steps, updates |
450
|
|
|
|
451
|
|
|
|
452
|
|
|
class Scale(StepRule): |
453
|
|
|
"""A step in the direction proportional to the previous step. |
454
|
|
|
|
455
|
|
|
If used in :class:`GradientDescent` alone, this step rule implements |
456
|
|
|
steepest descent. |
457
|
|
|
|
458
|
|
|
Parameters |
459
|
|
|
---------- |
460
|
|
|
learning_rate : float |
461
|
|
|
The learning rate by which the previous step is multiplied to |
462
|
|
|
produce the step. |
463
|
|
|
|
464
|
|
|
Attributes |
465
|
|
|
---------- |
466
|
|
|
learning_rate : :class:`~tensor.TensorSharedVariable` |
467
|
|
|
The shared variable storing the learning rate used. |
468
|
|
|
|
469
|
|
|
""" |
470
|
|
|
def __init__(self, learning_rate=1.0): |
471
|
|
|
self.learning_rate = shared_floatx(learning_rate, "learning_rate") |
472
|
|
|
add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER) |
473
|
|
|
|
474
|
|
|
def compute_step(self, parameter, previous_step): |
475
|
|
|
return self.learning_rate * previous_step, [] |
476
|
|
|
|
477
|
|
|
|
478
|
|
|
class BasicMomentum(StepRule): |
479
|
|
|
"""Accumulates step with exponential discount. |
480
|
|
|
|
481
|
|
|
Parameters |
482
|
|
|
---------- |
483
|
|
|
momentum : float, optional |
484
|
|
|
The momentum coefficient. Defaults to 0. |
485
|
|
|
|
486
|
|
|
Notes |
487
|
|
|
----- |
488
|
|
|
This step rule is intended to be used in conjunction with another |
489
|
|
|
step rule, _e.g._ :class:`Scale`. For an all-batteries-included |
490
|
|
|
experience, look at :class:`Momentum`. |
491
|
|
|
|
492
|
|
|
""" |
493
|
|
|
def __init__(self, momentum=0.): |
494
|
|
|
self.momentum = shared_floatx(momentum, "momentum") |
495
|
|
|
add_role(self.momentum, ALGORITHM_HYPERPARAMETER) |
496
|
|
|
|
497
|
|
|
def compute_step(self, parameter, previous_step): |
498
|
|
|
velocity = _create_algorithm_buffer_for(parameter, "velocity") |
499
|
|
|
step = self.momentum * velocity + previous_step |
500
|
|
|
updates = [(velocity, step)] |
501
|
|
|
return step, updates |
502
|
|
|
|
503
|
|
|
|
504
|
|
|
class Momentum(CompositeRule): |
505
|
|
|
"""Accumulates step with exponential discount. |
506
|
|
|
|
507
|
|
|
Combines :class:`BasicMomentum` and :class:`Scale` to form the |
508
|
|
|
usual momentum step rule. |
509
|
|
|
|
510
|
|
|
Parameters |
511
|
|
|
---------- |
512
|
|
|
learning_rate : float, optional |
513
|
|
|
The learning rate by which the previous step scaled. Defaults to 1. |
514
|
|
|
momentum : float, optional |
515
|
|
|
The momentum coefficient. Defaults to 0. |
516
|
|
|
|
517
|
|
|
Attributes |
518
|
|
|
---------- |
519
|
|
|
learning_rate : :class:`~tensor.SharedVariable` |
520
|
|
|
A variable for learning rate. |
521
|
|
|
momentum : :class:`~tensor.SharedVariable` |
522
|
|
View Code Duplication |
A variable for momentum. |
|
|
|
|
523
|
|
|
|
524
|
|
|
See Also |
525
|
|
|
-------- |
526
|
|
|
:class:`SharedVariableModifier` |
527
|
|
|
|
528
|
|
|
""" |
529
|
|
|
def __init__(self, learning_rate=1.0, momentum=0.): |
530
|
|
|
scale = Scale(learning_rate=learning_rate) |
531
|
|
|
basic_momentum = BasicMomentum(momentum=momentum) |
532
|
|
|
self.learning_rate = scale.learning_rate |
533
|
|
|
self.momentum = basic_momentum.momentum |
534
|
|
|
self.components = [scale, basic_momentum] |
535
|
|
|
|
536
|
|
|
|
537
|
|
|
class AdaDelta(StepRule): |
538
|
|
|
"""Adapts the step size over time using only first order information. |
539
|
|
|
|
540
|
|
|
Parameters |
541
|
|
|
---------- |
542
|
|
|
decay_rate : float, optional |
543
|
|
|
Decay rate in [0, 1]. Defaults to 0.95. |
544
|
|
|
epsilon : float, optional |
545
|
|
|
Stabilizing constant for RMS. Defaults to 1e-6. |
546
|
|
|
|
547
|
|
|
Notes |
548
|
|
|
----- |
549
|
|
|
For more information, see [ADADELTA]_. |
550
|
|
|
|
551
|
|
|
.. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning |
552
|
|
|
Rate Method*, arXiv:1212.5701. |
553
|
|
|
|
554
|
|
|
""" |
555
|
|
|
def __init__(self, decay_rate=0.95, epsilon=1e-6): |
556
|
|
|
if not 0.0 <= decay_rate <= 1.0: |
557
|
|
|
raise ValueError("decay rate needs to be in [0, 1]") |
558
|
|
|
self.decay_rate = shared_floatx(decay_rate, "decay_rate") |
559
|
|
|
add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER) |
560
|
|
|
self.epsilon = shared_floatx(epsilon, "epsilon") |
561
|
|
|
add_role(self.epsilon, ALGORITHM_HYPERPARAMETER) |
562
|
|
|
|
563
|
|
|
def compute_step(self, parameter, previous_step): |
564
|
|
|
mean_square_step_tm1 = _create_algorithm_buffer_for( |
565
|
|
|
parameter, "mean_square_step_tm1") |
566
|
|
|
mean_square_delta_x_tm1 = _create_algorithm_buffer_for( |
567
|
|
|
parameter, "mean_square_delta_x_tm1") |
568
|
|
|
|
569
|
|
|
mean_square_step_t = ( |
570
|
|
|
self.decay_rate * mean_square_step_tm1 + |
571
|
|
|
(1 - self.decay_rate) * tensor.sqr(previous_step) |
572
|
|
|
) |
573
|
|
|
|
574
|
|
|
rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon) |
575
|
|
|
rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon) |
576
|
|
|
delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step |
577
|
|
|
|
578
|
|
|
mean_square_delta_x_t = ( |
579
|
|
|
self.decay_rate * mean_square_delta_x_tm1 + |
580
|
|
|
(1 - self.decay_rate) * tensor.sqr(delta_x_t) |
581
|
|
|
) |
582
|
|
|
|
583
|
|
|
step = delta_x_t |
584
|
|
|
updates = [(mean_square_step_tm1, mean_square_step_t), |
585
|
|
|
(mean_square_delta_x_tm1, mean_square_delta_x_t)] |
586
|
|
|
return step, updates |
587
|
|
|
|
588
|
|
|
|
589
|
|
|
class BasicRMSProp(StepRule): |
590
|
|
|
"""Scales the step size by a running average of the recent step norms. |
591
|
|
|
|
592
|
|
|
Parameters |
593
|
|
|
---------- |
594
|
|
|
decay_rate : float, optional |
595
|
|
|
How fast the running average decays, value in [0, 1] |
596
|
|
|
(lower is faster). Defaults to 0.9. |
597
|
|
|
max_scaling : float, optional |
598
|
|
|
Maximum scaling of the step size, in case the running average is |
599
|
|
|
really small. Needs to be greater than 0. Defaults to 1e5. |
600
|
|
|
|
601
|
|
|
Notes |
602
|
|
|
----- |
603
|
|
|
This step rule is intended to be used in conjunction with another |
604
|
|
|
step rule, _e.g._ :class:`Scale`. For an all-batteries-included |
605
|
|
|
experience, look at :class:`RMSProp`. |
606
|
|
|
|
607
|
|
|
In general, this step rule should be used _before_ other step rules, |
608
|
|
|
because it has normalization properties that may undo their work. |
609
|
|
|
For instance, it should be applied first when used in conjunction |
610
|
|
|
with :class:`Scale`. |
611
|
|
|
|
612
|
|
|
For more information, see [Hint2014]_. |
613
|
|
|
|
614
|
|
|
""" |
615
|
|
|
def __init__(self, decay_rate=0.9, max_scaling=1e5): |
616
|
|
|
if not 0.0 <= decay_rate <= 1.0: |
617
|
|
|
raise ValueError("decay rate needs to be in [0, 1]") |
618
|
|
|
if max_scaling <= 0: |
619
|
|
|
raise ValueError("max. scaling needs to be greater than 0") |
620
|
|
|
self.decay_rate = shared_floatx(decay_rate, "decay_rate") |
621
|
|
|
add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER) |
622
|
|
|
self.epsilon = 1. / max_scaling |
623
|
|
|
|
624
|
|
|
def compute_step(self, parameter, previous_step): |
625
|
|
|
mean_square_step_tm1 = _create_algorithm_buffer_for( |
626
|
|
|
parameter, "mean_square_step_tm1") |
627
|
|
|
mean_square_step_t = ( |
628
|
|
|
self.decay_rate * mean_square_step_tm1 + |
629
|
|
|
(1 - self.decay_rate) * tensor.sqr(previous_step)) |
630
|
|
|
rms_step_t = tensor.maximum( |
631
|
|
|
tensor.sqrt(mean_square_step_t), self.epsilon) |
632
|
|
|
step = previous_step / rms_step_t |
633
|
|
|
updates = [(mean_square_step_tm1, mean_square_step_t)] |
634
|
|
|
return step, updates |
635
|
|
|
|
636
|
|
|
|
637
|
|
|
class RMSProp(CompositeRule): |
638
|
|
|
"""Scales the step size by a running average of the recent step norms. |
639
|
|
|
|
640
|
|
|
Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule |
641
|
|
|
described in [Hint2014]_. |
642
|
|
|
|
643
|
|
|
.. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*, |
644
|
|
|
lecture 6a, |
645
|
|
|
http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf |
646
|
|
|
|
647
|
|
|
Parameters |
648
|
|
|
---------- |
649
|
|
|
learning_rate : float, optional |
650
|
|
|
The learning rate by which the previous step scaled. Defaults to 1. |
651
|
|
|
decay_rate : float, optional |
652
|
|
|
How fast the running average decays (lower is faster). |
653
|
|
|
Defaults to 0.9. |
654
|
|
|
max_scaling : float, optional |
655
|
|
|
Maximum scaling of the step size, in case the running average is |
656
|
|
|
really small. Defaults to 1e5. |
657
|
|
|
|
658
|
|
|
Attributes |
659
|
|
|
---------- |
660
|
|
|
learning_rate : :class:`~tensor.SharedVariable` |
661
|
|
|
A variable for learning rate. |
662
|
|
|
decay_rate : :class:`~tensor.SharedVariable` |
663
|
|
|
A variable for decay rate. |
664
|
|
|
|
665
|
|
|
See Also |
666
|
|
|
-------- |
667
|
|
|
:class:`SharedVariableModifier` |
668
|
|
|
|
669
|
|
|
""" |
670
|
|
|
def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5): |
671
|
|
|
basic_rms_prop = BasicRMSProp(decay_rate=decay_rate, |
672
|
|
|
max_scaling=max_scaling) |
673
|
|
|
scale = Scale(learning_rate=learning_rate) |
674
|
|
|
self.learning_rate = scale.learning_rate |
675
|
|
|
self.decay_rate = basic_rms_prop.decay_rate |
676
|
|
|
self.components = [basic_rms_prop, scale] |
677
|
|
|
|
678
|
|
|
|
679
|
|
|
class StepClipping(StepRule): |
680
|
|
|
"""Rescales an entire step if its L2 norm exceeds a threshold. |
681
|
|
|
|
682
|
|
|
When the previous steps are the gradients, this step rule performs |
683
|
|
|
gradient clipping. |
684
|
|
|
|
685
|
|
|
Parameters |
686
|
|
|
---------- |
687
|
|
|
threshold : float, optional |
688
|
|
|
The maximum permitted L2 norm for the step. The step |
689
|
|
|
will be rescaled to be not higher than this quanity. |
690
|
|
|
If ``None``, no rescaling will be applied. |
691
|
|
|
|
692
|
|
|
Attributes |
693
|
|
|
---------- |
694
|
|
|
threshold : :class:`.tensor.TensorSharedVariable` |
695
|
|
|
The shared variable storing the clipping threshold used. |
696
|
|
|
|
697
|
|
|
""" |
698
|
|
|
def __init__(self, threshold=None): |
699
|
|
|
if threshold is not None: |
700
|
|
|
threshold = shared_floatx(threshold, "threshold") |
701
|
|
|
add_role(threshold, ALGORITHM_HYPERPARAMETER) |
702
|
|
|
self.threshold = threshold |
703
|
|
|
|
704
|
|
|
def compute_steps(self, previous_steps): |
705
|
|
|
if self.threshold is None: |
706
|
|
|
steps = previous_steps |
707
|
|
|
else: |
708
|
|
|
norm = l2_norm(previous_steps.values()) |
709
|
|
|
multiplier = tensor.switch(norm < self.threshold, |
710
|
|
|
1, self.threshold / norm) |
711
|
|
|
steps = OrderedDict( |
712
|
|
|
(parameter, step * multiplier) |
713
|
|
|
for parameter, step in previous_steps.items()) |
714
|
|
|
return steps, [] |
715
|
|
|
|
716
|
|
|
|
717
|
|
|
class VariableClipping(StepRule): |
718
|
|
|
"""Clip the maximum norm of individual variables along certain axes. |
719
|
|
|
|
720
|
|
|
This :class:`StepRule` can be used to implement L2 norm constraints on |
721
|
|
|
e.g. the weight vectors of individual hidden units, convolutional |
722
|
|
|
filters or entire weight tensors. Combine with :class:`Restrict` |
723
|
|
|
(and possibly :class:`CompositeRule`), to apply such constraints only |
724
|
|
|
to certain variables and/or apply different norm constraints to |
725
|
|
View Code Duplication |
different variables. |
|
|
|
|
726
|
|
|
|
727
|
|
|
Parameters |
728
|
|
|
---------- |
729
|
|
|
threshold : float |
730
|
|
|
Maximum norm for a given (portion of a) tensor. |
731
|
|
|
axis : int or iterable, optional |
732
|
|
|
An integer single axis, or an iterable collection of integer |
733
|
|
|
axes over which to sum in order to calculate the L2 norm. If |
734
|
|
|
`None` (the default), the norm is computed over all elements |
735
|
|
|
of the tensor. |
736
|
|
|
|
737
|
|
|
Notes |
738
|
|
|
----- |
739
|
|
|
Because of the way the :class:`StepRule` API works, this particular |
740
|
|
|
rule implements norm clipping of the value *after* update in the |
741
|
|
|
following way: it computes ``parameter - previous_step``, scales it |
742
|
|
|
to have (possibly axes-wise) norm(s) of at most `threshold`, |
743
|
|
|
then subtracts *that* value from `parameter` to yield an 'equivalent |
744
|
|
|
step' that respects the desired norm constraints. This procedure |
745
|
|
|
implicitly assumes one is doing simple (stochastic) gradient descent, |
746
|
|
|
and so steps computed by this step rule may not make sense for use |
747
|
|
|
in other contexts. |
748
|
|
|
|
749
|
|
|
Investigations into max-norm regularization date from [Srebro2005]_. |
750
|
|
|
The first appearance of this technique as a regularization method |
751
|
|
|
for the weight vectors of individual hidden units in feed-forward |
752
|
|
|
neural networks may be [Hinton2012]_. |
753
|
|
|
|
754
|
|
|
.. [Srebro2005] Nathan Srebro and Adi Shraibman. |
755
|
|
|
"Rank, Trace-Norm and Max-Norm". *18th Annual Conference |
756
|
|
|
on Learning Theory (COLT)*, June 2005. |
757
|
|
|
|
758
|
|
|
.. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava, |
759
|
|
|
Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov. |
760
|
|
|
"Improving neural networks by preventing co-adaptation of |
761
|
|
|
feature detectors". arXiv:1207.0580. |
762
|
|
|
|
763
|
|
|
""" |
764
|
|
|
def __init__(self, threshold, axis=None): |
765
|
|
|
axis = pack(axis) if axis is not None else () |
766
|
|
|
self.axis = set(axis) |
767
|
|
|
self.threshold = shared_floatx(threshold, "threshold") |
768
|
|
|
add_role(self.threshold, ALGORITHM_HYPERPARAMETER) |
769
|
|
|
if len(axis) != len(self.axis): |
770
|
|
|
raise ValueError("axis must be unique") |
771
|
|
|
|
772
|
|
|
def compute_step(self, parameter, previous_step): |
773
|
|
|
if any(ax >= previous_step.ndim for ax in self.axis): |
774
|
|
|
raise ValueError("Invalid axis {} for {}, ndim={}".format( |
775
|
|
|
self.axis, parameter, previous_step.ndim)) |
776
|
|
|
if len(self.axis) == 0: |
777
|
|
|
norms = l2_norm([parameter - previous_step]) |
778
|
|
|
else: |
779
|
|
|
squares = tensor.sqr(parameter - previous_step) |
780
|
|
|
norms = tensor.sqrt( |
781
|
|
|
reduce(lambda t, a: t.sum(axis=a, keepdims=True), |
782
|
|
|
sorted(self.axis), squares)) |
783
|
|
|
# We want a step s* that is the same as scaling |
784
|
|
|
# (parameter - previous_step) by threshold / norm |
785
|
|
|
# when threshold < norm. |
786
|
|
|
shrinking_step = (parameter - |
787
|
|
|
(self.threshold / norms) * |
788
|
|
|
(parameter - previous_step)) |
789
|
|
|
return tensor.switch(norms > self.threshold, |
790
|
|
|
shrinking_step, |
791
|
|
|
previous_step), () |
792
|
|
|
|
793
|
|
|
|
794
|
|
|
class AdaGrad(StepRule): |
795
|
|
|
"""Implements the AdaGrad learning rule. |
796
|
|
|
|
797
|
|
|
Parameters |
798
|
|
|
---------- |
799
|
|
|
learning_rate : float, optional |
800
|
|
|
Step size. |
801
|
|
|
Default value is set to 0.0002. |
802
|
|
|
epsilon : float, optional |
803
|
|
|
Stabilizing constant for one over root of sum of squares. |
804
|
|
|
Defaults to 1e-6. |
805
|
|
|
|
806
|
|
|
Notes |
807
|
|
|
----- |
808
|
|
|
For more information, see [ADAGRAD]_. |
809
|
|
|
|
810
|
|
|
.. [ADAGRAD] Duchi J, Hazan E, Singer Y., |
811
|
|
|
*Adaptive subgradient methods for online learning and |
812
|
|
|
stochastic optimization*, |
813
|
|
|
http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf |
814
|
|
|
|
815
|
|
|
""" |
816
|
|
|
def __init__(self, learning_rate=0.002, epsilon=1e-6): |
817
|
|
|
self.learning_rate = shared_floatx(learning_rate, "learning_rate") |
818
|
|
|
self.epsilon = shared_floatx(epsilon, "epsilon") |
819
|
|
|
add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER) |
820
|
|
|
add_role(self.epsilon, ALGORITHM_HYPERPARAMETER) |
821
|
|
|
|
822
|
|
|
def compute_step(self, parameter, previous_step): |
823
|
|
|
name = 'adagrad_sqs' |
824
|
|
|
if parameter.name: |
825
|
|
|
name += '_' + parameter.name |
826
|
|
|
ssq = _create_algorithm_buffer_for(parameter, name=name) |
827
|
|
|
|
828
|
|
|
ssq_t = (tensor.sqr(previous_step) + ssq) |
829
|
|
|
step = (self.learning_rate * previous_step / |
830
|
|
|
(tensor.sqrt(ssq_t) + self.epsilon)) |
831
|
|
|
|
832
|
|
|
updates = [(ssq, ssq_t)] |
833
|
|
|
|
834
|
|
|
return step, updates |
835
|
|
|
|
836
|
|
|
|
837
|
|
|
class Adam(StepRule): |
838
|
|
|
"""Adam optimizer as described in [King2014]_. |
839
|
|
|
|
840
|
|
|
.. [King2014] Diederik Kingma, Jimmy Ba, |
841
|
|
|
*Adam: A Method for Stochastic Optimization*, |
842
|
|
|
http://arxiv.org/abs/1412.6980 |
843
|
|
|
|
844
|
|
|
Parameters |
845
|
|
|
---------- |
846
|
|
|
learning_rate : float, optional |
847
|
|
|
Step size. |
848
|
|
|
Default value is set to 0.002. |
849
|
|
|
beta1 : float, optional |
850
|
|
|
Exponential decay rate for the first moment estimates. |
851
|
|
|
Default value is set to 0.1. |
852
|
|
|
beta2 : float, optional |
853
|
|
|
Exponential decay rate for the second moment estimates. |
854
|
|
|
Default value is set to 0.001. |
855
|
|
|
epsilon : float, optional |
856
|
|
|
Default value is set to 1e-8. |
857
|
|
|
decay_factor : float, optional |
858
|
|
|
Default value is set to 1 - 1e-8. |
859
|
|
|
|
860
|
|
|
""" |
861
|
|
|
def __init__(self, learning_rate=0.002, |
862
|
|
|
beta1=0.1, beta2=0.001, epsilon=1e-8, |
863
|
|
|
decay_factor=(1 - 1e-8)): |
864
|
|
|
self.learning_rate = shared_floatx(learning_rate, "learning_rate") |
865
|
|
|
self.beta1 = shared_floatx(beta1, "beta1") |
866
|
|
|
self.beta2 = shared_floatx(beta2, "beta2") |
867
|
|
|
self.epsilon = shared_floatx(epsilon, "epsilon") |
868
|
|
|
self.decay_factor = shared_floatx(decay_factor, "decay_factor") |
869
|
|
|
for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon, |
870
|
|
|
self.decay_factor]: |
871
|
|
|
add_role(param, ALGORITHM_HYPERPARAMETER) |
872
|
|
|
|
873
|
|
|
def compute_step(self, parameter, previous_step): |
874
|
|
|
mean = _create_algorithm_buffer_for(parameter, 'mean') |
875
|
|
|
variance = _create_algorithm_buffer_for(parameter, 'variance') |
876
|
|
|
time = shared_floatx(0., 'time') |
877
|
|
|
add_role(time, ALGORITHM_BUFFER) |
878
|
|
|
|
879
|
|
|
t1 = time + 1 |
880
|
|
|
learning_rate = (self.learning_rate * |
881
|
|
|
tensor.sqrt((1. - (1. - self.beta2)**t1)) / |
882
|
|
|
(1. - (1. - self.beta1)**t1)) |
883
|
|
|
beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1) |
884
|
|
|
mean_t = beta_1t * previous_step + (1. - beta_1t) * mean |
885
|
|
|
variance_t = (self.beta2 * tensor.sqr(previous_step) + |
886
|
|
|
(1. - self.beta2) * variance) |
887
|
|
|
step = (learning_rate * mean_t / |
888
|
|
|
(tensor.sqrt(variance_t) + self.epsilon)) |
889
|
|
|
|
890
|
|
|
updates = [(mean, mean_t), |
891
|
|
|
(variance, variance_t), |
892
|
|
|
(time, t1)] |
893
|
|
|
|
894
|
|
|
return step, updates |
895
|
|
|
|
896
|
|
|
|
897
|
|
|
class RemoveNotFinite(StepRule): |
898
|
|
|
"""A step rule that skips steps with non-finite elements. |
899
|
|
|
|
900
|
|
|
Replaces a step (the parameter update of a single shared variable) |
901
|
|
|
which contains non-finite elements (such as ``inf`` or ``NaN``) with a |
902
|
|
|
step rescaling the parameters. |
903
|
|
|
|
904
|
|
|
Parameters |
905
|
|
|
---------- |
906
|
|
|
scaler : float, optional |
907
|
|
|
The scaling applied to the parameter in case the step contains |
908
|
|
|
non-finite elements. Defaults to 1, which means that parameters |
909
|
|
|
will not be changed. |
910
|
|
|
|
911
|
|
|
Notes |
912
|
|
|
----- |
913
|
|
|
This rule should be applied last! |
914
|
|
|
|
915
|
|
|
This trick was originally used in the GroundHog_ framework. |
916
|
|
|
|
917
|
|
|
.. _GroundHog: https://github.com/lisa-groundhog/GroundHog |
918
|
|
|
|
919
|
|
|
""" |
920
|
|
|
def __init__(self, scaler=1): |
921
|
|
|
self.scaler = scaler |
922
|
|
|
|
923
|
|
|
def compute_step(self, parameter, previous_step): |
924
|
|
|
step_sum = tensor.sum(previous_step) |
925
|
|
|
not_finite = (tensor.isnan(step_sum) + |
926
|
|
|
tensor.isinf(step_sum)) |
927
|
|
|
step = tensor.switch( |
928
|
|
|
not_finite > 0, (1 - self.scaler) * parameter, previous_step) |
929
|
|
|
return step, [] |
930
|
|
|
|
931
|
|
|
|
932
|
|
|
class Restrict(StepRule): |
933
|
|
|
"""Applies a given :class:`StepRule` only to certain variables. |
934
|
|
|
|
935
|
|
|
Example applications include clipping steps on only certain parameters, |
936
|
|
|
or scaling a certain kind of parameter's updates (e.g. adding an |
937
|
|
|
additional scalar multiplier to the steps taken on convolutional |
938
|
|
|
filters). |
939
|
|
|
|
940
|
|
|
Parameters |
941
|
|
|
---------- |
942
|
|
|
step_rule : :class:`StepRule` |
943
|
|
|
The :class:`StepRule` to be applied on the given variables. |
944
|
|
|
variables : iterable |
945
|
|
|
A collection of Theano variables on which to apply `step_rule`. |
946
|
|
|
Variables not appearing in this collection will not have |
947
|
|
|
`step_rule` applied to them. |
948
|
|
|
|
949
|
|
|
""" |
950
|
|
|
def __init__(self, step_rule, variables): |
951
|
|
|
self.step_rule = step_rule |
952
|
|
|
self.variables = frozenset(variables) |
953
|
|
|
|
954
|
|
|
def compute_steps(self, previous_steps): |
955
|
|
|
filtered_previous_steps = dict_subset(previous_steps, self.variables) |
956
|
|
|
steps, updates = self.step_rule.compute_steps(filtered_previous_steps) |
957
|
|
|
actual = OrderedDict((parameter, steps[parameter]) |
958
|
|
|
if parameter in steps |
959
|
|
|
else (parameter, previous_steps[parameter]) |
960
|
|
|
for parameter in previous_steps) |
961
|
|
|
return actual, updates |
962
|
|
|
|
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.