Completed
Pull Request — master (#1079)
by David
04:58
created

GradientDescent.add_updates()   A

Complexity

Conditions 3

Size

Total Lines 16

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 3
dl 0
loc 16
rs 9.4285
1
"""Training algorithms."""
0 ignored issues
show
Bug introduced by
There seems to be a cyclic import (blocks.bricks.base -> blocks.graph -> blocks.graph.bn -> blocks.filter).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create an algorithm buffer tied to a given parameter.

    The buffer is built by :func:`shared_floatx_zeros_matching`; any extra
    positional or keyword arguments are forwarded to it unchanged.

    Parameters
    ----------
    param : shared variable
        The parameter the buffer belongs to. It is recorded on the
        buffer as ``tag.for_parameter``.

    Returns
    -------
    buffer : shared variable
        The new buffer, tagged with the ``ALGORITHM_BUFFER`` role.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Remember which parameter this buffer serves before assigning its role.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm is simple: :meth:`initialize`
    is called first (this is the stage at which, for instance, Theano
    functions can be compiled), and afterwards :meth:`process_batch` is
    called repeatedly, each time with one batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
# Template used when the batch provides sources that no Theano input uses.
# It is formatted with `sources` and `variables` and appended to a short
# summary, hence the two leading newlines.
variable_mismatch_error = (
    "\n\n"
    "Blocks tried to match the sources ({sources}) of the training "
    "dataset to the names of the Theano variables ({variables}), "
    "but failed to do so. "
    "If you want to train on a subset of the sources that your dataset "
    "provides, pass the `sources` keyword argument to its constructor. "
    "Or pass on_unused_sources='warn' or on_unused_sources='ignore' to "
    "the GradientDescent algorithm."
)

# Template used when some Theano input has no matching source in the batch.
source_missing_error = (
    "\n\n"
    "Blocks didn't find all the sources ({sources}) of the training "
    "dataset that match the names of the Theano variables ({variables})."
)
72
73
74
class GradientDescent(TrainingAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients`.
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : dict, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the gradient
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources are used.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.

    Attributes
    ----------
    gradients : dict
        The gradient dictionary. When the gradients are computed by this
        class, it is an :class:`~collections.OrderedDict` whose order
        follows `parameters`.
    step_rule : instance of :class:`StepRule`
        The step rule.
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    .. todo::

       Some shared variables are not parameters (e.g. those created by
       random streams).

    .. todo::

       Due to a rather premature status of the :class:`ComputationGraph`
       class the parameter used only inside scans are not fetched
       currently.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 on_unused_sources='raise', theano_func_kwargs=None, **kwargs):
        super(GradientDescent, self).__init__(**kwargs)
        self.cost = cost
        self._cost_computation_graph = ComputationGraph(self.cost)
        self._updates = []
        self.parameters = parameters

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            # An OrderedDict keeps the gradients in the order in which the
            # parameters were given, so every expression derived from them
            # is built deterministically.
            self.gradients = OrderedDict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        if self.parameters is None:
            # As documented: without explicit parameters, tune exactly
            # the variables for which gradients were supplied.
            self.parameters = list(self.gradients.keys())
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())

    def initialize(self):
        """Compile the Theano function that performs one training step.

        The compiled function applies, in this order, the user-supplied
        `updates`, the parameter updates ``parameter - step`` and the
        updates requested by the step rule.

        """
        logger.info("Initializing the training algorithm")
        # Work on a copy: appending to `self.updates` itself would leave
        # the parameter updates in the user-visible list and duplicate
        # them if `initialize` is called again.
        all_updates = list(self.updates)
        # Note: the gradients are computed in the same order in which
        # the parameters were given. Keep it like that to ensure
        # reproducibility.
        for parameter in self.parameters:
            all_updates.append((parameter, parameter - self.steps[parameter]))
        all_updates += self.step_rule_updates
        self._function = theano.function(
            self.inputs, [], updates=all_updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    def _validate_source_names(self, batch):
        """Check that the batch sources match the names of the inputs.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        Raises
        ------
        ValueError
            If an input has no corresponding source in the batch, if the
            batch has unused sources and `on_unused_sources` is 'raise',
            or if `on_unused_sources` has an unsupported value.

        """
        in_names = [v.name for v in self.inputs]
        input_names = set(in_names)
        source_names = set(batch.keys())

        if not input_names.issubset(source_names):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not source_names.issubset(input_names):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Warn only once per algorithm instance.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Run one training step on a batch.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs. The data are passed
            to the compiled function in the order of :attr:`inputs`.

        """
        self._validate_source_names(batch)
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)

    @property
    def inputs(self):
        """Return inputs of the cost computation graph.

        Returns
        -------
        inputs : list of :class:`~tensor.TensorVariable`
            Inputs to this graph.

        """
        return self._cost_computation_graph.inputs

    @property
    def updates(self):
        """The list of additional updates applied for every batch."""
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError("updates must be a list of tuples or an "
                             "OrderedDict, got {}".format(type(updates)))
        self.updates.extend(updates)
271
272
273
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # Accumulate directly rather than transposing the list of
        # (step, updates) pairs: transposing an empty list cannot be
        # unpacked into two values, so an empty `previous_steps` used to
        # fail here instead of yielding empty results.
        steps = OrderedDict()
        updates = []
        for parameter, previous_step in previous_steps.items():
            step, parameter_updates = self.compute_step(parameter,
                                                        previous_step)
            steps[parameter] = step
            updates.extend(parameter_updates)
        return steps, updates
337
338
339
class CompositeRule(StepRule):
    """Chains several step rules.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Feed the steps through every component rule in turn.

        The output steps of one rule become the input of the next one;
        the updates of all rules are concatenated in application order.

        Returns
        -------
        steps : dict-like
            The steps produced by the last component (or `previous_steps`
            itself when there are no components).
        updates : list
            The collected updates of all components.

        """
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            current_steps, component_updates = component.compute_steps(
                current_steps)
            collected_updates += component_updates
        return current_steps, collected_updates
359
360
361
class Scale(StepRule):
    """A step in the direction proportional to the previous step.

    If used in :class:`GradientDescent` alone, this step rule implements
    steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.learning_rate = rate

    def compute_step(self, parameter, previous_step):
        """Return the previous step multiplied by the learning rate.

        This rule is stateless, so the list of updates is always empty.

        """
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
385
386
387
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)
        self.momentum = coefficient

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity to the previous step.

        Returns
        -------
        step : Theano variable
            ``momentum * velocity + previous_step``.
        updates : list
            A single update storing the new step in the velocity buffer.

        """
        velocity_buffer = _create_algorithm_buffer_for(parameter, "velocity")
        new_step = self.momentum * velocity_buffer + previous_step
        return new_step, [(velocity_buffer, new_step)]
411
412
413
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # Scaling is applied first, then the momentum accumulation.
        self.components = [scaling_rule, momentum_rule]
        # Expose the hyperparameters of the components on the composite.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
444
445
446
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if decay_rate < 0.0 or decay_rate > 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Rescale the previous step by a ratio of two running RMS values.

        Two buffers per parameter hold running averages of the squared
        incoming steps and of the squared produced steps.

        Returns
        -------
        step : Theano variable
            The previous step multiplied by the RMS of past produced steps
            and divided by the RMS of incoming steps.
        updates : list
            Updates for the two running-average buffers.

        """
        step_sq_avg_old = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        delta_sq_avg_old = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Running average of the squared incoming steps.
        step_sq_avg_new = (
            self.decay_rate * step_sq_avg_old +
            (1 - self.decay_rate) * tensor.sqr(previous_step)
        )

        # The numerator uses the *old* average of the produced steps.
        rms_delta_old = tensor.sqrt(delta_sq_avg_old + self.epsilon)
        rms_step_new = tensor.sqrt(step_sq_avg_new + self.epsilon)
        delta = rms_delta_old / rms_step_new * previous_step

        # Running average of the squared produced steps.
        delta_sq_avg_new = (
            self.decay_rate * delta_sq_avg_old +
            (1 - self.decay_rate) * tensor.sqr(delta)
        )

        buffer_updates = [(step_sq_avg_old, step_sq_avg_new),
                          (delta_sq_avg_old, delta_sq_avg_new)]
        return delta, buffer_updates
496
497
498
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if decay_rate < 0.0 or decay_rate > 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Lower bound on the RMS, so the scaling never exceeds max_scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the previous step by the running RMS of recent steps.

        Returns
        -------
        step : Theano variable
            The previous step divided by the RMS, the latter being
            bounded from below by ``1 / max_scaling``.
        updates : list
            A single update for the running mean-square buffer.

        """
        mean_square_old = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        mean_square_new = (
            self.decay_rate * mean_square_old +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        rms = tensor.maximum(tensor.sqrt(mean_square_new), self.epsilon)
        normalized_step = previous_step / rms
        return normalized_step, [(mean_square_old, mean_square_new)]
544
545
546
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # Normalization comes first, then the learning-rate scaling.
        self.components = [normalizing_rule, scaling_rule]
        # Expose the hyperparameters of the components on the composite.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalizing_rule.decay_rate
586
587
588
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used. The
        attribute only exists when a threshold was given.

    """
    def __init__(self, threshold=None):
        # NOTE: a falsy threshold (``None`` and also ``0``) disables
        # clipping; the attribute is then deliberately left unset.
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all steps jointly so their total L2 norm is bounded.

        Parameters
        ----------
        previous_steps : OrderedDict
            A mapping from parameters to their proposed steps.

        Returns
        -------
        steps : OrderedDict
            The (possibly rescaled) steps. `previous_steps` itself is
            returned when no threshold is set.
        updates : list
            Always empty, this rule is stateless.

        """
        if not hasattr(self, 'threshold'):
            # Callers always unpack ``(steps, updates)``, so the
            # pass-through case has to return a pair as well.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
622
623
624
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axes = () if axis is None else pack(axis)
        self.axis = set(axes)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # A set drops duplicates, so a size mismatch reveals repeated axes.
        if len(self.axis) != len(axes):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        """Return a step after which the parameter norm is bounded.

        Raises
        ------
        ValueError
            If one of the configured axes is not smaller than the number
            of dimensions of `previous_step`.

        """
        ndim = previous_step.ndim
        for ax in self.axis:
            if ax >= ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, previous_step.ndim))
        # The value the parameter would take after the unclipped update.
        updated_value = parameter - previous_step
        if not self.axis:
            norms = l2_norm([updated_value])
        else:
            # Sum the squares over each requested axis in ascending order,
            # keeping dimensions so the norms broadcast against the value.
            summed_squares = tensor.sqr(updated_value)
            for ax in sorted(self.axis):
                summed_squares = summed_squares.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(summed_squares)
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) * updated_value)
        clipped_step = tensor.switch(norms > self.threshold,
                                     shrinking_step,
                                     previous_step)
        return clipped_step, ()
699
700
701
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        # Both hyperparameters are stored as shared variables and tagged
        # with the algorithm-hyperparameter role.
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the AdaGrad step for a single parameter.

        Parameters
        ----------
        parameter : variable
            The parameter the step is computed for.
        previous_step : variable
            The step proposed so far for `parameter`.

        Returns
        -------
        step : variable
            `previous_step` times the learning rate, divided by the
            square root of the accumulated squared steps plus epsilon.
        updates : list
            A single pair that stores the new accumulated sum of
            squares in its buffer.

        """
        # The buffer name carries the parameter name when there is one.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        # Presumably a buffer shaped like the parameter -- confirm
        # against ``_create_algorithm_buffer_for``.
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # The running sum of squares includes the current step before it
        # is used for scaling.
        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
742
743
744
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Weight given to the incoming step when the running mean is
        refreshed on the first update; the stored mean receives the
        remainder.
        Default value is set to 0.1.
    beta2 : float, optional
        Weight given to the squared incoming step when the running
        second moment is refreshed; the stored value receives the
        remainder.
        Default value is set to 0.001.
    epsilon : float, optional
        Constant added to the square root of the running second moment
        in the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Raised to the number of updates already performed and multiplied
        into ``1 - beta1`` when the first-moment weight is computed.
        Default value is set to 1 - 1e-8.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        settings = (("learning_rate", learning_rate),
                    ("beta1", beta1),
                    ("beta2", beta2),
                    ("epsilon", epsilon),
                    ("decay_factor", decay_factor))
        # Every hyperparameter becomes a shared variable stored under an
        # attribute of the same name ...
        for label, value in settings:
            setattr(self, label, shared_floatx(value, label))
        # ... and is then tagged with the algorithm-hyperparameter role.
        for label, _ in settings:
            add_role(getattr(self, label), ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the Adam step for a single parameter.

        Parameters
        ----------
        parameter : variable
            The parameter the step is computed for.
        previous_step : variable
            The step proposed so far for `parameter`.

        Returns
        -------
        step : variable
            The rescaled running mean divided by the square root of the
            running second moment plus epsilon.
        updates : list
            Pairs that store the new running mean, the new running
            second moment and the incremented update counter.

        """
        first_moment = _create_algorithm_buffer_for(parameter, 'mean')
        second_moment = _create_algorithm_buffer_for(parameter, 'variance')
        # A fresh update counter is created on every call and tagged as
        # an algorithm buffer.
        elapsed = shared_floatx(0., 'time')
        add_role(elapsed, ALGORITHM_BUFFER)

        upcoming = elapsed + 1

        # Step size with the correction terms that depend on the number
        # of updates performed so far.
        numerator = tensor.sqrt((1. - (1. - self.beta2)**upcoming))
        denominator = (1. - (1. - self.beta1)**upcoming)
        scaled_rate = self.learning_rate * numerator / denominator

        # Weight of the incoming step in the running mean.
        mixing = 1 - (1 - self.beta1) * self.decay_factor ** (upcoming - 1)

        new_first = mixing * previous_step + (1. - mixing) * first_moment
        new_second = (self.beta2 * tensor.sqr(previous_step) +
                      (1. - self.beta2) * second_moment)

        step = (scaled_rate * new_first /
                (tensor.sqrt(new_second) + self.epsilon))

        return step, [(first_moment, new_first),
                      (second_moment, new_second),
                      (elapsed, upcoming)]
802
803
804
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    When the sum over a step (the parameter update of a single shared
    variable) is ``NaN`` or ``inf``, the step is replaced by
    ``(1 - scaler) * parameter``; otherwise it is passed through
    unchanged.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which makes the replacement
        step zero, so that parameters will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Return `previous_step`, or a rescaling step if it is not finite.

        Parameters
        ----------
        parameter : variable
            The parameter the step is computed for.
        previous_step : variable
            The step proposed so far for `parameter`.

        Returns
        -------
        step : variable
            The selected step.
        updates : list
            Always empty.

        """
        # Finiteness is judged on the sum of the whole step, not on each
        # element separately.
        total = tensor.sum(previous_step)
        flags = tensor.isnan(total) + tensor.isinf(total)
        is_bad = flags > 0
        fallback = (1 - self.scaler) * parameter
        return tensor.switch(is_bad, fallback, previous_step), []
837
838
839
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Typical uses are clipping the steps of a handful of parameters, or
    rescaling the updates of one kind of parameter (for instance an
    extra scalar multiplier on the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        # Stored as a frozenset.
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Run the wrapped rule on the selected variables only.

        Parameters
        ----------
        previous_steps : dict
            Mapping from parameters to the steps proposed so far.

        Returns
        -------
        steps : OrderedDict
            One entry per key of `previous_steps`, in the same
            iteration order. Entries returned by the wrapped rule
            replace the incoming ones; all others are passed through.
        updates : object
            The updates returned by the wrapped rule, unchanged.

        """
        selected = dict_subset(previous_steps, self.variables)
        restricted, updates = self.step_rule.compute_steps(selected)

        merged = OrderedDict()
        for parameter in previous_steps:
            if parameter in restricted:
                merged[parameter] = restricted[parameter]
            else:
                merged[parameter] = previous_steps[parameter]
        return merged, updates
869