Completed
Pull Request — master (#1016)
by David
01:35
created

blocks.algorithms.Adam.compute_step()   A

Complexity

Conditions 1

Size

Total Lines 22

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 22
rs 9.2
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create an algorithm buffer associated with a parameter.

    The buffer is built by :func:`shared_floatx_zeros_matching`; every
    extra positional and keyword argument is forwarded to it unchanged.

    Parameters
    ----------
    param : shared variable
        The parameter the buffer is created for.

    Returns
    -------
    The new shared variable, tagged with ``for_parameter = param`` and
    given the :const:`ALGORITHM_BUFFER` role.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Remember which parameter this buffer belongs to.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm object is simple:
    :meth:`initialize` is called first (this is where, for instance,
    Theano functions can be compiled), after which :meth:`process_batch`
    is called repeatedly, each time with one batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Prepare the training algorithm for use."""

    @abstractmethod
    def process_batch(self, batch):
        """Consume one batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
class DifferentiableCostMinimizer(TrainingAlgorithm):
    """Minimizes a differentiable cost given as a Theano expression.

    Very often the goal of training is to minimize the expected value of a
    Theano expression. Batch processing in this case typically consists of
    running a (or a few) Theano functions.
    :class:`DifferentiableCostMinimizer` is the base class for such
    algorithms.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`
        The objective to be minimized.
    parameters : list of :class:`~tensor.TensorSharedVariable`
        The parameters to be tuned.

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.
    cost : :class:`~tensor.TensorVariable`
        The objective to be minimized.
    parameters : list of :class:`~tensor.TensorSharedVariable`
        The parameters to be tuned.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    .. todo::

       Some shared variables are not parameters (e.g. those created by
       random streams).

    .. todo::

       Due to a rather premature status of the :class:`ComputationGraph`
       class the parameter used only inside scans are not fetched
       currently.

    """
    def __init__(self, cost, parameters):
        self.cost = cost
        self.parameters = parameters
        self._cost_computation_graph = ComputationGraph(self.cost)
        self._updates = []

    @property
    def inputs(self):
        """Return inputs of the cost computation graph.

        Returns
        -------
        inputs : list of :class:`~tensor.TensorVariable`
            Inputs to this graph.

        """
        return self._cost_computation_graph.inputs

    @property
    def updates(self):
        """The list of updates registered so far."""
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            # Same exception type as before, but say what went wrong.
            raise ValueError(
                "updates must be a list of tuples or an OrderedDict, "
                "got {}".format(type(updates).__name__))
        self.updates.extend(updates)
144
145
146
# Message templates used when the sources of a batch and the names of the
# Theano input variables do not line up. Both are filled in with
# ``.format(sources=..., variables=...)``.
variable_mismatch_error = (
    "\n\n"
    "Blocks tried to match the sources ({sources}) of the training dataset "
    "to "
    "the names of the Theano variables ({variables}), but failed to do so. "
    "If you want to train on a subset of the sources that your dataset "
    "provides, "
    "pass the `sources` keyword argument to its constructor. Or pass "
    "on_unused_sources='warn' or on_unused_sources='ignore' to "
    "the GradientDescent algorithm."
)

source_missing_error = (
    "\n\n"
    "Blocks didn't find all the sources ({sources}) of the training dataset "
    "that match the names of the Theano variables ({variables})."
)
159
160
161
class GradientDescent(DifferentiableCostMinimizer):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : dict, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the gradient
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources are used.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.

    Attributes
    ----------
    gradients : dict
        The gradient dictionary. When the gradients are derived
        automatically this is an :class:`~collections.OrderedDict` that
        follows the order of `parameters`.
    step_rule : instance of :class:`StepRule`
        The step rule.

    """
    def __init__(self, step_rule=None, gradients=None, known_grads=None,
                 consider_constant=None, on_unused_sources='raise',
                 theano_func_kwargs=None, **kwargs):
        if gradients:
            # Materialize the keys: on Python 3 `keys()` is only a view,
            # while `parameters` is documented to be a list.
            kwargs.setdefault("parameters", list(gradients.keys()))
        super(GradientDescent, self).__init__(**kwargs)

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            # An ordered mapping keeps the gradients in the order in which
            # the parameters were given, which keeps runs reproducible.
            self.gradients = OrderedDict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())

    def initialize(self):
        """Compile the Theano function that performs one training step."""
        logger.info("Initializing the training algorithm")
        # Work on a copy: extending `self.updates` in place would make the
        # parameter updates pile up on every call to `initialize`.
        all_updates = list(self.updates)
        # Note: the gradients are computed in the same order in which
        # the parameters were given. Keep it like that to ensure
        # reproducibility.
        for parameter in self.parameters:
            all_updates.append((parameter, parameter - self.steps[parameter]))
        all_updates.extend(self.step_rule_updates)
        self._function = theano.function(
            self.inputs, [], updates=all_updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    def _validate_source_names(self, batch):
        """Check the batch sources against the names of the graph inputs.

        Raises
        ------
        ValueError
            If an input variable has no matching source, if there are
            unused sources and `on_unused_sources` is ``'raise'``, or if
            `on_unused_sources` has an unsupported value.

        """
        in_names = [v.name for v in self.inputs]
        variable_names = set(in_names)
        source_names = set(batch.keys())

        if not variable_names.issubset(source_names):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not source_names.issubset(variable_names):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Only warn once per algorithm instance.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch and run the compiled function on it."""
        self._validate_source_names(batch)
        # Feed the data in the order of the graph inputs.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
295
296
297
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # A plain loop also copes with an empty `previous_steps`, which
        # simply yields no steps and no updates.
        steps = OrderedDict()
        updates = []
        for parameter, previous_step in previous_steps.items():
            step, step_updates = self.compute_step(parameter, previous_step)
            steps[parameter] = step
            updates.extend(step_updates)
        return steps, updates
361
362
363
class CompositeRule(StepRule):
    """Applies a sequence of step rules one after another.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. They are applied in the order
        given, each one receiving the steps produced by the previous one.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            current_steps, component_updates = component.compute_steps(
                current_steps)
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
383
384
385
class Scale(StepRule):
    """Multiplies the previous step by a learning rate.

    Used alone in :class:`GradientDescent`, this step rule implements
    steepest descent.

    Parameters
    ----------
    learning_rate : float
        The factor by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.learning_rate = rate

    def compute_step(self, parameter, previous_step):
        scaled_step = self.learning_rate * previous_step
        # Stateless rule: there is nothing to update.
        return scaled_step, []
409
410
411
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included experience,
    look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)
        self.momentum = coefficient

    def compute_step(self, parameter, previous_step):
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        new_velocity = self.momentum * velocity + previous_step
        # The step taken is also what the velocity buffer is updated to.
        return new_velocity, [(velocity, new_velocity)]
435
436
437
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Chains a :class:`Scale` rule and a :class:`BasicMomentum` rule, in
    that order, to form the usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        self.components = [scaling_rule, momentum_rule]
        # Expose the hyperparameters of the components directly.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
468
469
470
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        def decayed_mean_square(previous_mean, quantity):
            # Running average of squares with exponential decay.
            return (self.decay_rate * previous_mean +
                    (1 - self.decay_rate) * tensor.sqr(quantity))

        def rms(mean_square):
            return tensor.sqrt(mean_square + self.epsilon)

        # Buffers holding the running averages from the previous iteration.
        step_mean_square_old = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        delta_mean_square_old = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        step_mean_square_new = decayed_mean_square(
            step_mean_square_old, previous_step)

        rms_delta_old = rms(delta_mean_square_old)
        rms_step_new = rms(step_mean_square_new)
        delta = rms_delta_old / rms_step_new * previous_step

        delta_mean_square_new = decayed_mean_square(
            delta_mean_square_old, delta)

        return delta, [(step_mean_square_old, step_mean_square_new),
                       (delta_mean_square_old, delta_mean_square_new)]
520
521
522
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included experience,
    look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        rate = shared_floatx(decay_rate, "decay_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.decay_rate = rate
        # Lower bound for the RMS, so the scaling never exceeds max_scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        old_mean_square = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        new_mean_square = (
            self.decay_rate * old_mean_square +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        root_mean_square = tensor.maximum(
            tensor.sqrt(new_mean_square), self.epsilon)
        normalized_step = previous_step / root_mean_square
        return normalized_step, [(old_mean_square, new_mean_square)]
568
569
570
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Chains a :class:`BasicRMSProp` rule and a :class:`Scale` rule, in that
    order, to form the step rule described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        self.components = [normalizing_rule, scaling_rule]
        # Expose the hyperparameters of the components directly.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalizing_rule.decay_rate
610
611
612
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None`` (or any other false value), no rescaling will be
        applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used. Only
        set when a threshold was given.

    """
    def __init__(self, threshold=None):
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all the steps jointly so their norm respects the limit.

        Returns
        -------
        steps : OrderedDict
            The (possibly rescaled) steps.
        updates : list
            Always empty, this rule keeps no state.

        """
        if not hasattr(self, 'threshold'):
            # No clipping requested: hand the steps back unchanged, but
            # still as a (steps, updates) pair like every other rule.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        # Leave the steps alone while the norm is below the threshold,
        # otherwise shrink them so that the norm equals the threshold.
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
646
647
648
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axis = pack(axis) if axis is not None else ()
        self.axis = set(axis)
        # Reject duplicated axes before allocating any shared variable.
        if len(axis) != len(self.axis):
            raise ValueError("axis must be unique")
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the step that keeps the updated parameter within bounds.

        Raises
        ------
        ValueError
            If one of the configured axes does not exist for
            `previous_step`.

        """
        ndim = previous_step.ndim
        # Negative axes are fine as long as they stay within -ndim.
        if any(not -ndim <= ax < ndim for ax in self.axis):
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
                self.axis, parameter, ndim))
        if len(self.axis) == 0:
            norms = l2_norm([parameter - previous_step])
        else:
            squares = tensor.sqr(parameter - previous_step)
            # Sum over every requested axis, keeping the dimensions so the
            # result still broadcasts against the parameter.
            norms = tensor.sqrt(
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
                       sorted(self.axis), squares))
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) *
                          (parameter - previous_step))
        step = tensor.switch(norms > self.threshold,
                             shrinking_step,
                             previous_step)
        # Updates are documented to be a list; this rule has none.
        return step, []
723
724
725
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
        stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        for hyperparameter in (self.learning_rate, self.epsilon):
            add_role(hyperparameter, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Name the buffer after the parameter when the parameter has a name.
        if parameter.name:
            buffer_name = 'adagrad_sqs' + '_' + parameter.name
        else:
            buffer_name = 'adagrad_sqs'
        sum_of_squares = _create_algorithm_buffer_for(
            parameter, name=buffer_name)

        # Accumulate the squared steps seen so far.
        new_sum_of_squares = (tensor.sqr(previous_step) + sum_of_squares)
        denominator = tensor.sqrt(new_sum_of_squares) + self.epsilon
        step = self.learning_rate * previous_step / denominator

        return step, [(sum_of_squares, new_sum_of_squares)]
766
767
768
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Stabilizing constant added to the square root of the second
        moment estimate in the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Base of the time-dependent decay applied to the first moment
        coefficient: at update ``t`` (starting from 1) the coefficient
        is ``1 - (1 - beta1) * decay_factor ** (t - 1)``.
        Default value is set to 1 - 1e-8.

    Notes
    -----
    In this implementation `beta1` and `beta2` are the weights given to
    the *newest* step and squared step respectively; the previous
    estimates are weighted by ``1 - beta1`` and ``1 - beta2``.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.beta1 = shared_floatx(beta1, "beta1")
        self.beta2 = shared_floatx(beta2, "beta2")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
        for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon,
                      self.decay_factor]:
            add_role(param, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the Adam step for a single parameter.

        Parameters
        ----------
        parameter : shared variable
            The parameter being updated; zero buffers matching it are
            created for the first and second moment estimates.
        previous_step : Theano variable
            The incoming step for this parameter.

        Returns
        -------
        step : Theano variable
            The Adam step.
        updates : list of tuples
            (buffer, new value) pairs for the two moment estimates and
            the time counter.

        """
        mean = _create_algorithm_buffer_for(parameter, 'mean')
        variance = _create_algorithm_buffer_for(parameter, 'variance')
        # A separate time counter is created on every call, i.e. one per
        # parameter this rule is applied to.
        time = shared_floatx(0., 'time')
        add_role(time, ALGORITHM_BUFFER)

        t1 = time + 1
        # Bias-corrected step size for update number t1.
        learning_rate = (self.learning_rate *
                         tensor.sqrt((1. - (1. - self.beta2)**t1)) /
                         (1. - (1. - self.beta1)**t1))
        # Weight of the newest step in the first moment estimate; it
        # changes over time through decay_factor ** (t1 - 1).
        beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1)
        mean_t = beta_1t * previous_step + (1. - beta_1t) * mean
        variance_t = (self.beta2 * tensor.sqr(previous_step) +
                      (1. - self.beta2) * variance)
        step = (learning_rate * mean_t /
                (tensor.sqrt(variance_t) + self.epsilon))

        updates = [(mean, mean_t),
                   (variance, variance_t),
                   (time, t1)]

        return step, updates
826
827
828
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    When the step for a shared variable is detected as non-finite
    (``inf`` or ``NaN``), it is replaced by a step that merely rescales
    the parameter; otherwise the step passes through untouched.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    The finiteness test is carried out on the sum of all elements of
    the step rather than element by element.

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Return the step, or a rescaling step if it is not finite.

        Returns a ``(step, updates)`` pair; this rule keeps no state, so
        `updates` is always an empty list.

        """
        total = tensor.sum(previous_step)
        # Non-zero exactly when the summed step is NaN or infinite.
        bad_flags = tensor.isnan(total) + tensor.isinf(total)
        # Assuming the step is subtracted from the parameter, this
        # fallback leaves the parameter at scaler * parameter.
        rescaling_step = (1 - self.scaler) * parameter
        chosen = tensor.switch(bad_flags > 0, rescaling_step, previous_step)
        return chosen, []
861
862
863
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Useful, for instance, to clip the steps of just a few parameters or
    to give one kind of parameter (e.g. convolutional filters) an extra
    scalar multiplier on its updates.

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Apply the wrapped rule to the selected variables only.

        Returns the ``(steps, updates)`` pair where `steps` has the same
        keys, in the same order, as `previous_steps`, and `updates` is
        whatever the wrapped rule returned.

        """
        restricted = dict_subset(previous_steps, self.variables)
        new_steps, updates = self.step_rule.compute_steps(restricted)

        # Walk the original mapping so that its ordering is preserved;
        # anything the wrapped rule did not handle keeps its old step.
        combined = OrderedDict()
        for parameter in previous_steps:
            if parameter in new_steps:
                combined[parameter] = new_steps[parameter]
            else:
                combined[parameter] = previous_steps[parameter]
        return combined, updates
893