Completed
Pull Request — master (#1079)
by David
05:37 queued 34s
created

UpdatesAlgorithm   A

Complexity

Total Complexity 9

Size/Duplication

Total Lines 48
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 48
rs 10
wmc 9

4 Methods

Rating   Name   Duplication   Size   Complexity  
A __init__() 0 6 3
A initialize() 0 3 1
A updates() 0 3 1
A add_updates() 0 16 3
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create an algorithm buffer associated with a parameter.

    The buffer is built by :func:`shared_floatx_zeros_matching` from
    `param`, remembers the parameter it was created for in its tag and
    is given the `ALGORITHM_BUFFER` role.

    Parameters
    ----------
    param : shared variable
        The parameter the buffer is created for.
    args, kwargs
        Extra positional and keyword arguments, forwarded unchanged to
        :func:`shared_floatx_zeros_matching`.

    Returns
    -------
    The newly created buffer.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Keep a back-reference so the owning parameter can be recovered
    # from the buffer alone.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm is simple: first
    :meth:`initialize` is called once (this is where, for instance,
    Theano functions can be compiled), and afterwards
    :meth:`process_batch` is called repeatedly, each time with one
    batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
# Message templates used when the sources of a batch do not line up with
# the names of the Theano input variables. Both templates start with two
# newlines and expect the `sources` and `variables` format fields.
variable_mismatch_error = (
    "\n\n"
    "Blocks tried to match the sources ({sources}) of the training dataset to "
    "the names of the Theano variables ({variables}), but failed to do so. "
    "If you want to train on a subset of the sources that your dataset "
    "provides, "
    "pass the `sources` keyword argument to its constructor. Or pass "
    "on_unused_sources='warn' or on_unused_sources='ignore' to "
    "the GradientDescent algorithm.")

source_missing_error = (
    "\n\n"
    "Blocks didn't find all the sources ({sources}) of the training dataset "
    "that match the names of the Theano variables ({variables}).")
72
73
74
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed. An
        :class:`~collections.OrderedDict` is converted to a list of
        (variable, new value) pairs so that more updates can be appended
        to it later.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.

    Notes
    -----
    :meth:`initialize` reads ``self.inputs``, which this class does not
    set itself; it has to be assigned before :meth:`initialize` is
    called.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 **kwargs):
        if updates is None:
            updates = []
        elif isinstance(updates, OrderedDict):
            # The updates are later grown with list operations (`extend`,
            # `append`, `+=`), which an OrderedDict does not support.
            updates = list(updates.items())
        self.updates = updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Compile the Theano function that performs the updates."""
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)

    @property
    def updates(self):
        """The updates that will be passed to `theano.function`."""
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done *before* the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError(
                "updates must be a list of tuples or an OrderedDict, "
                "got {}".format(type(updates).__name__))
        self.updates.extend(updates)
122
123
124
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients`.
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : dict, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the gradient
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources are used.

    Attributes
    ----------
    gradients : dict
        The gradient dictionary. When the gradients are inferred from the
        cost, this is an :class:`~collections.OrderedDict` that follows
        the order of `parameters`.
    step_rule : instance of :class:`StepRule`
        The step rule.
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    .. todo::

       Some shared variables are not parameters (e.g. those created by
       random streams).

    .. todo::

       Due to a rather premature status of the :class:`ComputationGraph`
       class the parameter used only inside scans are not fetched
       currently.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 on_unused_sources='raise', **kwargs):
        super(GradientDescent, self).__init__(**kwargs)
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            if self.cost is None:
                raise ValueError("can't infer gradients; no cost specified")
            elif self.parameters is None or len(self.parameters) == 0:
                raise ValueError("can't infer gradients; "
                                 "no parameters specified")
            self.inputs = ComputationGraph(cost).inputs
            logger.info("Taking the cost gradient")
            # An OrderedDict keeps the gradients in the order in which the
            # parameters were given, independently of the Python version.
            self.gradients = OrderedDict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            # If we have gradients, we get parameters from that.
            # If you're specifying both then something is screwy.
            if self.parameters is not None:
                logger.warning(
                    '{} received both gradients and parameters '
                    'arguments; using parameters deduced from '
                    'gradients'.format(self.__class__.__name__))
            gradients_dict = dict(gradients)
            self.parameters = list(gradients_dict.keys())
            self.inputs = ComputationGraph(gradients_dict.values()).inputs
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources

    def initialize(self):
        """Add the parameter and step rule updates, then compile."""
        logger.info("Initializing the training algorithm")
        # Note: the gradients are computed in the same order in which
        # the parameters were given. Keep it like that to ensure
        # reproducibility.
        for parameter in self.parameters:
            self.updates.append((parameter, parameter - self.steps[parameter]))
        self.updates += self.step_rule_updates
        super(GradientDescent, self).initialize()
        logger.info("The training algorithm is initialized")

    def _validate_source_names(self, batch):
        """Check the sources of a batch against the input variable names.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        Raises
        ------
        ValueError
            If an input variable has no source of the same name in the
            batch; if the batch has sources that no input variable uses
            and `on_unused_sources` is 'raise'; or if `on_unused_sources`
            has an unsupported value.

        """
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Warn only once per algorithm instance.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch and run the compiled function on it.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
        self._validate_source_names(batch)
        # The compiled function takes its arguments positionally, in the
        # order of `self.inputs`.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
291
292
293
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`. Empty if `previous_steps` is empty.
        updates : list
            A list of tuples representing updates to be performed.

        """
        steps = OrderedDict()
        updates = []
        # Accumulating in a loop (rather than unzipping the per-parameter
        # results) also works when there are no parameters at all.
        for parameter, previous_step in previous_steps.items():
            step, step_updates = self.compute_step(parameter, previous_step)
            steps[parameter] = step
            updates.extend(step_updates)
        return steps, updates
357
358
359
class CompositeRule(StepRule):
    """Applies a sequence of step rules one after another.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. They are applied in the order
        in which they are given: each rule receives the steps produced
        by the rule before it.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Feed the steps through every component and gather the updates."""
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            current_steps, component_updates = component.compute_steps(
                current_steps)
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
379
380
381
class Scale(StepRule):
    """Multiplies the previous step by a learning rate.

    If used in :class:`GradientDescent` alone, this step rule implements
    steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.learning_rate = rate

    def compute_step(self, parameter, previous_step):
        """Return the scaled step; this rule needs no updates."""
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
405
406
407
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, *e.g.* :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)
        self.momentum = coefficient

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity to the step and store the result."""
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        accumulated_step = self.momentum * velocity + previous_step
        # The step taken now becomes the velocity of the next iteration.
        return accumulated_step, [(velocity, accumulated_step)]
431
432
433
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # The step is scaled first and accumulated afterwards.
        self.components = [scaling_rule, momentum_rule]
        # Expose the hyperparameters of the components directly.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
464
465
466
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the AdaDelta step and the updates of its two buffers."""
        # Running averages carried over from the previous iteration.
        avg_sq_step_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        avg_sq_delta_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Running average of the squared incoming steps.
        avg_sq_step = (self.decay_rate * avg_sq_step_prev +
                       (1 - self.decay_rate) * tensor.sqr(previous_step))

        # Rescale the incoming step by the ratio of the two RMS values.
        rms_delta_prev = tensor.sqrt(avg_sq_delta_prev + self.epsilon)
        rms_step = tensor.sqrt(avg_sq_step + self.epsilon)
        delta = rms_delta_prev / rms_step * previous_step

        # Running average of the squared steps actually taken.
        avg_sq_delta = (self.decay_rate * avg_sq_delta_prev +
                        (1 - self.decay_rate) * tensor.sqr(delta))

        buffer_updates = [(avg_sq_step_prev, avg_sq_step),
                          (avg_sq_delta_prev, avg_sq_delta)]
        return delta, buffer_updates
516
517
518
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, *e.g.* :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used *before* other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Lower bound for the RMS: dividing by it scales the step by at
        # most `max_scaling`.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the step by the running RMS of the recent steps."""
        avg_sq_step_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        avg_sq_step = (self.decay_rate * avg_sq_step_prev +
                       (1 - self.decay_rate) * tensor.sqr(previous_step))
        rms_step = tensor.maximum(tensor.sqrt(avg_sq_step), self.epsilon)
        normalized_step = previous_step / rms_step
        return normalized_step, [(avg_sq_step_prev, avg_sq_step)]
564
565
566
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # The step is normalized first and scaled afterwards.
        self.components = [normalizing_rule, scaling_rule]
        # Expose the hyperparameters of the components directly.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalizing_rule.decay_rate
606
607
608
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None`` (or any other false value, such as 0), no rescaling
        will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used. The
        attribute only exists when a threshold was given.

    """
    def __init__(self, threshold=None):
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all steps jointly so their L2 norm respects the threshold.

        Returns
        -------
        steps : OrderedDict
            The rescaled steps, or `previous_steps` itself when no
            threshold is set.
        updates : list
            Always an empty list; this rule has no state to update.

        """
        if not hasattr(self, 'threshold'):
            # Return a (steps, updates) pair here as well, so that callers
            # can unpack the result no matter whether clipping is enabled.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
642
643
644
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        if axis is None:
            axis = ()
        else:
            axis = pack(axis)
        self.axis = set(axis)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # A shorter set means that some axis was listed more than once.
        if len(self.axis) != len(axis):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        """Build the step that keeps the updated value within the norm."""
        for ax in self.axis:
            if ax >= previous_step.ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, previous_step.ndim))
        # The value the parameter would take after a plain update.
        updated_value = parameter - previous_step
        if not self.axis:
            norms = l2_norm([updated_value])
        else:
            # Sum the squares over each requested axis in turn, keeping
            # the dimensions so the norms broadcast against the value.
            summed_squares = tensor.sqr(parameter - previous_step)
            for ax in sorted(self.axis):
                summed_squares = summed_squares.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(summed_squares)
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) *
                          (parameter - previous_step))
        clipped_step = tensor.switch(norms > self.threshold,
                                     shrinking_step,
                                     previous_step)
        return clipped_step, ()
719
720
721
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Scale the step by the root of the accumulated squared steps."""
        # Name the buffer after the parameter when the parameter has a name.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # Sum of squares including the current step.
        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
762
763
764
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Constant added to the square root of the second moment estimate
        in the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Base of the power, growing with the iteration count, that
        multiplies ``1 - beta1`` when the first moment is mixed.
        Default value is set to 1 - 1e-8.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        # Every hyperparameter becomes a shared variable tagged with the
        # ALGORITHM_HYPERPARAMETER role.
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        self.beta1 = shared_floatx(beta1, "beta1")
        add_role(self.beta1, ALGORITHM_HYPERPARAMETER)
        self.beta2 = shared_floatx(beta2, "beta2")
        add_role(self.beta2, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
        add_role(self.decay_factor, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the Adam step for a single parameter.

        Parameters
        ----------
        parameter
            The parameter the step is computed for. Two buffers matching
            it hold the first and second moment estimates.
        previous_step
            The incoming step for `parameter`.

        Returns
        -------
        step
            The step expression built from the updated moment estimates.
        updates : list
            ``(buffer, new_value)`` pairs for the first moment, the
            second moment and the iteration counter, in that order.

        """
        first_moment = _create_algorithm_buffer_for(parameter, 'mean')
        second_moment = _create_algorithm_buffer_for(parameter, 'variance')
        # A separate counter is created on every call of this method.
        counter = shared_floatx(0., 'time')
        add_role(counter, ALGORITHM_BUFFER)

        next_count = counter + 1

        # Step size rescaled by the two correction terms, both of which
        # depend on the iteration count.
        second_correction = tensor.sqrt((1. - (1. - self.beta2)**next_count))
        first_correction = (1. - (1. - self.beta1)**next_count)
        scaled_rate = (self.learning_rate * second_correction /
                       first_correction)

        # Weight given to the incoming step when mixing the first moment.
        mix = 1 - (1 - self.beta1) * self.decay_factor ** (next_count - 1)
        first_moment_t = mix * previous_step + (1. - mix) * first_moment
        second_moment_t = (self.beta2 * tensor.sqr(previous_step) +
                           (1. - self.beta2) * second_moment)

        step = (scaled_rate * first_moment_t /
                (tensor.sqrt(second_moment_t) + self.epsilon))

        return step, [(first_moment, first_moment_t),
                      (second_moment, second_moment_t),
                      (counter, next_count)]
822
823
824
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    When the sum of a step (the parameter update of a single shared
    variable) is ``NaN`` or ``inf``, the step is replaced by
    ``(1 - scaler) * parameter``; otherwise it is passed through
    unchanged.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which makes the replacement
        step zero, so that parameters will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Return `previous_step`, or a rescaling step if it is not finite.

        The check is made on the sum of all elements of `previous_step`.
        No updates are produced, so the second return value is an empty
        list.

        """
        total = tensor.sum(previous_step)
        found_nan = tensor.isnan(total)
        found_inf = tensor.isinf(total)
        # The two flags are added rather than or-ed; a positive sum means
        # at least one of them is set.
        is_broken = (found_nan + found_inf) > 0
        replacement = (1 - self.scaler) * parameter
        return tensor.switch(is_broken, replacement, previous_step), []
857
858
859
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Useful, for example, for clipping steps on only certain parameters,
    or for scaling the updates of one kind of parameter (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        # Stored as a frozenset, so duplicates and the original ordering
        # of `variables` are not kept.
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Apply the wrapped rule to the selected variables only.

        Parameters
        ----------
        previous_steps : dict
            Mapping from parameters to their incoming steps.

        Returns
        -------
        actual : :class:`~collections.OrderedDict`
            One entry per key of `previous_steps`, in the same iteration
            order. It holds the step produced by the wrapped rule when
            there is one for that parameter, and the incoming step
            otherwise.
        updates
            The updates returned by the wrapped rule, passed through
            unchanged.

        """
        selected = dict_subset(previous_steps, self.variables)
        restricted_steps, updates = self.step_rule.compute_steps(selected)

        actual = OrderedDict()
        for parameter in previous_steps:
            if parameter in restricted_steps:
                actual[parameter] = restricted_steps[parameter]
            else:
                actual[parameter] = previous_steps[parameter]
        return actual, updates
889