Completed
Pull Request — master (#1079)
by David
04:45
created

GradientDescent.add_updates()   A

Complexity

Conditions 3

Size

Total Lines 16

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 3
dl 0
loc 16
rs 9.4285
1
"""Training algorithms."""
0 ignored issues
show
Bug introduced by
There seems to be a cyclic import (blocks.bricks.base -> blocks.graph -> blocks.graph.bn -> blocks.filter).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.simple -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create an algorithm buffer associated with a parameter.

    The buffer is built by :func:`shared_floatx_zeros_matching`, tagged
    with the parameter it belongs to and given the `ALGORITHM_BUFFER`
    role.

    Parameters
    ----------
    param : :class:`~tensor.TensorSharedVariable`
        The parameter the buffer is created for.
    \*args
        Positional arguments forwarded to
        :func:`shared_floatx_zeros_matching`.
    \*\*kwargs
        Keyword arguments forwarded to
        :func:`shared_floatx_zeros_matching`.

    Returns
    -------
    The newly created buffer variable.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Remember which parameter this buffer belongs to.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm is simple: :meth:`initialize`
    is called once (this is where e.g. Theano functions can be compiled),
    after which :meth:`process_batch` is called repeatedly, each time
    with one batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
variable_mismatch_error = """
60
61
Blocks tried to match the sources ({sources}) of the training dataset to \
62
the names of the Theano variables ({variables}), but failed to do so. \
63
If you want to train on a subset of the sources that your dataset provides, \
64
pass the `sources` keyword argument to its constructor. Or pass \
65
on_unused_sources='warn' or on_unused_sources='ignore' to \
66
the GradientDescent algorithm."""
67
68
source_missing_error = """
69
70
Blocks didn't find all the sources ({sources}) of the training dataset \
71
that match the names of the Theano variables ({variables})."""
72
73
74
class GradientDescent(TrainingAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients`.
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : dict, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the gradients
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources are used.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    \*\*kwargs
        Forwarded to the initializer of the parent class.

    Attributes
    ----------
    gradients : dict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    .. todo::

       Some shared variables are not parameters (e.g. those created by
       random streams).

    .. todo::

       Due to a rather premature status of the :class:`ComputationGraph`
       class the parameter used only inside scans are not fetched
       currently.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 on_unused_sources='raise', theano_func_kwargs=None, **kwargs):
        self.cost = cost
        self._cost_computation_graph = ComputationGraph(self.cost)
        self._updates = []
        # Infer the parameters from the gradient mapping when they are not
        # given explicitly. They must not be smuggled into `kwargs`: the
        # parent initializer does not accept a `parameters` argument.
        if parameters is None and gradients:
            parameters = list(gradients.keys())
        self.parameters = parameters
        super(GradientDescent, self).__init__(**kwargs)

        self.gradients = gradients
        if not self.gradients:
            logger.info("Taking the cost gradient")
            # An ordered mapping keeps the gradients in the order in which
            # the parameters were given, which step rules iterate over.
            self.gradients = OrderedDict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else {})

    def initialize(self):
        """Compile the Theano function performing one training step.

        The compiled function applies, in this order, the user-supplied
        `updates`, the parameter updates and the step rule updates.

        """
        logger.info("Initializing the training algorithm")
        # Work on a copy so that `self.updates` is left untouched and
        # calling `initialize` again does not duplicate any update.
        all_updates = list(self.updates)
        # Note: the gradients are computed in the same order in which
        # the parameters were given. Keep it like that to ensure
        # reproducibility.
        for parameter in self.parameters:
            all_updates.append((parameter, parameter - self.steps[parameter]))
        all_updates.extend(self.step_rule_updates)
        self._function = theano.function(
            self.inputs, [], updates=all_updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    def _validate_source_names(self, batch):
        """Check the sources of a batch against the input variable names.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        Raises
        ------
        ValueError
            If some input variable has no matching source, if there are
            unused sources and `on_unused_sources` is 'raise', or if
            `on_unused_sources` has an unsupported value.

        """
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Only warn once per algorithm instance.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Run the compiled training function on one batch.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs. The data is passed
            to the compiled function in the order of `inputs`, matched by
            variable name.

        """
        self._validate_source_names(batch)
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)

    @property
    def inputs(self):
        """Return inputs of the cost computation graph.

        Returns
        -------
        inputs : list of :class:`~tensor.TensorVariable`
            Inputs to this graph.

        """
        return self._cost_computation_graph.inputs

    @property
    def updates(self):
        """The list of additional updates performed for every batch."""
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError(
                "updates must be a list of tuples or an OrderedDict, "
                "got {}".format(type(updates).__name__))
        self.updates.extend(updates)
273
274
275
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`. Empty if `previous_steps` is empty.
        updates : list
            A list of tuples representing updates to be performed.

        """
        steps = OrderedDict()
        updates = []
        # Accumulate parameter by parameter rather than transposing the
        # list of results, so that an empty `previous_steps` simply yields
        # an empty result instead of failing to unpack.
        for parameter in previous_steps:
            step, step_updates = self.compute_step(
                parameter, previous_steps[parameter])
            steps[parameter] = step
            updates.extend(step_updates)
        return steps, updates
339
340
341
class CompositeRule(StepRule):
    """Applies a sequence of step rules one after another.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Feed the steps through every component rule in turn.

        The steps returned by one component are the input of the next
        one; the updates of all components are concatenated.

        """
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            current_steps, component_updates = component.compute_steps(
                current_steps)
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
361
362
363
class Scale(StepRule):
    """A step in the direction proportional to the previous step.

    If used in :class:`GradientDescent` alone, this step rule implements
    steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.learning_rate = rate

    def compute_step(self, parameter, previous_step):
        """Multiply the previous step by the learning rate.

        This rule is stateless, so the list of updates is empty.

        """
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
387
388
389
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)
        self.momentum = coefficient

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity to the previous step.

        The velocity is kept in an algorithm buffer created for
        `parameter`; the returned update stores the new step in it.

        """
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        accumulated = self.momentum * velocity + previous_step
        return accumulated, [(velocity, accumulated)]
413
414
415
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule. The scaling is applied first, the momentum
    accumulation second.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        self.components = [scaling_rule, momentum_rule]
        # Expose the hyperparameters of the components on the rule itself.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
446
447
448
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Compute the AdaDelta step for one parameter.

        Two algorithm buffers are kept per parameter: the running mean of
        the squared previous steps and the running mean of the squared
        produced steps. Both are refreshed through the returned updates.

        """
        decay = self.decay_rate
        epsilon = self.epsilon

        old_mean_square_step = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        old_mean_square_delta = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        new_mean_square_step = (
            decay * old_mean_square_step +
            (1 - decay) * tensor.sqr(previous_step))

        # The step is the previous step rescaled by the ratio of the two
        # root-mean-square quantities.
        rms_old_delta = tensor.sqrt(old_mean_square_delta + epsilon)
        rms_new_step = tensor.sqrt(new_mean_square_step + epsilon)
        delta = rms_old_delta / rms_new_step * previous_step

        new_mean_square_delta = (
            decay * old_mean_square_delta +
            (1 - decay) * tensor.sqr(delta))

        buffer_updates = [
            (old_mean_square_step, new_mean_square_step),
            (old_mean_square_delta, new_mean_square_delta),
        ]
        return delta, buffer_updates
498
499
500
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        decay = shared_floatx(decay_rate, "decay_rate")
        add_role(decay, ALGORITHM_HYPERPARAMETER)
        self.decay_rate = decay
        # Lower bound for the RMS, so the scaling never exceeds max_scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the previous step by the running RMS of recent steps.

        The running mean of the squared steps is kept in an algorithm
        buffer created for `parameter` and refreshed through the returned
        update.

        """
        old_mean_square = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        new_mean_square = (
            self.decay_rate * old_mean_square +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        rms = tensor.maximum(tensor.sqrt(new_mean_square), self.epsilon)
        normalized_step = previous_step / rms
        return normalized_step, [(old_mean_square, new_mean_square)]
546
547
548
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_. The normalization is applied first, the
    scaling second.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        self.components = [normalizing_rule, scaling_rule]
        # Expose the hyperparameters of the components on the rule itself.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalizing_rule.decay_rate
588
589
590
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None`` (or any other false value), no rescaling will be
        applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used. Only
        present when a threshold was given.

    """
    def __init__(self, threshold=None):
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all steps jointly so their L2 norm respects the threshold.

        Parameters
        ----------
        previous_steps : OrderedDict
            A mapping from parameters to the steps proposed so far.

        Returns
        -------
        steps : OrderedDict
            The (possibly rescaled) steps. `previous_steps` itself when
            no threshold is set.
        updates : list
            Always empty, this rule keeps no state.

        """
        if not hasattr(self, 'threshold'):
            # Callers unpack a (steps, updates) pair, so the pass-through
            # case has to return one as well.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
624
625
626
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axes = pack(axis) if axis is not None else ()
        self.axis = set(axes)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # A shorter set means some axis was listed more than once.
        if len(self.axis) != len(axes):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        """Compute a step keeping the updated value within the norm bound.

        Raises
        ------
        ValueError
            If one of the configured axes is not smaller than the number
            of dimensions of `previous_step`.

        """
        for ax in self.axis:
            if ax >= previous_step.ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, previous_step.ndim))

        # The value the parameter would take after a plain update.
        updated_value = parameter - previous_step
        if not self.axis:
            norms = l2_norm([updated_value])
        else:
            summed_squares = tensor.sqr(updated_value)
            for ax in sorted(self.axis):
                summed_squares = summed_squares.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(summed_squares)

        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) * updated_value)
        clipped_step = tensor.switch(norms > self.threshold,
                                     shrinking_step,
                                     previous_step)
        return clipped_step, ()
701
702
703
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
        stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        for hyperparameter in (self.learning_rate, self.epsilon):
            add_role(hyperparameter, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Scale the previous step by the root of the summed squared steps.

        The running sum of squares is kept in an algorithm buffer created
        for `parameter`, named after it when the parameter has a name, and
        refreshed through the returned update.

        """
        if parameter.name:
            buffer_name = 'adagrad_sqs' + '_' + parameter.name
        else:
            buffer_name = 'adagrad_sqs'
        sum_of_squares = _create_algorithm_buffer_for(parameter,
                                                      name=buffer_name)

        new_sum_of_squares = tensor.sqr(previous_step) + sum_of_squares
        denominator = tensor.sqrt(new_sum_of_squares) + self.epsilon
        step = self.learning_rate * previous_step / denominator

        return step, [(sum_of_squares, new_sum_of_squares)]
744
745
746
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Constant added to the square root of the second moment estimate
        in the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Base of the power ``decay_factor ** time`` that multiplies the
        weight ``1 - beta1`` kept on the old first moment estimate.
        Default value is set to 1 - 1e-8.

    Notes
    -----
    In this implementation `beta2` is the weight given to the newest
    squared step and ``1 - beta2`` the weight kept on the accumulated
    estimate; `beta1` plays the same role for the first moment on the
    first call, after which `decay_factor` modifies it.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        # Each hyperparameter becomes a shared variable carrying its own
        # name and the ALGORITHM_HYPERPARAMETER role.
        hyperparameters = (("learning_rate", learning_rate),
                           ("beta1", beta1),
                           ("beta2", beta2),
                           ("epsilon", epsilon),
                           ("decay_factor", decay_factor))
        for label, value in hyperparameters:
            shared = shared_floatx(value, label)
            add_role(shared, ALGORITHM_HYPERPARAMETER)
            setattr(self, label, shared)

    def compute_step(self, parameter, previous_step):
        """Build the Adam step and buffer updates for one parameter.

        Parameters
        ----------
        parameter : Theano variable
            The parameter for which moment buffers are created.
        previous_step : Theano variable
            The incoming step fed into both moment estimates.

        Returns
        -------
        step : Theano variable
            The bias-corrected learning rate times the new first moment,
            divided by the root of the new second moment plus epsilon.
        updates : list of tuples
            Updates for the first moment, second moment and time counter
            (a separate counter is created on every call).

        """
        first_moment = _create_algorithm_buffer_for(parameter, 'mean')
        second_moment = _create_algorithm_buffer_for(parameter, 'variance')
        step_count = shared_floatx(0., 'time')
        add_role(step_count, ALGORITHM_BUFFER)

        next_count = step_count + 1

        # Bias-correction terms folded into the effective learning rate.
        second_correction = tensor.sqrt((1. - (1. - self.beta2)**next_count))
        first_correction = (1. - (1. - self.beta1)**next_count)
        corrected_rate = (self.learning_rate * second_correction /
                          first_correction)

        # Weight on the newest step for the first moment estimate.
        mixing = 1 - (1 - self.beta1) * self.decay_factor ** (next_count - 1)
        new_first = mixing * previous_step + (1. - mixing) * first_moment
        new_second = (self.beta2 * tensor.sqr(previous_step) +
                      (1. - self.beta2) * second_moment)

        step = (corrected_rate * new_first /
                (tensor.sqrt(new_second) + self.epsilon))

        return step, [(first_moment, new_first),
                      (second_moment, new_second),
                      (step_count, next_count)]
804
805
806
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    A step (the parameter update of a single shared variable) whose
    elements sum to ``NaN`` or ``inf`` is replaced with a step that
    rescales the parameter instead.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Return `previous_step`, or a rescaling step if it is not finite.

        Parameters
        ----------
        parameter : Theano variable
            The parameter used to build the replacement step.
        previous_step : Theano variable
            The incoming step whose summed elements are inspected.

        Returns
        -------
        step : Theano variable
            ``(1 - scaler) * parameter`` when the sum of `previous_step`
            is ``NaN`` or infinite, `previous_step` otherwise.
        updates : list
            Always empty.

        """
        # A single reduction is inspected rather than every element.
        total = tensor.sum(previous_step)
        bad_flags = tensor.isnan(total) + tensor.isinf(total)
        rescaling_step = (1 - self.scaler) * parameter
        chosen = tensor.switch(bad_flags > 0, rescaling_step, previous_step)
        return chosen, []
839
840
841
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Example applications include clipping steps on only certain parameters,
    or scaling a certain kind of parameter's updates (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        # Stored as a frozenset, so duplicates and ordering are dropped.
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Apply the wrapped rule to the selected variables only.

        Parameters
        ----------
        previous_steps : dict
            Maps each parameter to its incoming step.

        Returns
        -------
        steps : OrderedDict
            One entry per key of `previous_steps`, in the same order.
            Parameters handled by the wrapped rule map to its output,
            all others keep their incoming step.
        updates
            Whatever the wrapped rule returned as updates.

        """
        # NOTE(review): presumably picks the entries of `previous_steps`
        # keyed by `self.variables`; confirm against dict_subset.
        restricted = dict_subset(previous_steps, self.variables)
        new_steps, updates = self.step_rule.compute_steps(restricted)

        merged = OrderedDict()
        for parameter in previous_steps:
            if parameter in new_steps:
                merged[parameter] = new_steps[parameter]
            else:
                merged[parameter] = previous_steps[parameter]
        return merged, updates
871