Test Failed
Pull Request — master (#1191)
by
unknown
15:50
created

GradientDescent._compute_gradients()   A

Complexity

Conditions 4

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 18
rs 9.2
c 0
b 0
f 0
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from collections import Mapping
7
from six.moves import reduce
8
9
from picklable_itertools.extras import equizip
10
11
import theano
12
from six import add_metaclass
13
from theano import tensor
14
15
from blocks.graph import ComputationGraph
16
from blocks.model import Model
17
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
18
from blocks.theano_expressions import l2_norm
19
from blocks.utils import dict_subset, pack
20
from blocks.utils.theano_utils import (
21
    shared_floatx_zeros_matching, shared_floatx)
22
23
logger = logging.getLogger(__name__)
24
25
26
def _create_algorithm_buffer_for(param, *args, **kwargs):
27
    buf = shared_floatx_zeros_matching(param, *args, **kwargs)
28
    buf.tag.for_parameter = param
29
    add_role(buf, ALGORITHM_BUFFER)
30
    return buf
31
32
33
@add_metaclass(ABCMeta)
34
class TrainingAlgorithm(object):
35
    """Base class for training algorithms.
36
37
    A training algorithm object has a simple life-cycle.
38
    First it is initialized by calling its :meth:`initialize` method.
39
    At this stage, for instance, Theano functions can be compiled.
40
    After that the :meth:`process_batch` method is repeatedly
41
    called with a batch of training data as a parameter.
42
43
    """
44
    @abstractmethod
45
    def initialize(self, **kwargs):
46
        """Initialize the training algorithm."""
47
        pass
48
49
    @abstractmethod
50
    def process_batch(self, batch):
51
        """Process a batch of training data.
52
53
        Attributes
54
        ----------
55
        batch : dict
56
            A dictionary of (source name, data) pairs.
57
58
        """
59
        pass
60
61
62
variable_mismatch_error = """
63
64
Blocks tried to match the sources ({sources}) of the training dataset to \
65
the names of the Theano variables ({variables}), but failed to do so. \
66
If you want to train on a subset of the sources that your dataset provides, \
67
pass the `sources` keyword argument to its constructor, use the \
68
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
69
or on_unused_sources='ignore' to the GradientDescent algorithm."""
70
71
source_missing_error = """
72
73
Blocks didn't find all the sources ({sources}) of the training dataset \
74
that match the names of the Theano variables ({variables})."""
75
76
77
determinism_error = """Cannot infer parameter list in a fixed order.
78
79
Because dictionaries are unordered (and Python uses randomized hashing, \
80
which can change the iteration order over the same dictionary from one \
81
interpreter session to the next), Blocks cannot infer the parameters list \
82
from a plain dictionary of gradients in an order that is reproducible \
83
across interpreter sessions; please either specify the parameters \
84
explicitly or pass gradients as an OrderedDict (though exercise care in \
85
constructing that OrderedDict, as an OrderedDict created by iterating \
86
over an unordered iterable (e.g. a dict) will still have an arbitrary \
87
and unpredictable order that could cause problems with \
88
reproducibility)."""
89
90
91
class UpdatesAlgorithm(TrainingAlgorithm):
92
    """Base class for algorithms that use Theano functions with updates.
93
94
    Parameters
95
    ----------
96
    updates : list of tuples or :class:`~collections.OrderedDict`
97
        The updates that should be performed.
98
    theano_func_kwargs : dict, optional
99
        A passthrough to `theano.function` for additional arguments.
100
        Useful for passing `profile` or `mode` arguments to the theano
101
        function that will be compiled for the algorithm.
102
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
103
        Controls behavior when not all sources in a batch are used
104
        (i.e. there is no variable with a matching name in the inputs
105
        of the computational graph of the updates).
106
107
    Attributes
108
    ----------
109
    updates : list of :class:`~tensor.TensorSharedVariable` updates
110
        Updates to be done for every batch. It is required that the
111
        updates are done using the old values of optimized parameters.
112
113
    Notes
114
    -----
115
    Changing `updates` attribute or calling `add_updates` after
116
    the `initialize` method is called will have no effect.
117
118
    """
119
    def __init__(self, updates=None, theano_func_kwargs=None,
120
                 on_unused_sources='raise', **kwargs):
121
        self.updates = [] if updates is None else updates
122
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
123
                                   is not None else dict())
124
        self.on_unused_sources = on_unused_sources
125
        super(UpdatesAlgorithm, self).__init__(**kwargs)
126
127
    def initialize(self):
128
        logger.info("Initializing the training algorithm")
129
        update_values = [new_value for _, new_value in self.updates]
130
        logger.debug("Inferring graph inputs...")
131
        self.inputs = ComputationGraph(update_values).inputs
132
        logger.debug("Compiling training function...")
133
        self._function = theano.function(
134
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
135
        logger.info("The training algorithm is initialized")
136
137
    @property
138
    def updates(self):
139
        return self._updates
140
141
    @updates.setter
142
    def updates(self, value):
143
        self._updates = value
144
145
    def add_updates(self, updates):
146
        """Add updates to the training process.
147
148
        The updates will be done _before_ the parameters are changed.
149
150
        Parameters
151
        ----------
152
        updates : list of tuples or :class:`~collections.OrderedDict`
153
            The updates to add.
154
155
        """
156
        if isinstance(updates, OrderedDict):
157
            updates = list(updates.items())
158
        if not isinstance(updates, list):
159
            raise ValueError
160
        self.updates.extend(updates)
161
162
    def _validate_source_names(self, batch):
163
        in_names = [v.name for v in self.inputs]
164
165
        if not set(in_names).issubset(set(batch.keys())):
166
            raise ValueError("Didn't find all sources: " +
167
                             source_missing_error.format(
168
                                 sources=batch.keys(),
169
                                 variables=in_names))
170
        if not set(batch.keys()).issubset(set(in_names)):
171
            if self.on_unused_sources == 'ignore':
172
                pass
173
            elif self.on_unused_sources == 'warn':
174
                if not hasattr(self, '_unused_source_warned'):
175
                    logger.warn(variable_mismatch_error.format(
176
                        sources=batch.keys(),
177
                        variables=in_names))
178
                self._unused_source_warned = True
179
            elif self.on_unused_sources == 'raise':
180
                raise ValueError(
181
                    "mismatch of variable names and data sources" +
182
                    variable_mismatch_error.format(
183
                        sources=batch.keys(),
184
                        variables=in_names))
185
            else:
186
                raise ValueError("Wrong value of on_unused_sources: {}."
187
                                 .format(self.on_unused_sources))
188
189
    def process_batch(self, batch):
190
        self._validate_source_names(batch)
191
        ordered_batch = [batch[v.name] for v in self.inputs]
192
        self._function(*ordered_batch)
193
194
    def check_sanity(self, model):
195
        # Do nothing
196
        pass
197
198
199
class GradientDescent(UpdatesAlgorithm):
200
    """A base class for all gradient descent algorithms.
201
202
    By "gradient descent" we mean a training algorithm of the following
203
    form:
204
205
    .. code-block::  python
206
207
        for batch in data:
208
            steps = step_rule.compute_steps(parameters,
209
                                            gradients_wr_parameters)
210
            for parameter in parameters:
211
                parameter -= steps[parameter]
212
213
    Note, that the step is *subtracted, not added*! This is done in order
214
    to make step rule chaining possible.
215
216
    Parameters
217
    ----------
218
    cost : :class:`~tensor.TensorVariable`, optional
219
        The objective to be minimized. Unused if `gradients` is specified.
220
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
221
        The parameters to be tuned. If not provided, inferred from the
222
        keys of `gradients` (in which case `gradients` *must* be an
223
        `OrderedDict`).
224
    step_rule : instance of :class:`StepRule`, optional
225
        An object encapsulating most of the algorithm's logic. Its
226
        `compute_steps` method is called to get Theano expression for
227
        steps.  Note, that the step rule might have a state, e.g. to
228
        remember a weighted sum of gradients from previous steps like it is
229
        done in gradient descent with momentum. If ``None``, an instance of
230
        :class:`Scale` is created.
231
    gradients : OrderedDict or list of 2-tuples, optional
232
        A dictionary mapping a parameter to an expression for the cost's
233
        gradient with respect to the parameter, or equivalently, a list of
234
        (parameter, gradient) tuples. If ``None``, the gradient
235
        are taken automatically using :func:`theano.gradient.grad`.
236
    known_grads : dict, optional
237
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
238
        Useful when you know the [approximate] gradients of some
239
        sub-expressions and would like Theano to use that information
240
        to compute parameter gradients. Only makes sense when `gradients`
241
        is `None`.
242
    consider_constant : list, optional
243
        A passthrough to `theano.tensor.grad`'s `consider_constant`
244
        argument.  A list of expressions through which gradients will not
245
        be backpropagated. Only makes sense when `gradients` is `None`.
246
247
    Attributes
248
    ----------
249
    gradients : OrderedDict
250
        The gradient dictionary.
251
    step_rule : instance of :class:`StepRule`
252
        The step rule.
253
254
    Notes
255
    -----
256
    Changing `updates` attribute or calling `add_updates` after
257
    the `initialize` method is called will have no effect.
258
259
    If a cost and parameters are provided, gradients are taken immediately
260
    upon construction, and changes to these attributes after construction
261
    will have no effect.
262
263
    `gradients` must be an `OrderedDict` if `parameters` is unspecified
264
    because ordinary dictionaries have an unpredictable iteration
265
    order due to hash randomization (which is enabled by default since
266
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
267
    when combined with Theano's heuristic graph optimizations, can cause
268
    serious reproducibility issues.
269
270
    """
271
    def __init__(self, cost=None, parameters=None, step_rule=None,
272
                 gradients=None, known_grads=None, consider_constant=None,
273
                 **kwargs):
274
        # Set initial values for cost, parameters, gradients.
275
        self.cost = cost
276
        self.parameters = parameters
277
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
278
        # as we don't want to convert dict -> OrderedDict and give it
279
        # an arbitrary, non-deterministic order.
280
        if gradients is not None and not isinstance(gradients, Mapping):
281
            gradients = OrderedDict(gradients)
282
        self.gradients = gradients
283
284
        # If we don't have gradients, we'll need to infer them from the
285
        # cost and the parameters, both of which must not be None.
286
        if not self.gradients:
287
            self.gradients = self._compute_gradients(known_grads,
288
                                                     consider_constant)
289
        else:
290
            if cost is not None:
291
                logger.warning(('{}: gradients already specified directly; '
292
                                'cost is unused.'
293
                                .format(self.__class__.__name__)))
294
            if self.parameters is None and isinstance(gradients, OrderedDict):
295
                # If the dictionary is ordered, it's safe to use the keys
296
                # as they have a deterministic order.
297
                self.parameters = list(self.gradients.keys())
298
            elif self.parameters is not None:
299
                # If parameters and gradients.keys() don't match we can
300
                # try to recover if gradients is ordered.
301
                if set(self.parameters) != set(self.gradients.keys()):
302
                    logger.warn("Specified parameters list does not match "
303
                                "keys in provided gradient dictionary; "
304
                                "using parameters inferred from gradients")
305
                    if not isinstance(self.gradients, OrderedDict):
306
                        raise ValueError(determinism_error)
307
                    self.parameters = list(self.gradients.keys())
308
            else:
309
                # self.parameters is not None, and gradients isn't
310
                # an OrderedDict. We can't do anything safe.
311
                raise ValueError(determinism_error)
312
            if known_grads:
313
                raise ValueError("known_grads has no effect when gradients "
314
                                 "are passed in")
315
            if consider_constant is not None:
316
                raise ValueError("consider_constant has no effect when "
317
                                 "gradients are passed in")
318
319
        # The order in which the different gradient terms appears
320
        # here matters, as floating point addition is non-commutative (and
321
        # Theano's graph optimizations are not order-independent).
322
        # This is why we do not use .values().
323
        gradient_values = [self.gradients[p] for p in self.parameters]
324
        self.total_gradient_norm = (l2_norm(gradient_values)
325
                                    .copy(name="total_gradient_norm"))
326
327
        self.step_rule = step_rule if step_rule else Scale()
328
        logger.debug("Computing parameter steps...")
329
        self.steps, self.step_rule_updates = (
330
            self.step_rule.compute_steps(self.gradients))
331
332
        # Same as gradient_values above: the order may influence a
333
        # bunch of things, so enforce a consistent one (don't use
334
        # .values()).
335
        step_values = [self.steps[p] for p in self.parameters]
336
        self.total_step_norm = (l2_norm(step_values)
337
                                .copy(name="total_step_norm"))
338
339
        # Once again, iterating on gradients may not be deterministically
340
        # ordered if it is not an OrderedDict. We add the updates here in
341
        # the order specified in self.parameters. Keep it this way to
342
        # maintain reproducibility.
343
        kwargs.setdefault('updates', []).extend(
344
            itertools.chain(((parameter, parameter - self.steps[parameter])
345
                             for parameter in self.parameters),
346
                            self.step_rule_updates)
347
        )
348
        super(GradientDescent, self).__init__(**kwargs)
349
350
    def _compute_gradients(self, known_grads, consider_constant):
351
        if self.cost is None:
352
            raise ValueError("can't infer gradients; no cost specified")
353
        elif self.parameters is None or len(self.parameters) == 0:
354
            raise ValueError("can't infer gradients; no parameters "
355
                             "specified")
356
        # While this strictly speaking could be a dict and not an
357
        # OrderedDict (because we iterate over it in the order of
358
        # self.parameters), this guards a little bit against
359
        # nondeterminism introduced by future refactoring.
360
        logger.info("Taking the cost gradient")
361
        gradients = OrderedDict(
362
            equizip(self.parameters, tensor.grad(
363
                self.cost, self.parameters,
364
                known_grads=known_grads,
365
                consider_constant=consider_constant)))
366
        logger.info("The cost gradient computation graph is built")
367
        return gradients
368
369
    def check_sanity(self, model):
370
        # Sanity check for the most common case
371
        if (model and isinstance(model, Model) and
372
                isinstance(self.algorithm, GradientDescent)):
373
            if not (set(model.get_parameter_dict().values()) ==
374
                    set(self.algorithm.parameters)):
375
                logger.warning("different parameters for model and algorithm")
376
377
378
@add_metaclass(ABCMeta)
379
class StepRule(object):
380
    """A rule to compute steps for a gradient descent algorithm."""
381
    def compute_step(self, parameter, previous_step):
382
        """Build a Theano expression for the step for a parameter.
383
384
        This method is called by default implementation of
385
        :meth:`compute_steps`, it relieves from writing a loop each time.
386
387
        Parameters
388
        ----------
389
        parameter : :class:`~tensor.TensorSharedVariable`
390
            The parameter.
391
        previous_step : :class:`~tensor.TensorVariable`
392
            Some quantity related to the gradient of the cost with respect
393
            to the parameter, either the gradient itself or a step in a
394
            related direction.
395
396
        Returns
397
        -------
398
        step : :class:`~theano.Variable`
399
            Theano variable for the step to take.
400
        updates : list
401
            A list of tuples representing updates to be performed. This
402
            is useful for stateful rules such as :class:`Momentum` which
403
            need to update shared variables after itetations.
404
405
        """
406
        raise NotImplementedError
407
408
    def compute_steps(self, previous_steps):
409
        """Build a Theano expression for steps for all parameters.
410
411
        Override this method if you want to process the steps
412
        with respect to all parameters as a whole, not parameter-wise.
413
414
        Parameters
415
        ----------
416
        previous_steps : OrderedDict
417
            An :class:`~OrderedDict` of
418
            (:class:`~tensor.TensorSharedVariable`
419
            :class:`~tensor.TensorVariable`) pairs. The keys are the
420
            parameters being trained, the values are the expressions for
421
            quantities related to gradients of the cost with respect to
422
            the parameters, either the gradients themselves or steps in
423
            related directions.
424
425
        Returns
426
        -------
427
        steps : OrderedDict
428
            A dictionary of the proposed steps in the same form as
429
            `previous_steps`.
430
        updates : list
431
            A list of tuples representing updates to be performed.
432
433
        """
434
        parameter_wise = [self.compute_step(parameter,
435
                                            previous_steps[parameter])
436
                          for parameter in previous_steps]
437
        steps, updates = equizip(*parameter_wise)
438
        steps = OrderedDict((parameter, step) for parameter, step
439
                            in equizip(previous_steps.keys(), steps))
440
        updates = list(itertools.chain(*updates))
441
        return steps, updates
442
443
444
class CompositeRule(StepRule):
445
    """Chains several step rules.
446
447
    Parameters
448
    ----------
449
    components : list of :class:`StepRule`
450
        The learning rules to be chained. The rules will be applied in the
451
        order as given.
452
453
    """
454
    def __init__(self, components):
455
        self.components = components
456
457
    def compute_steps(self, previous_steps):
458
        steps = previous_steps
459
        updates = []
460
        for rule in self.components:
461
            steps, more_updates = rule.compute_steps(steps)
462
            updates += more_updates
463
        return steps, updates
464
465
466
class Scale(StepRule):
467
    """A step in the direction proportional to the previous step.
468
469
    If used in :class:`GradientDescent` alone, this step rule implements
470
    steepest descent.
471
472
    Parameters
473
    ----------
474
    learning_rate : float
475
        The learning rate by which the previous step is multiplied to
476
        produce the step.
477
478
    Attributes
479
    ----------
480
    learning_rate : :class:`~tensor.TensorSharedVariable`
481
        The shared variable storing the learning rate used.
482
483
    """
484
    def __init__(self, learning_rate=1.0):
485
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
486
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
487
488
    def compute_step(self, parameter, previous_step):
489
        return self.learning_rate * previous_step, []
490
491
492
class BasicMomentum(StepRule):
493
    """Accumulates step with exponential discount.
494
495
    Parameters
496
    ----------
497
    momentum : float, optional
498
        The momentum coefficient. Defaults to 0.
499
500
    Notes
501
    -----
502
    This step rule is intended to be used in conjunction with another
503
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
504
    experience, look at :class:`Momentum`.
505
506
    """
507
    def __init__(self, momentum=0.):
508
        self.momentum = shared_floatx(momentum, "momentum")
509
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)
510
511
    def compute_step(self, parameter, previous_step):
512
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
513
        step = self.momentum * velocity + previous_step
514
        updates = [(velocity, step)]
515
        return step, updates
516
517
518
class Momentum(CompositeRule):
519
    """Accumulates step with exponential discount.
520
521
    Combines :class:`BasicMomentum` and :class:`Scale` to form the
522 View Code Duplication
    usual momentum step rule.
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
523
524
    Parameters
525
    ----------
526
    learning_rate : float, optional
527
        The learning rate by which the previous step scaled. Defaults to 1.
528
    momentum : float, optional
529
        The momentum coefficient. Defaults to 0.
530
531
    Attributes
532
    ----------
533
    learning_rate : :class:`~tensor.SharedVariable`
534
        A variable for learning rate.
535
    momentum : :class:`~tensor.SharedVariable`
536
        A variable for momentum.
537
538
    See Also
539
    --------
540
    :class:`SharedVariableModifier`
541
542
    """
543
    def __init__(self, learning_rate=1.0, momentum=0.):
544
        scale = Scale(learning_rate=learning_rate)
545
        basic_momentum = BasicMomentum(momentum=momentum)
546
        self.learning_rate = scale.learning_rate
547
        self.momentum = basic_momentum.momentum
548
        self.components = [scale, basic_momentum]
549
550
551
class AdaDelta(StepRule):
552
    """Adapts the step size over time using only first order information.
553
554
    Parameters
555
    ----------
556
    decay_rate : float, optional
557
        Decay rate in [0, 1]. Defaults to 0.95.
558
    epsilon : float, optional
559
        Stabilizing constant for RMS. Defaults to 1e-6.
560
561
    Notes
562
    -----
563
    For more information, see [ADADELTA]_.
564
565
    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
566
       Rate Method*, arXiv:1212.5701.
567
568
    """
569
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
570
        if not 0.0 <= decay_rate <= 1.0:
571
            raise ValueError("decay rate needs to be in [0, 1]")
572
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
573
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
574
        self.epsilon = shared_floatx(epsilon, "epsilon")
575
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
576
577
    def compute_step(self, parameter, previous_step):
578
        mean_square_step_tm1 = _create_algorithm_buffer_for(
579
            parameter, "mean_square_step_tm1")
580
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
581
            parameter, "mean_square_delta_x_tm1")
582
583
        mean_square_step_t = (
584
            self.decay_rate * mean_square_step_tm1 +
585
            (1 - self.decay_rate) * tensor.sqr(previous_step)
586
        )
587
588
        rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon)
589
        rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon)
590
        delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step
591
592
        mean_square_delta_x_t = (
593
            self.decay_rate * mean_square_delta_x_tm1 +
594
            (1 - self.decay_rate) * tensor.sqr(delta_x_t)
595
        )
596
597
        step = delta_x_t
598
        updates = [(mean_square_step_tm1, mean_square_step_t),
599
                   (mean_square_delta_x_tm1, mean_square_delta_x_t)]
600
        return step, updates
601
602
603
class BasicRMSProp(StepRule):
604
    """Scales the step size by a running average of the recent step norms.
605
606
    Parameters
607
    ----------
608
    decay_rate : float, optional
609
        How fast the running average decays, value in [0, 1]
610
        (lower is faster).  Defaults to 0.9.
611
    max_scaling : float, optional
612
        Maximum scaling of the step size, in case the running average is
613
        really small. Needs to be greater than 0. Defaults to 1e5.
614
615
    Notes
616
    -----
617
    This step rule is intended to be used in conjunction with another
618
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
619
    experience, look at :class:`RMSProp`.
620
621
    In general, this step rule should be used _before_ other step rules,
622
    because it has normalization properties that may undo their work.
623
    For instance, it should be applied first when used in conjunction
624
    with :class:`Scale`.
625
626
    For more information, see [Hint2014]_.
627
628
    """
629
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
630
        if not 0.0 <= decay_rate <= 1.0:
631
            raise ValueError("decay rate needs to be in [0, 1]")
632
        if max_scaling <= 0:
633
            raise ValueError("max. scaling needs to be greater than 0")
634
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
635
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
636
        self.epsilon = 1. / max_scaling
637
638
    def compute_step(self, parameter, previous_step):
639
        mean_square_step_tm1 = _create_algorithm_buffer_for(
640
            parameter, "mean_square_step_tm1")
641
        mean_square_step_t = (
642
            self.decay_rate * mean_square_step_tm1 +
643
            (1 - self.decay_rate) * tensor.sqr(previous_step))
644
        rms_step_t = tensor.maximum(
645
            tensor.sqrt(mean_square_step_t), self.epsilon)
646
        step = previous_step / rms_step_t
647
        updates = [(mean_square_step_tm1, mean_square_step_t)]
648
        return step, updates
649
650
651
class RMSProp(CompositeRule):
652
    """Scales the step size by a running average of the recent step norms.
653
654
    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
655
    described in [Hint2014]_.
656
657
    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
658
       lecture 6a,
659
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
660
661
    Parameters
662
    ----------
663
    learning_rate : float, optional
664
        The learning rate by which the previous step scaled. Defaults to 1.
665
    decay_rate : float, optional
666
        How fast the running average decays (lower is faster).
667
        Defaults to 0.9.
668
    max_scaling : float, optional
669
        Maximum scaling of the step size, in case the running average is
670
        really small. Defaults to 1e5.
671
672
    Attributes
673
    ----------
674
    learning_rate : :class:`~tensor.SharedVariable`
675
        A variable for learning rate.
676
    decay_rate : :class:`~tensor.SharedVariable`
677
        A variable for decay rate.
678
679
    See Also
680
    --------
681
    :class:`SharedVariableModifier`
682
683
    """
684
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
685
        basic_rms_prop = BasicRMSProp(decay_rate=decay_rate,
686
                                      max_scaling=max_scaling)
687
        scale = Scale(learning_rate=learning_rate)
688
        self.learning_rate = scale.learning_rate
689
        self.decay_rate = basic_rms_prop.decay_rate
690
        self.components = [basic_rms_prop, scale]
691
692
693
class StepClipping(StepRule):
694
    """Rescales an entire step if its L2 norm exceeds a threshold.
695
696
    When the previous steps are the gradients, this step rule performs
697
    gradient clipping.
698
699
    Parameters
700
    ----------
701
    threshold : float, optional
702
        The maximum permitted L2 norm for the step. The step
703
        will be rescaled to be not higher than this quanity.
704
        If ``None``, no rescaling will be applied.
705
706
    Attributes
707
    ----------
708
    threshold : :class:`.tensor.TensorSharedVariable`
709
        The shared variable storing the clipping threshold used.
710
711
    """
712
    def __init__(self, threshold=None):
713
        if threshold is not None:
714
            threshold = shared_floatx(threshold, "threshold")
715
            add_role(threshold, ALGORITHM_HYPERPARAMETER)
716
        self.threshold = threshold
717
718
    def compute_steps(self, previous_steps):
719
        if self.threshold is None:
720
            steps = previous_steps
721
        else:
722
            norm = l2_norm(previous_steps.values())
723
            multiplier = tensor.switch(norm < self.threshold,
724
                                       1, self.threshold / norm)
725 View Code Duplication
            steps = OrderedDict(
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
726
                (parameter, step * multiplier)
727
                for parameter, step in previous_steps.items())
728
        return steps, []
729
730
731
class VariableClipping(StepRule):
732
    """Clip the maximum norm of individual variables along certain axes.
733
734
    This :class:`StepRule` can be used to implement L2 norm constraints on
735
    e.g. the weight vectors of individual hidden units, convolutional
736
    filters or entire weight tensors. Combine with :class:`Restrict`
737
    (and possibly :class:`CompositeRule`), to apply such constraints only
738
    to certain variables and/or apply different norm constraints to
739
    different variables.
740
741
    Parameters
742
    ----------
743
    threshold : float
744
        Maximum norm for a given (portion of a) tensor.
745
    axis : int or iterable, optional
746
        An integer single axis, or an iterable collection of integer
747
        axes over which to sum in order to calculate the L2 norm. If
748
        `None` (the default), the norm is computed over all elements
749
        of the tensor.
750
751
    Notes
752
    -----
753
    Because of the way the :class:`StepRule` API works, this particular
754
    rule implements norm clipping of the value *after* update in the
755
    following way: it computes ``parameter - previous_step``, scales it
756
    to have (possibly axes-wise) norm(s) of at most `threshold`,
757
    then subtracts *that* value from `parameter` to yield an 'equivalent
758
    step' that respects the desired norm constraints. This procedure
759
    implicitly assumes one is doing simple (stochastic) gradient descent,
760
    and so steps computed by this step rule may not make sense for use
761
    in other contexts.
762
763
    Investigations into max-norm regularization date from [Srebro2005]_.
764
    The first appearance of this technique as a regularization method
765
    for the weight vectors of individual hidden units in feed-forward
766
    neural networks may be [Hinton2012]_.
767
768
    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
769
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
770
       on Learning Theory (COLT)*, June 2005.
771
772
    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
773
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
774
       "Improving neural networks by preventing co-adaptation of
775
       feature detectors". arXiv:1207.0580.
776
777
    """
778
    def __init__(self, threshold, axis=None):
779
        axis = pack(axis) if axis is not None else ()
780
        self.axis = set(axis)
781
        self.threshold = shared_floatx(threshold, "threshold")
782
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
783
        if len(axis) != len(self.axis):
784
            raise ValueError("axis must be unique")
785
786
    def compute_step(self, parameter, previous_step):
787
        if any(ax >= previous_step.ndim for ax in self.axis):
788
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
789
                self.axis, parameter, previous_step.ndim))
790
        if len(self.axis) == 0:
791
            norms = l2_norm([parameter - previous_step])
792
        else:
793
            squares = tensor.sqr(parameter - previous_step)
794
            norms = tensor.sqrt(
795
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
796
                       sorted(self.axis), squares))
797
        # We want a step s* that is the same as scaling
798
        # (parameter - previous_step) by threshold / norm
799
        # when threshold < norm.
800
        shrinking_step = (parameter -
801
                          (self.threshold / norms) *
802
                          (parameter - previous_step))
803
        return tensor.switch(norms > self.threshold,
804
                             shrinking_step,
805
                             previous_step), ()
806
807
808
class AdaGrad(StepRule):
809
    """Implements the AdaGrad learning rule.
810
811
    Parameters
812
    ----------
813
    learning_rate : float, optional
814
        Step size.
815
        Default value is set to 0.0002.
816
    epsilon : float, optional
817
        Stabilizing constant for one over root of sum of squares.
818
        Defaults to 1e-6.
819
820
    Notes
821
    -----
822
    For more information, see [ADAGRAD]_.
823
824
    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
825
       *Adaptive subgradient methods for online learning and
826
       stochastic optimization*,
827
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
828
829
    """
830
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
831
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
832
        self.epsilon = shared_floatx(epsilon, "epsilon")
833
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
834
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
835
836
    def compute_step(self, parameter, previous_step):
837
        name = 'adagrad_sqs'
838
        if parameter.name:
839
            name += '_' + parameter.name
840
        ssq = _create_algorithm_buffer_for(parameter, name=name)
841
842
        ssq_t = (tensor.sqr(previous_step) + ssq)
843
        step = (self.learning_rate * previous_step /
844
                (tensor.sqrt(ssq_t) + self.epsilon))
845
846
        updates = [(ssq, ssq_t)]
847
848
        return step, updates
849
850
851
class Adam(StepRule):
852
    """Adam optimizer as described in [King2014]_.
853
854
    .. [King2014] Diederik Kingma, Jimmy Ba,
855
       *Adam: A Method for Stochastic Optimization*,
856
       http://arxiv.org/abs/1412.6980
857
858
    Parameters
859
    ----------
860
    learning_rate : float, optional
861
        Step size.
862
        Default value is set to 0.002.
863
    beta1 : float, optional
864
        Exponential decay rate for the first moment estimates.
865
        Default value is set to 0.9.
866
    beta2 : float, optional
867
        Exponential decay rate for the second moment estimates.
868
        Default value is set to 0.999.
869
    epsilon : float, optional
870
        Default value is set to 1e-8.
871
    decay_factor : float, optional
872
        Default value is set to 1.
873
874
    """
875
    def __init__(self, learning_rate=0.002,
876
                 beta1=0.9, beta2=0.999, epsilon=1e-8,
877
                 decay_factor=1):
878
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
879
        self.beta1 = shared_floatx(beta1, "beta1")
880
        self.beta2 = shared_floatx(beta2, "beta2")
881
        self.epsilon = shared_floatx(epsilon, "epsilon")
882
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
883
        for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon,
884
                      self.decay_factor]:
885
            add_role(param, ALGORITHM_HYPERPARAMETER)
886
887
    def compute_step(self, parameter, previous_step):
888
        mean = _create_algorithm_buffer_for(parameter, 'mean')
889
        variance = _create_algorithm_buffer_for(parameter, 'variance')
890
        time = shared_floatx(0., 'time')
891
        add_role(time, ALGORITHM_BUFFER)
892
893
        t1 = time + 1
894
        beta_1_decayed = self.beta1 * self.decay_factor ** (t1 - 1)
895
        learning_rate = (self.learning_rate *
896
                         tensor.sqrt(1. - self.beta2**t1) /
897
                         (1. - beta_1_decayed**t1))
898
        mean_t = beta_1_decayed * mean + (1. - beta_1_decayed) * previous_step
899
        variance_t = (self.beta2 * variance +
900
                      (1. - self.beta2) * tensor.sqr(previous_step))
901
        step = (learning_rate * mean_t /
902
                (tensor.sqrt(variance_t) + self.epsilon))
903
904
        updates = [(mean, mean_t),
905
                   (variance, variance_t),
906
                   (time, t1)]
907
908
        return step, updates
909
910
911
class RemoveNotFinite(StepRule):
912
    """A step rule that skips steps with non-finite elements.
913
914
    Replaces a step (the parameter update of a single shared variable)
915
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
916
    step rescaling the parameters.
917
918
    Parameters
919
    ----------
920
    scaler : float, optional
921
        The scaling applied to the parameter in case the step contains
922
        non-finite elements. Defaults to 1, which means that parameters
923
        will not be changed.
924
925
    Notes
926
    -----
927
    This rule should be applied last!
928
929
    This trick was originally used in the GroundHog_ framework.
930
931
    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog
932
933
    """
934
    def __init__(self, scaler=1):
935
        self.scaler = scaler
936
937
    def compute_step(self, parameter, previous_step):
938
        step_sum = tensor.sum(previous_step)
939
        not_finite = (tensor.isnan(step_sum) +
940
                      tensor.isinf(step_sum))
941
        step = tensor.switch(
942
            not_finite > 0, (1 - self.scaler) * parameter, previous_step)
943
        return step, []
944
945
946
class Restrict(StepRule):
947
    """Applies a given :class:`StepRule` only to certain variables.
948
949
    Example applications include clipping steps on only certain parameters,
950
    or scaling a certain kind of parameter's updates (e.g. adding an
951
    additional scalar multiplier to the steps taken on convolutional
952
    filters).
953
954
    Parameters
955
    ----------
956
    step_rule : :class:`StepRule`
957
        The :class:`StepRule` to be applied on the given variables.
958
    variables : iterable
959
        A collection of Theano variables on which to apply `step_rule`.
960
        Variables not appearing in this collection will not have
961
        `step_rule` applied to them.
962
963
    """
964
    def __init__(self, step_rule, variables):
965
        self.step_rule = step_rule
966
        self.variables = frozenset(variables)
967
968
    def compute_steps(self, previous_steps):
969
        filtered_previous_steps = dict_subset(previous_steps, self.variables)
970
        steps, updates = self.step_rule.compute_steps(filtered_previous_steps)
971
        actual = OrderedDict((parameter, steps[parameter])
972
                             if parameter in steps
973
                             else (parameter, previous_steps[parameter])
974
                             for parameter in previous_steps)
975
        return actual, updates
976