Test Failed
Pull Request — master (#1191)
by
unknown
18:01
created

TrainingAlgorithm.check_sanity()   A

Complexity

Conditions 1

Size

Total Lines 10

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
c 1
b 0
f 0
dl 0
loc 10
rs 9.4285
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from collections import Mapping
7
from six.moves import reduce
8
9
from picklable_itertools.extras import equizip
10
11
import theano
12
from six import add_metaclass
13
from theano import tensor
14
15
from blocks.graph import ComputationGraph
16
from blocks.model import Model
17
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
18
from blocks.theano_expressions import l2_norm
19
from blocks.utils import dict_subset, pack
20
from blocks.utils.theano_utils import (
21
    shared_floatx_zeros_matching, shared_floatx)
22
23
logger = logging.getLogger(__name__)
24
25
26
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create a buffer variable attached to a parameter.

    The buffer is built by ``shared_floatx_zeros_matching`` (all extra
    positional and keyword arguments are forwarded to it unchanged),
    remembers the parameter it was created for in ``tag.for_parameter``
    and is given the ``ALGORITHM_BUFFER`` role.

    Parameters
    ----------
    param : object
        The parameter the buffer is created for.

    Returns
    -------
    The newly created buffer variable.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Keep a back-reference so the buffer can be traced to its parameter.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
31
32
33
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm object is simple. Its
    :meth:`initialize` method is called first; this is the stage at
    which, for instance, Theano functions can be compiled. Afterwards
    the :meth:`process_batch` method is called repeatedly, each time
    with one batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm (must be overridden)."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data (must be overridden).

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """

    def check_sanity(self, model):
        """Check that the algorithm is suitable for the model.

        The base implementation does nothing.

        Parameters
        ----------
        model : object
            Model used for training.

        """
71
72
73
# Template used when a batch provides sources that do not correspond to
# any input variable; expects `sources` and `variables` format fields.
variable_mismatch_error = (
    "\n\n"
    "Blocks tried to match the sources ({sources}) of the training "
    "dataset to the names of the Theano variables ({variables}), but "
    "failed to do so. If you want to train on a subset of the sources "
    "that your dataset provides, pass the `sources` keyword argument to "
    "its constructor, use the FilterSources transformer provided by "
    "Fuel, or pass on_unused_sources='warn' or "
    "on_unused_sources='ignore' to the GradientDescent algorithm.")

# Template used when an input variable has no source with the same name
# in the batch; expects `sources` and `variables` format fields.
source_missing_error = (
    "\n\n"
    "Blocks didn't find all the sources ({sources}) of the training "
    "dataset that match the names of the Theano variables ({variables}).")

# Message used when a reproducible parameter order cannot be derived
# from the gradients that were passed in.
determinism_error = (
    "Cannot infer parameter list in a fixed order.\n\n"
    "Because dictionaries are unordered (and Python uses randomized "
    "hashing, which can change the iteration order over the same "
    "dictionary from one interpreter session to the next), Blocks cannot "
    "infer the parameters list from a plain dictionary of gradients in an "
    "order that is reproducible across interpreter sessions; please "
    "either specify the parameters explicitly or pass gradients as an "
    "OrderedDict (though exercise care in constructing that OrderedDict, "
    "as an OrderedDict created by iterating over an unordered iterable "
    "(e.g. a dict) will still have an arbitrary and unpredictable order "
    "that could cause problems with reproducibility).")
100
101
102
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed. An
        :class:`~collections.OrderedDict` is converted to a list of
        (variable, new value) tuples.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources in a batch are used
        (i.e. there is no variable with a matching name in the inputs
        of the computational graph of the updates).

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 on_unused_sources='raise', **kwargs):
        self.updates = [] if updates is None else updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        self.on_unused_sources = on_unused_sources
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Infer the graph inputs and compile the training function."""
        logger.info("Initializing the training algorithm")
        update_values = [new_value for _, new_value in self.updates]
        logger.debug("Inferring graph inputs...")
        self.inputs = ComputationGraph(update_values).inputs
        logger.debug("Compiling training function...")
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    @property
    def updates(self):
        return self._updates

    @updates.setter
    def updates(self, value):
        # `initialize` unpacks the updates as (variable, new value) pairs
        # and `add_updates` extends them in place, so an OrderedDict (an
        # accepted input format) has to be turned into a list of pairs.
        if isinstance(value, OrderedDict):
            value = list(value.items())
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError(
                "updates must be a list of tuples or an OrderedDict, "
                "got {}".format(type(updates).__name__))
        self.updates.extend(updates)

    def _validate_source_names(self, batch):
        """Check that the batch sources match the function's input names.

        Raises :class:`ValueError` if an input variable has no source of
        the same name in `batch`. Sources in `batch` that match no input
        are handled according to `on_unused_sources`.

        """
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Only warn for the first offending batch.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch and run the compiled function on it."""
        self._validate_source_names(batch)
        # The compiled function takes its arguments in the order of
        # `self.inputs`, so reorder the batch by variable name.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
204
205
206
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized. Unused if `gradients` is specified.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients` (in which case `gradients` *must* be an
        `OrderedDict`).
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : OrderedDict or list of 2-tuples, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter, or equivalently, a list of
        (parameter, gradient) tuples. If ``None``, the gradients
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    If a cost and parameters are provided, gradients are taken immediately
    upon construction, and changes to these attributes after construction
    will have no effect.

    `gradients` must be an `OrderedDict` if `parameters` is unspecified
    because ordinary dictionaries have an unpredictable iteration
    order due to hash randomization (which is enabled by default since
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
    when combined with Theano's heuristic graph optimizations, can cause
    serious reproducibility issues.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
        # as we don't want to convert dict -> OrderedDict and give it
        # an arbitrary, non-deterministic order.
        if gradients is not None and not isinstance(gradients, Mapping):
            gradients = OrderedDict(gradients)
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    # Check recoverability first, so the warning below is
                    # only logged when the fallback is really applied.
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    logger.warning("Specified parameters list does not "
                                   "match keys in provided gradient "
                                   "dictionary; using parameters inferred "
                                   "from gradients")
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-commutative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Updates supplied by the caller come first. Build a new list
        # rather than extending the caller's object in place, and accept
        # the OrderedDict form that UpdatesAlgorithm documents.
        extra_updates = kwargs.get('updates')
        if extra_updates is None:
            extra_updates = []
        elif isinstance(extra_updates, OrderedDict):
            extra_updates = extra_updates.items()

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        kwargs['updates'] = list(
            itertools.chain(extra_updates,
                            ((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )
        super(GradientDescent, self).__init__(**kwargs)

    def _compute_gradients(self, known_grads, consider_constant):
        """Take the gradient of the cost with respect to the parameters.

        Returns an :class:`~collections.OrderedDict` mapping each entry of
        `self.parameters` to its gradient expression. Raises
        :class:`ValueError` if the cost or the parameters are missing.

        """
        if self.cost is None:
            raise ValueError("can't infer gradients; no cost specified")
        elif self.parameters is None or len(self.parameters) == 0:
            raise ValueError("can't infer gradients; no parameters "
                             "specified")
        # While this strictly speaking could be a dict and not an
        # OrderedDict (because we iterate over it in the order of
        # self.parameters), this guards a little bit against
        # nondeterminism introduced by future refactoring.
        logger.info("Taking the cost gradient")
        gradients = OrderedDict(
            equizip(self.parameters, tensor.grad(
                self.cost, self.parameters,
                known_grads=known_grads,
                consider_constant=consider_constant)))
        logger.info("The cost gradient computation graph is built")
        return gradients

    def check_sanity(self, model):
        """Warn if the model parameters differ from the trained ones.

        The check is only performed when `model` is an instance of
        :class:`~blocks.model.Model`; a mismatch is logged as a warning,
        never raised.

        Parameters
        ----------
        model : object
            Model used for training.

        """
        # Sanity check for the most common case. `self` is the algorithm
        # here, so its own parameter list is what has to be compared.
        if model and isinstance(model, Model):
            if (set(model.get_parameter_dict().values()) !=
                    set(self.parameters)):
                logger.warning("different parameters for model and algorithm")
383
384
385
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        The default implementation of :meth:`compute_steps` calls this
        method once per parameter, which saves subclasses from writing
        the loop themselves.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # One (step, updates) pair per parameter, in dictionary order.
        per_parameter = []
        for parameter in previous_steps:
            per_parameter.append(
                self.compute_step(parameter, previous_steps[parameter]))
        # Transpose the pairs into all the steps and all the update lists.
        step_values, update_lists = equizip(*per_parameter)
        steps = OrderedDict(equizip(previous_steps.keys(), step_values))
        updates = list(itertools.chain.from_iterable(update_lists))
        return steps, updates
449
450
451
class CompositeRule(StepRule):
    """Chains several step rules.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Feed the steps through every component rule in turn.

        The steps returned by one rule are the input of the next one;
        the updates requested by all rules are collected in one list.

        """
        all_updates = []
        current_steps = previous_steps
        for component in self.components:
            current_steps, component_updates = component.compute_steps(
                current_steps)
            all_updates.extend(component_updates)
        return current_steps, all_updates
471
472
473
class Scale(StepRule):
    """A step in the direction proportional to the previous step.

    Used on its own in :class:`GradientDescent`, this step rule amounts
    to steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.learning_rate = rate

    def compute_step(self, parameter, previous_step):
        """Return the previous step times the learning rate, no updates."""
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
497
498
499
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included experience,
    look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)
        self.momentum = coefficient

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity buffer to the previous step.

        The resulting step is also what the velocity buffer is updated
        to.

        """
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        accumulated_step = self.momentum * velocity + previous_step
        return accumulated_step, [(velocity, accumulated_step)]
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
523
524
525
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled.
        Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # Scaling is applied first, momentum accumulation second.
        super(Momentum, self).__init__([scaling_rule, momentum_rule])
        # Expose the hyperparameters of the components on the rule itself.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
556
557
558
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if decay_rate < 0.0 or decay_rate > 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Rescale the previous step by the ratio of two running RMS values.

        Two buffers are kept per parameter: a decaying mean of the
        squared incoming steps and a decaying mean of the squared
        outgoing steps. Both are updated alongside the returned step.

        """
        prev_mean_sq_step = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        prev_mean_sq_delta = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Decaying mean of the squared incoming steps, including this one.
        new_mean_sq_step = (
            self.decay_rate * prev_mean_sq_step +
            (1 - self.decay_rate) * tensor.sqr(previous_step)
        )

        # The numerator uses the delta statistics from *before* this step.
        rms_prev_delta = tensor.sqrt(prev_mean_sq_delta + self.epsilon)
        rms_new_step = tensor.sqrt(new_mean_sq_step + self.epsilon)
        step = rms_prev_delta / rms_new_step * previous_step

        # Decaying mean of the squared outgoing steps, including this one.
        new_mean_sq_delta = (
            self.decay_rate * prev_mean_sq_delta +
            (1 - self.decay_rate) * tensor.sqr(step)
        )

        return step, [(prev_mean_sq_step, new_mean_sq_step),
                      (prev_mean_sq_delta, new_mean_sq_delta)]
608
609
610
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included experience,
    look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if decay_rate < 0.0 or decay_rate > 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Lower bound for the RMS, so that the step is scaled up by at
        # most `max_scaling`.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the previous step by a running RMS of the steps.

        The RMS is bounded from below by `self.epsilon`; the buffer
        holding the mean of squared steps is updated alongside.

        """
        prev_mean_sq = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        new_mean_sq = (
            self.decay_rate * prev_mean_sq +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        bounded_rms = tensor.maximum(tensor.sqrt(new_mean_sq), self.epsilon)
        normalized_step = previous_step / bounded_rms
        return normalized_step, [(prev_mean_sq, new_mean_sq)]
656
657
658
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled.
        Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # Normalization is applied first, scaling second.
        super(RMSProp, self).__init__([normalizing_rule, scaling_rule])
        # Expose the hyperparameters of the components on the rule itself.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalizing_rule.decay_rate
698
699
700
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used.

    """
    def __init__(self, threshold=None):
        self.threshold = threshold
        if threshold is not None:
            # Replace the plain number by a shared hyperparameter.
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Scale all steps by a common factor to respect the threshold.

        The norm is taken over all the steps together. Without a
        threshold the steps are returned untouched. No updates are ever
        requested.

        """
        if self.threshold is None:
            return previous_steps, []

        total_norm = l2_norm(previous_steps.values())
        # Leave the steps alone while the norm is below the threshold,
        # otherwise shrink them so that the norm equals the threshold.
        scaling = tensor.switch(total_norm < self.threshold,
                                1, self.threshold / total_norm)
        clipped_steps = OrderedDict()
        for parameter, step in previous_steps.items():
            clipped_steps[parameter] = step * scaling
        return clipped_steps, []
736
737
738
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor. Every axis must lie in ``[-ndim, ndim)`` for the
        step it is applied to.

    Attributes
    ----------
    threshold : shared variable
        The shared variable storing the norm threshold.
    axis : set
        The axes over which the norm is computed (empty for "all").

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axis = pack(axis) if axis is not None else ()
        unique_axes = set(axis)
        # Reject bad input before allocating any shared variable.
        if len(axis) != len(unique_axes):
            raise ValueError("axis must be unique")
        self.axis = unique_axes
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the step that keeps the updated value within the norm bound.

        Raises
        ------
        ValueError
            If any configured axis is outside ``[-ndim, ndim)`` for
            `previous_step`.

        """
        ndim = previous_step.ndim
        # Check both bounds: a negative axis below -ndim is as invalid as
        # a positive one at or above ndim.
        if any(ax >= ndim or ax < -ndim for ax in self.axis):
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
                self.axis, parameter, previous_step.ndim))
        if not self.axis:
            norms = l2_norm([parameter - previous_step])
        else:
            squares = tensor.sqr(parameter - previous_step)
            # Sum one axis at a time with keepdims=True so that the result
            # still broadcasts against the parameter.
            norms = tensor.sqrt(
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
                       sorted(self.axis), squares))
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) *
                          (parameter - previous_step))
        return tensor.switch(norms > self.threshold,
                             shrinking_step,
                             previous_step), ()
813
814
815
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Each step is divided by the square root of the running sum of the
    squared previous steps seen so far for that parameter.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Attributes
    ----------
    learning_rate : shared variable
        The shared variable storing the step size.
    epsilon : shared variable
        The shared variable storing the stabilizing constant.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Return the AdaGrad step and the accumulator update."""
        # The buffer name carries the parameter name when there is one.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # The current squared step is included in the sum before scaling.
        ssq_t = tensor.sqr(previous_step) + ssq
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
856
857
858
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.9.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.999.
    epsilon : float, optional
        Constant added to the square root of the second moment estimate
        in the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Factor by which `beta1` is multiplied, raised to the number of
        steps already taken.
        Default value is set to 1.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.9, beta2=0.999, epsilon=1e-8,
                 decay_factor=1):
        hyperparameters = (("learning_rate", learning_rate),
                           ("beta1", beta1),
                           ("beta2", beta2),
                           ("epsilon", epsilon),
                           ("decay_factor", decay_factor))
        # Each hyperparameter lives in a shared variable named after it
        # and is exposed as an attribute of the same name.
        for attribute, value in hyperparameters:
            shared = shared_floatx(value, attribute)
            add_role(shared, ALGORITHM_HYPERPARAMETER)
            setattr(self, attribute, shared)

    def compute_step(self, parameter, previous_step):
        """Return the Adam step and the updates of its three buffers."""
        first_moment = _create_algorithm_buffer_for(parameter, 'mean')
        second_moment = _create_algorithm_buffer_for(parameter, 'variance')
        # A separate step counter is created on every call.
        time = shared_floatx(0., 'time')
        add_role(time, ALGORITHM_BUFFER)

        next_time = time + 1
        beta1_t = self.beta1 * self.decay_factor ** (next_time - 1)

        # Bias-corrected step size.
        rate_numerator = (self.learning_rate *
                          tensor.sqrt(1. - self.beta2**next_time))
        effective_rate = rate_numerator / (1. - beta1_t**next_time)

        # Exponential moving averages of the step and of its square.
        first_moment_t = (beta1_t * first_moment +
                          (1. - beta1_t) * previous_step)
        second_moment_t = (self.beta2 * second_moment +
                           (1. - self.beta2) * tensor.sqr(previous_step))

        denominator = tensor.sqrt(second_moment_t) + self.epsilon
        step = effective_rate * first_moment_t / denominator

        updates = [(first_moment, first_moment_t),
                   (second_moment, second_moment_t),
                   (time, next_time)]

        return step, updates
916
917
918
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    A step (the parameter update of a single shared variable) whose
    elements sum to ``inf`` or ``NaN`` is replaced with a step that
    rescales the parameter instead.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Return `previous_step`, or a rescaling step if it is not finite."""
        # The check is done on the sum of the step's elements.
        total = tensor.sum(previous_step)
        nan_flag = tensor.isnan(total)
        inf_flag = tensor.isinf(total)
        # With scaler == 1 this fallback is zero, i.e. "no change".
        fallback = (1 - self.scaler) * parameter
        guarded = tensor.switch(
            (nan_flag + inf_flag) > 0, fallback, previous_step)
        return guarded, []
951
952
953
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Example applications include clipping steps on only certain parameters,
    or scaling a certain kind of parameter's updates (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.variables = frozenset(variables)
        self.step_rule = step_rule

    def compute_steps(self, previous_steps):
        """Run the wrapped rule on the selected variables only.

        The returned steps keep the ordering of `previous_steps`; entries
        the wrapped rule did not produce are passed through unchanged.

        """
        selected = dict_subset(previous_steps, self.variables)
        restricted_steps, updates = self.step_rule.compute_steps(selected)

        merged = OrderedDict()
        for parameter, untouched_step in previous_steps.items():
            if parameter in restricted_steps:
                merged[parameter] = restricted_steps[parameter]
            else:
                merged[parameter] = untouched_step
        return merged, updates
983