Completed
Pull Request — master (#1079)
by David
04:41
created

UpdatesAlgorithm.__init__()   A

Complexity

Conditions 3

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
c 1
b 0
f 0
dl 0
loc 7
rs 9.4285
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
24
    buf = shared_floatx_zeros_matching(param, *args, **kwargs)
25
    buf.tag.for_parameter = param
26
    add_role(buf, ALGORITHM_BUFFER)
27
    return buf
28
29
30
@add_metaclass(ABCMeta)
31
class TrainingAlgorithm(object):
32
    """Base class for training algorithms.
33
34
    A training algorithm object has a simple life-cycle.
35
    First it is initialized by calling its :meth:`initialize` method.
36
    At this stage, for instance, Theano functions can be compiled.
37
    After that the :meth:`process_batch` method is repeatedly
38
    called with a batch of training data as a parameter.
39
40
    """
41
    @abstractmethod
42
    def initialize(self, **kwargs):
43
        """Initialize the training algorithm."""
44
        pass
45
46
    @abstractmethod
47
    def process_batch(self, batch):
48
        """Process a batch of training data.
49
50
        Attributes
51
        ----------
52
        batch : dict
53
            A dictionary of (source name, data) pairs.
54
55
        """
56
        pass
57
58
59
variable_mismatch_error = """
60
61
Blocks tried to match the sources ({sources}) of the training dataset to \
62
the names of the Theano variables ({variables}), but failed to do so. \
63
If you want to train on a subset of the sources that your dataset provides, \
64
pass the `sources` keyword argument to its constructor. Or pass \
65
on_unused_sources='warn' or on_unused_sources='ignore' to \
66
the GradientDescent algorithm."""
67
68
source_missing_error = """
69
70
Blocks didn't find all the sources ({sources}) of the training dataset \
71
that match the names of the Theano variables ({variables})."""
72
73
74
determinism_error = """Cannot infer parameter list in a fixed order.
0 ignored issues
show
Bug introduced by
A suspicious escape sequence \ was found. Did you maybe forget to add an r prefix?

Escape sequences in Python are generally interpreted according to rules similar to standard C. Only if strings are prefixed with r or R are they interpreted as regular expressions.

The escape sequence that was used indicates that you might have intended to write a regular expression.

Learn more about the available escape sequences. in the Python documentation.

Loading history...
75
76
Because dictionaries are unordered (and Python uses randomized hashing, \
77
which can change the iteration order over the same dictionary from one \
78
interpreter session to the next), Blocks cannot infer the parameters list \
79
from a plain dictionary of gradients in an order that is reproducible \
80
across interpreter sessions; please either specify the parameters \
81
explicitly or pass gradients as an OrderedDict (though exercise care in \
82
constructing that OrderedDict, as an OrderedDict \ created by iterating \
83
over an unordered iterable (e.g. a dict) will still \ have an arbitrary \
84
and unpredictable order that could cause problems with \
85
reproducibility)."""
86
87
88
class UpdatesAlgorithm(TrainingAlgorithm):
89
    """Base class for algorithms that use Theano functions with updates.
90
91
    Parameters
92
    ----------
93
    updates : list of tuples or :class:`~collections.OrderedDict`
94
        The updates that should be performed.
95
    theano_func_kwargs : dict, optional
96
        A passthrough to `theano.function` for additional arguments.
97
        Useful for passing `profile` or `mode` arguments to the theano
98
        function that will be compiled for the algorithm.
99
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
100
        Controls behavior when not all sources in a batch are used
101
        (i.e. there is no variable with a matching name in the inputs
102
        of the computational graph of the updates).
103
104
    Attributes
105
    ----------
106
    updates : list of :class:`~tensor.TensorSharedVariable` updates
107
        Updates to be done for every batch. It is required that the
108
        updates are done using the old values of optimized parameters.
109
110
    """
111
    def __init__(self, updates=None, theano_func_kwargs=None,
112
                 on_unused_sources='raise', **kwargs):
113
        self.updates = [] if updates is None else updates
114
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
115
                                   is not None else dict())
116
        self.on_unused_sources = on_unused_sources
117
        super(UpdatesAlgorithm, self).__init__(**kwargs)
118
119
    def initialize(self):
120
        logger.info("Initializing the training algorithm")
121
        update_values = [new_value for _, new_value in self.updates]
122
        logger.debug("Inferring graph inputs...")
123
        self.inputs = ComputationGraph(update_values).inputs
124
        logger.debug("Compiling training function...")
125
        self._function = theano.function(
126
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
127
        logger.info("The training algorithm is initialized")
128
129
    @property
130
    def updates(self):
131
        return self._updates
132
133
    @updates.setter
134
    def updates(self, value):
135
        self._updates = value
136
137
    def add_updates(self, updates):
138
        """Add updates to the training process.
139
140
        The updates will be done _before_ the parameters are changed.
141
142
        Parameters
143
        ----------
144
        updates : list of tuples or :class:`~collections.OrderedDict`
145
            The updates to add.
146
147
        """
148
        if isinstance(updates, OrderedDict):
149
            updates = list(updates.items())
150
        if not isinstance(updates, list):
151
            raise ValueError
152
        self.updates.extend(updates)
153
154
    def _validate_source_names(self, batch):
155
        in_names = [v.name for v in self.inputs]
156
157
        if not set(in_names).issubset(set(batch.keys())):
158
            raise ValueError("Didn't find all sources: " +
159
                             source_missing_error.format(
160
                                 sources=batch.keys(),
161
                                 variables=in_names))
162
        if not set(batch.keys()).issubset(set(in_names)):
163
            if self.on_unused_sources == 'ignore':
164
                pass
165
            elif self.on_unused_sources == 'warn':
166
                if not hasattr(self, '_unused_source_warned'):
167
                    logger.warn(variable_mismatch_error.format(
168
                        sources=batch.keys(),
169
                        variables=in_names))
170
                self._unused_source_warned = True
171
            elif self.on_unused_sources == 'raise':
172
                raise ValueError(
173
                    "mismatch of variable names and data sources" +
174
                    variable_mismatch_error.format(
175
                        sources=batch.keys(),
176
                        variables=in_names))
177
            else:
178
                raise ValueError("Wrong value of on_unused_sources: {}."
179
                                 .format(self.on_unused_sources))
180
181
    def process_batch(self, batch):
182
        self._validate_source_names(batch)
183
        ordered_batch = [batch[v.name] for v in self.inputs]
184
        self._function(*ordered_batch)
185
186
187
class GradientDescent(UpdatesAlgorithm):
188
    """A base class for all gradient descent algorithms.
189
190
    By "gradient descent" we mean a training algorithm of the following
191
    form:
192
193
    .. code-block::  python
194
195
        for batch in data:
196
            steps = step_rule.compute_steps(parameters,
197
                                            gradients_wr_parameters)
198
            for parameter in parameters:
199
                parameter -= steps[parameter]
200
201
    Note, that the step is *subtracted, not added*! This is done in order
202
    to make step rule chaining possible.
203
204
    Parameters
205
    ----------
206
    cost : :class:`~tensor.TensorVariable`, optional
207
        The objective to be minimized. Unused if `gradients` is specified.
208
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
209
        The parameters to be tuned. If not provided, inferred from the
210
        keys of `gradients` (in which case `gradients` *must* be an
211
        `OrderedDict`).
212
    step_rule : instance of :class:`StepRule`, optional
213
        An object encapsulating most of the algorithm's logic. Its
214
        `compute_steps` method is called to get Theano expression for
215
        steps.  Note, that the step rule might have a state, e.g. to
216
        remember a weighted sum of gradients from previous steps like it is
217
        done in gradient descent with momentum. If ``None``, an instance of
218
        :class:`Scale` is created.
219
    gradients : OrderedDict, optional
220
        A dictionary mapping a parameter to an expression for the cost's
221
        gradient with respect to the parameter. If ``None``, the gradient
222
        are taken automatically using :func:`theano.gradient.grad`.
223
    known_grads : dict, optional
224
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
225
        Useful when you know the [approximate] gradients of some
226
        sub-expressions and would like Theano to use that information
227
        to compute parameter gradients. Only makes sense when `gradients`
228
        is `None`.
229
    consider_constant : list, optional
230
        A passthrough to `theano.tensor.grad`'s `consider_constant`
231
        argument.  A list of expressions through which gradients will not
232
        be backpropagated. Only makes sense when `gradients` is `None`.
233
234
    Attributes
235
    ----------
236
    gradients : OrderedDict
237
        The gradient dictionary.
238
    step_rule : instance of :class:`StepRule`
239
        The step rule.
240
241
    Notes
242
    -----
243
    Changing `updates` attribute or calling `add_updates` after
244
    the `initialize` method is called will have no effect.
245
246
    `gradients` must be an `OrderedDict` if `parameters` is unspecified
247
    because ordinary dictionaries have an unpredictable iteration
248
    order due to hash randomization (which is enabled by default since
249
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
250
    when combined with Theano's heuristic graph optimizations, can cause
251
    serious reproducibility issues.
252
253
    .. todo::
254
255
       Some shared variables are not parameters (e.g. those created by
256
       random streams).
257
258
    .. todo::
259
260
       Due to a rather premature status of the :class:`ComputationGraph`
261
       class the parameter used only inside scans are not fetched
262
       currently.
263
264
    """
265
    def __init__(self, cost=None, parameters=None, step_rule=None,
266
                 gradients=None, known_grads=None, consider_constant=None,
267
                 **kwargs):
268
        # Set initial values for cost, parameters, gradients.
269
        self.cost = cost
270
        self.parameters = parameters
271
        self.gradients = gradients
272
273
        # If we don't have gradients, we'll need to infer them from the
274
        # cost and the parameters, both of which must not be None.
275
        if not self.gradients:
276
            self.gradients = self._compute_gradients(known_grads,
277
                                                     consider_constant)
278
        else:
279
            if cost is not None:
280
                logger.warning(('{}: gradients already specified directly; '
281
                                'cost is unused.'
282
                                .format(self.__class__.__name__)))
283
            if self.parameters is None and isinstance(gradients, OrderedDict):
284
                # If the dictionary is ordered, it's safe to use the keys
285
                # as they have a deterministic order.
286
                self.parameters = list(self.gradients.keys())
287
            elif self.parameters is not None:
288
                # If parameters and gradients.keys() don't match we can
289
                # try to recover if gradients is ordered.
290
                if set(self.parameters) != set(self.gradients.keys()):
291
                    logger.warn("Specified parameters list does not match "
292
                                "keys in provided gradient dictionary; "
293
                                "using parameters inferred from gradients")
294
                    if not isinstance(self.gradients, OrderedDict):
295
                        raise ValueError(determinism_error)
296
                    self.parameters = list(self.gradients.keys())
297
            else:
298
                # self.parameters is not None, and gradients isn't
299
                # an OrderedDict. We can't do anything safe.
300
                raise ValueError(determinism_error)
301
            if known_grads:
302
                raise ValueError("known_grads has no effect when gradients "
303
                                 "are passed in")
304
            if consider_constant is not None:
305
                raise ValueError("consider_constant has no effect when "
306
                                 "gradients are passed in")
307
308
        # The order in which the different gradient terms appears
309
        # here matters, as floating point addition is non-commutative (and
310
        # Theano's graph optimizations are not order-independent).
311
        # This is why we do not use .values().
312
        gradient_values = [self.gradients[p] for p in self.parameters]
313
        self.total_gradient_norm = (l2_norm(gradient_values)
314
                                    .copy(name="total_gradient_norm"))
315
316
        self.step_rule = step_rule if step_rule else Scale()
317
        logger.debug("Computing parameter steps...")
318
        self.steps, self.step_rule_updates = (
319
            self.step_rule.compute_steps(self.gradients))
320
321
        # Same as gradient_values above: the order may influence a
322
        # bunch of things, so enforce a consistent one (don't use
323
        # .values()).
324
        step_values = [self.steps[p] for p in self.parameters]
325
        self.total_step_norm = (l2_norm(step_values)
326
                                .copy(name="total_step_norm"))
327
328
        # Once again, iterating on gradients may not be deterministically
329
        # ordered if it is not an OrderedDict. We add the updates here in
330
        # the order specified in self.parameters. Keep it this way to
331
        # maintain reproducibility.
332
        kwargs.setdefault('updates', []).extend(
333
            itertools.chain(((parameter, parameter - self.steps[parameter])
334
                             for parameter in self.parameters),
335
                            self.step_rule_updates)
336
        )
337
        super(GradientDescent, self).__init__(**kwargs)
338
339
    def _compute_gradients(self, known_grads, consider_constant):
340
        if self.cost is None:
341
            raise ValueError("can't infer gradients; no cost specified")
342
        elif self.parameters is None or len(self.parameters) == 0:
343
            raise ValueError("can't infer gradients; no parameters "
344
                             "specified")
345
        # While this strictly speaking could be a dict and not an
346
        # OrderedDict (because we iterate over it in the order of
347
        # self.parameters), this guards a little bit against
348
        # nondeterminism introduced by future refactoring.
349
        logger.info("Taking the cost gradient")
350
        gradients = OrderedDict(
351
            equizip(self.parameters, tensor.grad(
352
                self.cost, self.parameters,
353
                known_grads=known_grads,
354
                consider_constant=consider_constant)))
355
        logger.info("The cost gradient computation graph is built")
356
        return gradients
357
358
359
@add_metaclass(ABCMeta)
360
class StepRule(object):
361
    """A rule to compute steps for a gradient descent algorithm."""
362
    def compute_step(self, parameter, previous_step):
363
        """Build a Theano expression for the step for a parameter.
364
365
        This method is called by default implementation of
366
        :meth:`compute_steps`, it relieves from writing a loop each time.
367
368
        Parameters
369
        ----------
370
        parameter : :class:`~tensor.TensorSharedVariable`
371
            The parameter.
372
        previous_step : :class:`~tensor.TensorVariable`
373
            Some quantity related to the gradient of the cost with respect
374
            to the parameter, either the gradient itself or a step in a
375
            related direction.
376
377
        Returns
378
        -------
379
        step : :class:`~theano.Variable`
380
            Theano variable for the step to take.
381
        updates : list
382
            A list of tuples representing updates to be performed. This
383
            is useful for stateful rules such as :class:`Momentum` which
384
            need to update shared variables after itetations.
385
386
        """
387
        raise NotImplementedError
388
389
    def compute_steps(self, previous_steps):
390
        """Build a Theano expression for steps for all parameters.
391
392
        Override this method if you want to process the steps
393
        with respect to all parameters as a whole, not parameter-wise.
394
395
        Parameters
396
        ----------
397
        previous_steps : OrderedDict
398
            An :class:`~OrderedDict` of
399
            (:class:`~tensor.TensorSharedVariable`
400
            :class:`~tensor.TensorVariable`) pairs. The keys are the
401
            parameters being trained, the values are the expressions for
402
            quantities related to gradients of the cost with respect to
403
            the parameters, either the gradients themselves or steps in
404
            related directions.
405
406
        Returns
407
        -------
408
        steps : OrderedDict
409
            A dictionary of the proposed steps in the same form as
410
            `previous_steps`.
411
        updates : list
412
            A list of tuples representing updates to be performed.
413
414
        """
415
        parameter_wise = [self.compute_step(parameter,
416
                                            previous_steps[parameter])
417
                          for parameter in previous_steps]
418
        steps, updates = equizip(*parameter_wise)
419
        steps = OrderedDict((parameter, step) for parameter, step
420
                            in equizip(previous_steps.keys(), steps))
421
        updates = list(itertools.chain(*updates))
422
        return steps, updates
423
424
425
class CompositeRule(StepRule):
426
    """Chains several step rules.
427
428
    Parameters
429
    ----------
430
    components : list of :class:`StepRule`
431
        The learning rules to be chained. The rules will be applied in the
432
        order as given.
433
434
    """
435
    def __init__(self, components):
436
        self.components = components
437
438
    def compute_steps(self, previous_steps):
439
        steps = previous_steps
440
        updates = []
441
        for rule in self.components:
442
            steps, more_updates = rule.compute_steps(steps)
443
            updates += more_updates
444
        return steps, updates
445
446
447
class Scale(StepRule):
448
    """A step in the direction proportional to the previous step.
449
450
    If used in :class:`GradientDescent` alone, this step rule implements
451
    steepest descent.
452
453
    Parameters
454
    ----------
455
    learning_rate : float
456
        The learning rate by which the previous step is multiplied to
457
        produce the step.
458
459
    Attributes
460
    ----------
461
    learning_rate : :class:`~tensor.TensorSharedVariable`
462
        The shared variable storing the learning rate used.
463
464
    """
465
    def __init__(self, learning_rate=1.0):
466
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
467
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
468
469
    def compute_step(self, parameter, previous_step):
470
        return self.learning_rate * previous_step, []
471
472
473
class BasicMomentum(StepRule):
474
    """Accumulates step with exponential discount.
475
476
    Parameters
477
    ----------
478
    momentum : float, optional
479
        The momentum coefficient. Defaults to 0.
480
481
    Notes
482
    -----
483
    This step rule is intended to be used in conjunction with another
484
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
485
    experience, look at :class:`Momentum`.
486
487
    """
488
    def __init__(self, momentum=0.):
489
        self.momentum = shared_floatx(momentum, "momentum")
490
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)
491
492
    def compute_step(self, parameter, previous_step):
493
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
494
        step = self.momentum * velocity + previous_step
495
        updates = [(velocity, step)]
496
        return step, updates
497
498
499
class Momentum(CompositeRule):
500
    """Accumulates step with exponential discount.
501
502
    Combines :class:`BasicMomentum` and :class:`Scale` to form the
503
    usual momentum step rule.
504
505
    Parameters
506
    ----------
507
    learning_rate : float, optional
508
        The learning rate by which the previous step scaled. Defaults to 1.
509
    momentum : float, optional
510
        The momentum coefficient. Defaults to 0.
511
512
    Attributes
513
    ----------
514
    learning_rate : :class:`~tensor.SharedVariable`
515
        A variable for learning rate.
516
    momentum : :class:`~tensor.SharedVariable`
517
        A variable for momentum.
518
519
    See Also
520
    --------
521
    :class:`SharedVariableModifier`
522 View Code Duplication
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
523
    """
524
    def __init__(self, learning_rate=1.0, momentum=0.):
525
        scale = Scale(learning_rate=learning_rate)
526
        basic_momentum = BasicMomentum(momentum=momentum)
527
        self.learning_rate = scale.learning_rate
528
        self.momentum = basic_momentum.momentum
529
        self.components = [scale, basic_momentum]
530
531
532
class AdaDelta(StepRule):
533
    """Adapts the step size over time using only first order information.
534
535
    Parameters
536
    ----------
537
    decay_rate : float, optional
538
        Decay rate in [0, 1]. Defaults to 0.95.
539
    epsilon : float, optional
540
        Stabilizing constant for RMS. Defaults to 1e-6.
541
542
    Notes
543
    -----
544
    For more information, see [ADADELTA]_.
545
546
    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
547
       Rate Method*, arXiv:1212.5701.
548
549
    """
550
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
551
        if not 0.0 <= decay_rate <= 1.0:
552
            raise ValueError("decay rate needs to be in [0, 1]")
553
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
554
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
555
        self.epsilon = shared_floatx(epsilon, "epsilon")
556
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
557
558
    def compute_step(self, parameter, previous_step):
559
        mean_square_step_tm1 = _create_algorithm_buffer_for(
560
            parameter, "mean_square_step_tm1")
561
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
562
            parameter, "mean_square_delta_x_tm1")
563
564
        mean_square_step_t = (
565
            self.decay_rate * mean_square_step_tm1 +
566
            (1 - self.decay_rate) * tensor.sqr(previous_step)
567
        )
568
569
        rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon)
570
        rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon)
571
        delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step
572
573
        mean_square_delta_x_t = (
574
            self.decay_rate * mean_square_delta_x_tm1 +
575
            (1 - self.decay_rate) * tensor.sqr(delta_x_t)
576
        )
577
578
        step = delta_x_t
579
        updates = [(mean_square_step_tm1, mean_square_step_t),
580
                   (mean_square_delta_x_tm1, mean_square_delta_x_t)]
581
        return step, updates
582
583
584
class BasicRMSProp(StepRule):
585
    """Scales the step size by a running average of the recent step norms.
586
587
    Parameters
588
    ----------
589
    decay_rate : float, optional
590
        How fast the running average decays, value in [0, 1]
591
        (lower is faster).  Defaults to 0.9.
592
    max_scaling : float, optional
593
        Maximum scaling of the step size, in case the running average is
594
        really small. Needs to be greater than 0. Defaults to 1e5.
595
596
    Notes
597
    -----
598
    This step rule is intended to be used in conjunction with another
599
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
600
    experience, look at :class:`RMSProp`.
601
602
    In general, this step rule should be used _before_ other step rules,
603
    because it has normalization properties that may undo their work.
604
    For instance, it should be applied first when used in conjunction
605
    with :class:`Scale`.
606
607
    For more information, see [Hint2014]_.
608
609
    """
610
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
611
        if not 0.0 <= decay_rate <= 1.0:
612
            raise ValueError("decay rate needs to be in [0, 1]")
613
        if max_scaling <= 0:
614
            raise ValueError("max. scaling needs to be greater than 0")
615
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
616
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
617
        self.epsilon = 1. / max_scaling
618
619
    def compute_step(self, parameter, previous_step):
620
        mean_square_step_tm1 = _create_algorithm_buffer_for(
621
            parameter, "mean_square_step_tm1")
622
        mean_square_step_t = (
623
            self.decay_rate * mean_square_step_tm1 +
624
            (1 - self.decay_rate) * tensor.sqr(previous_step))
625
        rms_step_t = tensor.maximum(
626
            tensor.sqrt(mean_square_step_t), self.epsilon)
627
        step = previous_step / rms_step_t
628
        updates = [(mean_square_step_tm1, mean_square_step_t)]
629
        return step, updates
630
631
632
class RMSProp(CompositeRule):
633
    """Scales the step size by a running average of the recent step norms.
634
635
    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
636
    described in [Hint2014]_.
637
638
    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
639
       lecture 6a,
640
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
641
642
    Parameters
643
    ----------
644
    learning_rate : float, optional
645
        The learning rate by which the previous step scaled. Defaults to 1.
646
    decay_rate : float, optional
647
        How fast the running average decays (lower is faster).
648
        Defaults to 0.9.
649
    max_scaling : float, optional
650
        Maximum scaling of the step size, in case the running average is
651
        really small. Defaults to 1e5.
652
653
    Attributes
654
    ----------
655
    learning_rate : :class:`~tensor.SharedVariable`
656
        A variable for learning rate.
657
    decay_rate : :class:`~tensor.SharedVariable`
658
        A variable for decay rate.
659
660
    See Also
661
    --------
662
    :class:`SharedVariableModifier`
663
664
    """
665
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
666
        basic_rms_prop = BasicRMSProp(decay_rate=decay_rate,
667
                                      max_scaling=max_scaling)
668
        scale = Scale(learning_rate=learning_rate)
669
        self.learning_rate = scale.learning_rate
670
        self.decay_rate = basic_rms_prop.decay_rate
671
        self.components = [basic_rms_prop, scale]
672
673
674
class StepClipping(StepRule):
675
    """Rescales an entire step if its L2 norm exceeds a threshold.
676
677
    When the previous steps are the gradients, this step rule performs
678
    gradient clipping.
679
680
    Parameters
681
    ----------
682
    threshold : float, optional
683
        The maximum permitted L2 norm for the step. The step
684
        will be rescaled to be not higher than this quanity.
685
        If ``None``, no rescaling will be applied.
686
687
    Attributes
688
    ----------
689
    threshold : :class:`.tensor.TensorSharedVariable`
690
        The shared variable storing the clipping threshold used.
691
692
    """
693
    def __init__(self, threshold=None):
694
        if threshold:
695
            self.threshold = shared_floatx(threshold, "threshold")
696
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
697
698
    def compute_steps(self, previous_steps):
699
        if not hasattr(self, 'threshold'):
700
            return previous_steps
701
        norm = l2_norm(previous_steps.values())
702
        multiplier = tensor.switch(norm < self.threshold,
703
                                   1, self.threshold / norm)
704
        steps = OrderedDict(
705
            (parameter, step * multiplier)
706
            for parameter, step in previous_steps.items())
707
        return steps, []
708
709
710
class VariableClipping(StepRule):
711
    """Clip the maximum norm of individual variables along certain axes.
712
713
    This :class:`StepRule` can be used to implement L2 norm constraints on
714
    e.g. the weight vectors of individual hidden units, convolutional
715
    filters or entire weight tensors. Combine with :class:`Restrict`
716
    (and possibly :class:`CompositeRule`), to apply such constraints only
717
    to certain variables and/or apply different norm constraints to
718
    different variables.
719
720
    Parameters
721
    ----------
722
    threshold : float
723
        Maximum norm for a given (portion of a) tensor.
724
    axis : int or iterable, optional
725 View Code Duplication
        An integer single axis, or an iterable collection of integer
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
726
        axes over which to sum in order to calculate the L2 norm. If
727
        `None` (the default), the norm is computed over all elements
728
        of the tensor.
729
730
    Notes
731
    -----
732
    Because of the way the :class:`StepRule` API works, this particular
733
    rule implements norm clipping of the value *after* update in the
734
    following way: it computes ``parameter - previous_step``, scales it
735
    to have (possibly axes-wise) norm(s) of at most `threshold`,
736
    then subtracts *that* value from `parameter` to yield an 'equivalent
737
    step' that respects the desired norm constraints. This procedure
738
    implicitly assumes one is doing simple (stochastic) gradient descent,
739
    and so steps computed by this step rule may not make sense for use
740
    in other contexts.
741
742
    Investigations into max-norm regularization date from [Srebro2005]_.
743
    The first appearance of this technique as a regularization method
744
    for the weight vectors of individual hidden units in feed-forward
745
    neural networks may be [Hinton2012]_.
746
747
    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
748
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
749
       on Learning Theory (COLT)*, June 2005.
750
751
    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
752
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
753
       "Improving neural networks by preventing co-adaptation of
754
       feature detectors". arXiv:1207.0580.
755
756
    """
757
    def __init__(self, threshold, axis=None):
758
        axis = pack(axis) if axis is not None else ()
759
        self.axis = set(axis)
760
        self.threshold = shared_floatx(threshold, "threshold")
761
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
762
        if len(axis) != len(self.axis):
763
            raise ValueError("axis must be unique")
764
765
    def compute_step(self, parameter, previous_step):
766
        if any(ax >= previous_step.ndim for ax in self.axis):
767
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
768
                self.axis, parameter, previous_step.ndim))
769
        if len(self.axis) == 0:
770
            norms = l2_norm([parameter - previous_step])
771
        else:
772
            squares = tensor.sqr(parameter - previous_step)
773
            norms = tensor.sqrt(
774
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
775
                       sorted(self.axis), squares))
776
        # We want a step s* that is the same as scaling
777
        # (parameter - previous_step) by threshold / norm
778
        # when threshold < norm.
779
        shrinking_step = (parameter -
780
                          (self.threshold / norms) *
781
                          (parameter - previous_step))
782
        return tensor.switch(norms > self.threshold,
783
                             shrinking_step,
784
                             previous_step), ()
785
786
787
class AdaGrad(StepRule):
788
    """Implements the AdaGrad learning rule.
789
790
    Parameters
791
    ----------
792
    learning_rate : float, optional
793
        Step size.
794
        Default value is set to 0.0002.
795
    epsilon : float, optional
796
        Stabilizing constant for one over root of sum of squares.
797
        Defaults to 1e-6.
798
799
    Notes
800
    -----
801
    For more information, see [ADAGRAD]_.
802
803
    .. [ADADGRAD] Duchi J, Hazan E, Singer Y.,
804
       *Adaptive subgradient methods for online learning and
805
        stochastic optimization*,
806
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
807
808
    """
809
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
810
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
811
        self.epsilon = shared_floatx(epsilon, "epsilon")
812
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
813
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
814
815
    def compute_step(self, parameter, previous_step):
816
        name = 'adagrad_sqs'
817
        if parameter.name:
818
            name += '_' + parameter.name
819
        ssq = _create_algorithm_buffer_for(parameter, name=name)
820
821
        ssq_t = (tensor.sqr(previous_step) + ssq)
822
        step = (self.learning_rate * previous_step /
823
                (tensor.sqrt(ssq_t) + self.epsilon))
824
825
        updates = [(ssq, ssq_t)]
826
827
        return step, updates
828
829
830
class Adam(StepRule):
831
    """Adam optimizer as described in [King2014]_.
832
833
    .. [King2014] Diederik Kingma, Jimmy Ba,
834
       *Adam: A Method for Stochastic Optimization*,
835
       http://arxiv.org/abs/1412.6980
836
837
    Parameters
838
    ----------
839
    learning_rate : float, optional
840
        Step size.
841
        Default value is set to 0.002.
842
    beta1 : float, optional
843
        Exponential decay rate for the first moment estimates.
844
        Default value is set to 0.1.
845
    beta2 : float, optional
846
        Exponential decay rate for the second moment estimates.
847
        Default value is set to 0.001.
848
    epsilon : float, optional
849
        Default value is set to 1e-8.
850
    decay_factor : float, optional
851
        Default value is set to 1 - 1e-8.
852
853
    """
854
    def __init__(self, learning_rate=0.002,
855
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
856
                 decay_factor=(1 - 1e-8)):
857
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
858
        self.beta1 = shared_floatx(beta1, "beta1")
859
        self.beta2 = shared_floatx(beta2, "beta2")
860
        self.epsilon = shared_floatx(epsilon, "epsilon")
861
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
862
        for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon,
863
                      self.decay_factor]:
864
            add_role(param, ALGORITHM_HYPERPARAMETER)
865
866
    def compute_step(self, parameter, previous_step):
867
        mean = _create_algorithm_buffer_for(parameter, 'mean')
868
        variance = _create_algorithm_buffer_for(parameter, 'variance')
869
        time = shared_floatx(0., 'time')
870
        add_role(time, ALGORITHM_BUFFER)
871
872
        t1 = time + 1
873
        learning_rate = (self.learning_rate *
874
                         tensor.sqrt((1. - (1. - self.beta2)**t1)) /
875
                         (1. - (1. - self.beta1)**t1))
876
        beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1)
877
        mean_t = beta_1t * previous_step + (1. - beta_1t) * mean
878
        variance_t = (self.beta2 * tensor.sqr(previous_step) +
879
                      (1. - self.beta2) * variance)
880
        step = (learning_rate * mean_t /
881
                (tensor.sqrt(variance_t) + self.epsilon))
882
883
        updates = [(mean, mean_t),
884
                   (variance, variance_t),
885
                   (time, t1)]
886
887
        return step, updates
888
889
890
class RemoveNotFinite(StepRule):
891
    """A step rule that skips steps with non-finite elements.
892
893
    Replaces a step (the parameter update of a single shared variable)
894
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
895
    step rescaling the parameters.
896
897
    Parameters
898
    ----------
899
    scaler : float, optional
900
        The scaling applied to the parameter in case the step contains
901
        non-finite elements. Defaults to 1, which means that parameters
902
        will not be changed.
903
904
    Notes
905
    -----
906
    This rule should be applied last!
907
908
    This trick was originally used in the GroundHog_ framework.
909
910
    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog
911
912
    """
913
    def __init__(self, scaler=1):
914
        self.scaler = scaler
915
916
    def compute_step(self, parameter, previous_step):
917
        step_sum = tensor.sum(previous_step)
918
        not_finite = (tensor.isnan(step_sum) +
919
                      tensor.isinf(step_sum))
920
        step = tensor.switch(
921
            not_finite > 0, (1 - self.scaler) * parameter, previous_step)
922
        return step, []
923
924
925
class Restrict(StepRule):
926
    """Applies a given :class:`StepRule` only to certain variables.
927
928
    Example applications include clipping steps on only certain parameters,
929
    or scaling a certain kind of parameter's updates (e.g. adding an
930
    additional scalar multiplier to the steps taken on convolutional
931
    filters).
932
933
    Parameters
934
    ----------
935
    step_rule : :class:`StepRule`
936
        The :class:`StepRule` to be applied on the given variables.
937
    variables : iterable
938
        A collection of Theano variables on which to apply `step_rule`.
939
        Variables not appearing in this collection will not have
940
        `step_rule` applied to them.
941
942
    """
943
    def __init__(self, step_rule, variables):
944
        self.step_rule = step_rule
945
        self.variables = frozenset(variables)
946
947
    def compute_steps(self, previous_steps):
948
        filtered_previous_steps = dict_subset(previous_steps, self.variables)
949
        steps, updates = self.step_rule.compute_steps(filtered_previous_steps)
950
        actual = OrderedDict((parameter, steps[parameter])
951
                             if parameter in steps
952
                             else (parameter, previous_steps[parameter])
953
                             for parameter in previous_steps)
954
        return actual, updates
955