Completed
Pull Request — master (#1079)
by David
04:59
created

UpdatesAlgorithm.__init__()   A

Complexity

Conditions 3

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 3
c 1
b 0
f 0
dl 0
loc 7
rs 9.4285
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create a shared buffer that an algorithm keeps for a parameter.

    The buffer is built by :func:`shared_floatx_zeros_matching` from
    `param`; any additional positional and keyword arguments are
    forwarded to that helper unchanged. The resulting variable remembers
    `param` in ``tag.for_parameter`` and is given the
    ``ALGORITHM_BUFFER`` role before being returned.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Keep a back-reference so the buffer can be traced to its parameter.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm object is simple. It is first
    initialized through its :meth:`initialize` method; this is the stage
    at which, for instance, Theano functions can be compiled. Afterwards
    :meth:`process_batch` is called repeatedly, each time with a batch of
    training data as its argument.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
# Message templates for source/variable mismatches. Both are filled in
# with ``str.format`` using the ``sources`` and ``variables`` fields.
variable_mismatch_error = """

Blocks tried to match the sources ({sources}) of the training dataset to \
the names of the Theano variables ({variables}), but failed to do so. \
If you want to train on a subset of the sources that your dataset provides, \
pass the `sources` keyword argument to its constructor. Or pass \
on_unused_sources='warn' or on_unused_sources='ignore' to \
the GradientDescent algorithm."""

source_missing_error = """

Blocks didn't find all the sources ({sources}) of the training dataset \
that match the names of the Theano variables ({variables})."""

# Message used when a reproducible parameter order cannot be inferred.
# Only end-of-line backslashes (line continuations) may appear inside
# this literal; a backslash followed by a space is an invalid escape
# sequence and would leak a literal backslash into the message.
determinism_error = """Cannot infer parameter list in a fixed order.

Because dictionaries are unordered (and Python uses randomized hashing, \
which can change the iteration order over the same dictionary from one \
interpreter session to the next), Blocks cannot infer the parameters list \
from a plain dictionary of gradients in an order that is reproducible \
across interpreter sessions; please either specify the parameters \
explicitly or pass gradients as an OrderedDict (though exercise care in \
constructing that OrderedDict, as an OrderedDict created by iterating \
over an unordered iterable (e.g. a dict) will still have an arbitrary \
and unpredictable order that could cause problems with \
reproducibility)."""
86
87
88
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources in a batch are used
        (i.e. there is no variable with a matching name in the inputs
        of the computational graph of the updates).

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.
        An :class:`~collections.OrderedDict` assigned to this attribute
        is stored as a list of its ``(variable, new_value)`` items.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 on_unused_sources='raise', **kwargs):
        self.updates = [] if updates is None else updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        self.on_unused_sources = on_unused_sources
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Infer the graph inputs and compile the training function."""
        logger.info("Initializing the training algorithm")
        update_values = [new_value for _, new_value in self.updates]
        logger.debug("Inferring graph inputs...")
        self.inputs = ComputationGraph(update_values).inputs
        logger.debug("Compiling training function...")
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    @property
    def updates(self):
        return self._updates

    @updates.setter
    def updates(self, value):
        # The rest of the class unpacks and extends `updates` as a list
        # of pairs, so an ordered mapping is converted once, here.
        if isinstance(value, OrderedDict):
            value = list(value.items())
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError("updates must be a list of tuples or an "
                             "OrderedDict, got {}".format(type(updates)))
        self.updates.extend(updates)

    def _validate_source_names(self, batch):
        """Check that the batch sources match the names of the inputs.

        Raises :class:`ValueError` if some input has no source with a
        matching name. Sources that match no input are handled according
        to `on_unused_sources`.

        """
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Warn only for the first offending batch.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch and run the compiled function on it."""
        self._validate_source_names(batch)
        # Feed the data in the order of the inferred graph inputs.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
185
186
187
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized. Unused if `gradients` is specified.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients` (in which case `gradients` *must* be an
        `OrderedDict`).
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : OrderedDict, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the gradient
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    `gradients` must be an `OrderedDict` if `parameters` is unspecified
    because ordinary dictionaries have an unpredictable iteration
    order due to hash randomization (which is enabled by default since
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
    when combined with Theano's heuristic graph optimizations, can cause
    serious reproducibility issues.

    An `updates` keyword argument, if given, is not modified; its
    entries are placed before the parameter updates and the step rule
    updates.

    .. todo::

       Some shared variables are not parameters (e.g. those created by
       random streams).

    .. todo::

       Due to a rather premature status of the :class:`ComputationGraph`
       class the parameter used only inside scans are not fetched
       currently.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    # Only announce the fallback when it actually happens.
                    logger.warning("Specified parameters list does not "
                                   "match keys in provided gradient "
                                   "dictionary; using parameters inferred "
                                   "from gradients")
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-associative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        # A new list is built so that a list passed by the caller is not
        # extended in place; `None` and an OrderedDict are accepted too.
        extra_updates = kwargs.pop('updates', None)
        if extra_updates is None:
            extra_updates = []
        elif isinstance(extra_updates, OrderedDict):
            extra_updates = extra_updates.items()
        all_updates = list(extra_updates)
        all_updates.extend((parameter, parameter - self.steps[parameter])
                           for parameter in self.parameters)
        all_updates.extend(self.step_rule_updates)
        kwargs['updates'] = all_updates

        # The required inputs are going to be the inputs of the graph
        # used in the actual step computation. This may be different from
        # those used in the gradient computation, if a step rule elects
        # to ignore certain parameters for some reason. This is usually
        # presumably an error, and basing the inputs list on the updates
        # ensures that the unused inputs check catches it.
        # Once again, we use step_values which is painstakingly ordered
        # in a reproducible manner.
        self.inputs = ComputationGraph(step_values).inputs
        super(GradientDescent, self).__init__(**kwargs)

    def _compute_gradients(self, known_grads, consider_constant):
        """Build the gradients of the cost with respect to the parameters.

        Returns an :class:`~collections.OrderedDict` mapping each entry
        of `self.parameters` to its gradient expression. Raises
        :class:`ValueError` if the cost is missing or if no parameters
        were given.

        """
        if self.cost is None:
            raise ValueError("can't infer gradients; no cost specified")
        elif self.parameters is None or len(self.parameters) == 0:
            raise ValueError("can't infer gradients; no parameters "
                             "specified")
        # While this strictly speaking could be a dict and not an
        # OrderedDict (because we iterate over it in the order of
        # self.parameters), this guards a little bit against
        # nondeterminism introduced by future refactoring.
        logger.info("Taking the cost gradient")
        gradients = OrderedDict(
            equizip(self.parameters, tensor.grad(
                self.cost, self.parameters,
                known_grads=known_grads,
                consider_constant=consider_constant)))
        logger.info("The cost gradient computation graph is built")
        return gradients
366
367
368
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`,
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`. It is empty when `previous_steps` is empty.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # A single pass in the order of `previous_steps`; unlike
        # transposing the per-parameter results, this also works when
        # there are no parameters at all.
        steps = OrderedDict()
        updates = []
        for parameter, previous_step in previous_steps.items():
            step, parameter_updates = self.compute_step(parameter,
                                                        previous_step)
            steps[parameter] = step
            updates.extend(parameter_updates)
        return steps, updates
432
433
434
class CompositeRule(StepRule):
    """A step rule made of several step rules applied one after another.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Feed the steps through every component, collecting updates."""
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            # Each component consumes the output of the one before it.
            current_steps, component_updates = component.compute_steps(
                current_steps)
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
454
455
456
class Scale(StepRule):
    """A step proportional to the previous step.

    When this is the only step rule given to :class:`GradientDescent`,
    the result is steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        add_role(rate, ALGORITHM_HYPERPARAMETER)
        self.learning_rate = rate

    def compute_step(self, parameter, previous_step):
        """Return the previous step times the learning rate, no updates."""
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
480
481
482
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)
        self.momentum = coefficient

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity buffer to the previous step."""
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        new_step = self.momentum * velocity + previous_step
        # The buffer is updated to hold the step that was just computed.
        return new_step, [(velocity, new_step)]
506
507
508
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    The usual momentum step rule, obtained by chaining :class:`Scale`
    and :class:`BasicMomentum`.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # Scaling is applied first, momentum accumulation second.
        self.components = [scale_rule, momentum_rule]
        # Expose the hyperparameters of the components directly.
        self.learning_rate = scale_rule.learning_rate
        self.momentum = momentum_rule.momentum
539
540
541
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        decay = shared_floatx(decay_rate, "decay_rate")
        add_role(decay, ALGORITHM_HYPERPARAMETER)
        self.decay_rate = decay
        stabilizer = shared_floatx(epsilon, "epsilon")
        add_role(stabilizer, ALGORITHM_HYPERPARAMETER)
        self.epsilon = stabilizer

    def _running_mean_square(self, previous_mean, value):
        """Blend a running mean of squares with the square of `value`."""
        return (self.decay_rate * previous_mean +
                (1 - self.decay_rate) * tensor.sqr(value))

    def compute_step(self, parameter, previous_step):
        """Build the AdaDelta step and the updates of its two buffers."""
        step_buffer = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        delta_buffer = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Running average of the squared incoming steps.
        new_mean_square_step = self._running_mean_square(step_buffer,
                                                         previous_step)

        # Ratio of the two RMS values rescales the incoming step.
        rms_previous_delta = tensor.sqrt(delta_buffer + self.epsilon)
        rms_current_step = tensor.sqrt(new_mean_square_step + self.epsilon)
        delta = rms_previous_delta / rms_current_step * previous_step

        # Running average of the squared steps actually proposed.
        new_mean_square_delta = self._running_mean_square(delta_buffer,
                                                          delta)

        buffer_updates = [(step_buffer, new_mean_square_step),
                          (delta_buffer, new_mean_square_delta)]
        return delta, buffer_updates
591
592
593
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        decay = shared_floatx(decay_rate, "decay_rate")
        add_role(decay, ALGORITHM_HYPERPARAMETER)
        self.decay_rate = decay
        # Lower bound for the RMS, i.e. the inverse of the largest scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the previous step by a floored running RMS."""
        running_buffer = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        new_mean_square = (
            self.decay_rate * running_buffer +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        # Flooring the RMS bounds how much a step can be scaled up.
        floored_rms = tensor.maximum(
            tensor.sqrt(new_mean_square), self.epsilon)
        normalized_step = previous_step / floored_rms
        return normalized_step, [(running_buffer, new_mean_square)]
639
640
641
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    The step rule described in [Hint2014]_, obtained by chaining
    :class:`BasicRMSProp` and :class:`Scale`.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        rms_rule = BasicRMSProp(decay_rate=decay_rate,
                                max_scaling=max_scaling)
        scale_rule = Scale(learning_rate=learning_rate)
        # Normalization is applied first, scaling second.
        self.components = [rms_rule, scale_rule]
        # Expose the hyperparameters of the components directly.
        self.learning_rate = scale_rule.learning_rate
        self.decay_rate = rms_rule.decay_rate
681
682
683
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used.
        The attribute is only set when a threshold was given.

    """
    def __init__(self, threshold=None):
        # NOTE: any falsy threshold (``None`` but also ``0``) leaves the
        # attribute unset, which disables clipping altogether.
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all steps jointly so their L2 norm obeys the threshold.

        Returns
        -------
        steps : OrderedDict
            The (possibly rescaled) steps; `previous_steps` itself when
            no threshold is set.
        updates : list
            Always empty, this rule keeps no state.

        """
        if not hasattr(self, 'threshold'):
            # Callers unpack ``steps, updates``, so the pass-through case
            # must return a pair as well.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        # Leave the steps alone below the threshold, otherwise shrink
        # them so that the joint norm equals the threshold.
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
717
718
719
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    Use this :class:`StepRule` to enforce L2 norm constraints on, for
    instance, the weight vectors of individual hidden units,
    convolutional filters or whole weight tensors. Wrap it in
    :class:`Restrict` (and possibly :class:`CompositeRule`) to constrain
    only some variables and/or to use different norm constraints for
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        A single integer axis, or an iterable collection of integer
        axes, over which to sum when calculating the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, the norm of the
    value *after* the update is clipped as follows: the rule computes
    ``parameter - previous_step``, scales it so that its (possibly
    axes-wise) norm(s) are at most `threshold`, and then subtracts
    *that* value from `parameter`, which yields an 'equivalent step'
    respecting the desired norm constraints. This procedure implicitly
    assumes simple (stochastic) gradient descent, so the steps computed
    by this rule may not make sense in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        if axis is None:
            axis = ()
        else:
            axis = pack(axis)
        self.axis = set(axis)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # A set drops duplicates, so a size mismatch means that some
        # axis was listed more than once.
        if len(self.axis) != len(axis):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        ndim = previous_step.ndim
        for ax in self.axis:
            if ax >= ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, previous_step.ndim))
        # The value the parameter would take after a plain update.
        updated = parameter - previous_step
        if not self.axis:
            norms = l2_norm([updated])
        else:
            # Sum the squares over one axis at a time, keeping the
            # dimensions so the norms broadcast against the tensor.
            summed = tensor.sqr(updated)
            for ax in sorted(self.axis):
                summed = summed.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(summed)
        # When threshold < norm we want the step s* that has the same
        # effect as scaling the updated value by threshold / norm.
        rescaled = (self.threshold / norms) * updated
        shrinking_step = parameter - rescaled
        clipped = tensor.switch(norms > self.threshold,
                                shrinking_step,
                                previous_step)
        return clipped, ()
794
795
796
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Each step is divided by the square root of the running sum of the
    squares of all steps seen so far for that parameter.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Attributes
    ----------
    learning_rate : shared variable
        The shared variable holding the step size.
    epsilon : shared variable
        The shared variable holding the stabilizing constant.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the AdaGrad step for a single parameter.

        Returns
        -------
        step : Theano variable
            ``learning_rate * previous_step`` divided by
            ``sqrt(sum of squares) + epsilon``.
        updates : list
            A single pair updating the sum-of-squares buffer.

        """
        # One accumulator per parameter, named after it when possible.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # The accumulated value already includes the current step.
        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
837
838
839
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Default value is set to 1e-8.
    decay_factor : float, optional
        Default value is set to 1 - 1e-8.

    Notes
    -----
    In this implementation `beta1` and `beta2` are the weights given to
    the *new* step (respectively its square) when the moment estimates
    are updated; the previous estimates are weighted by one minus these
    values.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        hyperparameters = (("learning_rate", learning_rate),
                           ("beta1", beta1),
                           ("beta2", beta2),
                           ("epsilon", epsilon),
                           ("decay_factor", decay_factor))
        # Every hyperparameter lives in a shared variable that is named
        # after, and stored under, the corresponding argument.
        for name, value in hyperparameters:
            shared = shared_floatx(value, name)
            add_role(shared, ALGORITHM_HYPERPARAMETER)
            setattr(self, name, shared)

    def compute_step(self, parameter, previous_step):
        # State kept per parameter: both moment estimates and a counter.
        mean_buffer = _create_algorithm_buffer_for(parameter, 'mean')
        variance_buffer = _create_algorithm_buffer_for(parameter, 'variance')
        counter = shared_floatx(0., 'time')
        add_role(counter, ALGORITHM_BUFFER)

        next_count = counter + 1
        # Step size with the bias correction of both moments folded in.
        second_correction = tensor.sqrt(
            (1. - (1. - self.beta2)**next_count))
        first_correction = 1. - (1. - self.beta1)**next_count
        corrected_rate = (self.learning_rate * second_correction /
                          first_correction)
        # Weight of the new step in the mean, adjusted by decay_factor
        # raised to the number of updates already performed.
        decay = self.decay_factor ** (next_count - 1)
        new_weight = 1 - (1 - self.beta1) * decay
        new_mean = (new_weight * previous_step +
                    (1. - new_weight) * mean_buffer)
        new_variance = (self.beta2 * tensor.sqr(previous_step) +
                        (1. - self.beta2) * variance_buffer)
        step = (corrected_rate * new_mean /
                (tensor.sqrt(new_variance) + self.epsilon))

        updates = [(mean_buffer, new_mean),
                   (variance_buffer, new_variance),
                   (counter, next_count)]
        return step, updates
897
898
899
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    A step (the parameter update of a single shared variable) that
    contains non-finite elements (such as ``inf`` or ``NaN``) is
    replaced with a step rescaling the parameters.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        # The sum is NaN or infinite when the step is not finite, so a
        # single scalar is enough to test the whole step.
        total = tensor.sum(previous_step)
        bad_flags = (tensor.isnan(total) +
                     tensor.isinf(total))
        # Subtracting (1 - scaler) * parameter leaves scaler * parameter.
        fallback = (1 - self.scaler) * parameter
        safe_step = tensor.switch(bad_flags > 0, fallback, previous_step)
        return safe_step, []
932
933
934
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Typical uses are clipping the steps of only some parameters, or
    scaling the updates of one kind of parameter (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        # Run the wrapped rule on the selected variables only.
        selected = dict_subset(previous_steps, self.variables)
        restricted, updates = self.step_rule.compute_steps(selected)
        # Rebuild the full mapping in the original order: processed
        # steps where available, untouched previous steps elsewhere.
        merged = OrderedDict()
        for parameter in previous_steps:
            if parameter in restricted:
                merged[parameter] = restricted[parameter]
            else:
                merged[parameter] = previous_steps[parameter]
        return merged, updates
964