Completed
Pull Request — master (#1079)
by David
04:43
created

UpdatesAlgorithm.process_batch()   A

Complexity

Conditions 2

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
c 1
b 0
f 0
dl 0
loc 4
rs 10
1
"""Training algorithms."""
0 ignored issues
show
Bug introduced by
There seems to be a cyclic import (blocks.bricks.base -> blocks.graph -> blocks.graph.bn -> blocks.filter).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.simple -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
24
    buf = shared_floatx_zeros_matching(param, *args, **kwargs)
25
    buf.tag.for_parameter = param
26
    add_role(buf, ALGORITHM_BUFFER)
27
    return buf
28
29
30
@add_metaclass(ABCMeta)
31
class TrainingAlgorithm(object):
32
    """Base class for training algorithms.
33
34
    A training algorithm object has a simple life-cycle.
35
    First it is initialized by calling its :meth:`initialize` method.
36
    At this stage, for instance, Theano functions can be compiled.
37
    After that the :meth:`process_batch` method is repeatedly
38
    called with a batch of training data as a parameter.
39
40
    """
41
    @abstractmethod
42
    def initialize(self, **kwargs):
43
        """Initialize the training algorithm."""
44
        pass
45
46
    @abstractmethod
47
    def process_batch(self, batch):
48
        """Process a batch of training data.
49
50
        Attributes
51
        ----------
52
        batch : dict
53
            A dictionary of (source name, data) pairs.
54
55
        """
56
        pass
57
58
59
variable_mismatch_error = """
60
61
Blocks tried to match the sources ({sources}) of the training dataset to \
62
the names of the Theano variables ({variables}), but failed to do so. \
63
If you want to train on a subset of the sources that your dataset provides, \
64
pass the `sources` keyword argument to its constructor, use the \
65
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
66
or on_unused_sources='ignore' to the GradientDescent algorithm."""
67
68
source_missing_error = """
69
70
Blocks didn't find all the sources ({sources}) of the training dataset \
71
that match the names of the Theano variables ({variables})."""
72
73
74
determinism_error = """Cannot infer parameter list in a fixed order.
75
76
Because dictionaries are unordered (and Python uses randomized hashing, \
77
which can change the iteration order over the same dictionary from one \
78
interpreter session to the next), Blocks cannot infer the parameters list \
79
from a plain dictionary of gradients in an order that is reproducible \
80
across interpreter sessions; please either specify the parameters \
81
explicitly or pass gradients as an OrderedDict (though exercise care in \
82
constructing that OrderedDict, as an OrderedDict created by iterating \
83
over an unordered iterable (e.g. a dict) will still have an arbitrary \
84
and unpredictable order that could cause problems with \
85
reproducibility)."""
86
87
88
class UpdatesAlgorithm(TrainingAlgorithm):
89
    """Base class for algorithms that use Theano functions with updates.
90
91
    Parameters
92
    ----------
93
    updates : list of tuples or :class:`~collections.OrderedDict`
94
        The updates that should be performed.
95
    theano_func_kwargs : dict, optional
96
        A passthrough to `theano.function` for additional arguments.
97
        Useful for passing `profile` or `mode` arguments to the theano
98
        function that will be compiled for the algorithm.
99
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
100
        Controls behavior when not all sources in a batch are used
101
        (i.e. there is no variable with a matching name in the inputs
102
        of the computational graph of the updates).
103
104
    Attributes
105
    ----------
106
    updates : list of :class:`~tensor.TensorSharedVariable` updates
107
        Updates to be done for every batch. It is required that the
108
        updates are done using the old values of optimized parameters.
109
110
    """
111
    def __init__(self, updates=None, theano_func_kwargs=None,
112
                 on_unused_sources='raise', **kwargs):
113
        self.updates = [] if updates is None else updates
114
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
115
                                   is not None else dict())
116
        self.on_unused_sources = on_unused_sources
117
        super(UpdatesAlgorithm, self).__init__(**kwargs)
118
119
    def initialize(self):
120
        update_values = [new_value for _, new_value in self.updates]
121
        logger.debug("Inferring graph inputs...")
122
        self.inputs = ComputationGraph(update_values).inputs
123
        logger.debug("Compiling training function...")
124
        self._function = theano.function(
125
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
126
        logger.info("The training algorithm is initialized")
127
128
    @property
129
    def updates(self):
130
        return self._updates
131
132
    @updates.setter
133
    def updates(self, value):
134
        self._updates = value
135
136
    def add_updates(self, updates):
137
        """Add updates to the training process.
138
139
        The updates will be done _before_ the parameters are changed.
140
141
        Parameters
142
        ----------
143
        updates : list of tuples or :class:`~collections.OrderedDict`
144
            The updates to add.
145
146
        """
147
        if isinstance(updates, OrderedDict):
148
            updates = list(updates.items())
149
        if not isinstance(updates, list):
150
            raise ValueError
151
        self.updates.extend(updates)
152
153
    def _validate_source_names(self, batch):
154
        in_names = [v.name for v in self.inputs]
155
156
        if not set(in_names).issubset(set(batch.keys())):
157
            raise ValueError("Didn't find all sources: " +
158
                             source_missing_error.format(
159
                                 sources=batch.keys(),
160
                                 variables=in_names))
161
        if not set(batch.keys()).issubset(set(in_names)):
162
            if self.on_unused_sources == 'ignore':
163
                pass
164
            elif self.on_unused_sources == 'warn':
165
                if not hasattr(self, '_unused_source_warned'):
166
                    logger.warn(variable_mismatch_error.format(
167
                        sources=batch.keys(),
168
                        variables=in_names))
169
                self._unused_source_warned = True
170
            elif self.on_unused_sources == 'raise':
171
                raise ValueError(
172
                    "mismatch of variable names and data sources" +
173
                    variable_mismatch_error.format(
174
                        sources=batch.keys(),
175
                        variables=in_names))
176
            else:
177
                raise ValueError("Wrong value of on_unused_sources: {}."
178
                                 .format(self.on_unused_sources))
179
180
    def process_batch(self, batch):
181
        self._validate_source_names(batch)
182
        ordered_batch = [batch[v.name] for v in self.inputs]
183
        self._function(*ordered_batch)
184
185
186
class GradientDescent(UpdatesAlgorithm):
187
    """A base class for all gradient descent algorithms.
188
189
    By "gradient descent" we mean a training algorithm of the following
190
    form:
191
192
    .. code-block::  python
193
194
        for batch in data:
195
            steps = step_rule.compute_steps(parameters,
196
                                            gradients_wr_parameters)
197
            for parameter in parameters:
198
                parameter -= steps[parameter]
199
200
    Note, that the step is *subtracted, not added*! This is done in order
201
    to make step rule chaining possible.
202
203
    Parameters
204
    ----------
205
    cost : :class:`~tensor.TensorVariable`, optional
206
        The objective to be minimized. Unused if `gradients` is specified.
207
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
208
        The parameters to be tuned. If not provided, inferred from the
209
        keys of `gradients` (in which case `gradients` *must* be an
210
        `OrderedDict`).
211
    step_rule : instance of :class:`StepRule`, optional
212
        An object encapsulating most of the algorithm's logic. Its
213
        `compute_steps` method is called to get Theano expression for
214
        steps.  Note, that the step rule might have a state, e.g. to
215
        remember a weighted sum of gradients from previous steps like it is
216
        done in gradient descent with momentum. If ``None``, an instance of
217
        :class:`Scale` is created.
218
    gradients : OrderedDict, optional
219
        A dictionary mapping a parameter to an expression for the cost's
220
        gradient with respect to the parameter. If ``None``, the gradient
221
        are taken automatically using :func:`theano.gradient.grad`.
222
    known_grads : dict, optional
223
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
224
        Useful when you know the [approximate] gradients of some
225
        sub-expressions and would like Theano to use that information
226
        to compute parameter gradients. Only makes sense when `gradients`
227
        is `None`.
228
    consider_constant : list, optional
229
        A passthrough to `theano.tensor.grad`'s `consider_constant`
230
        argument.  A list of expressions through which gradients will not
231
        be backpropagated. Only makes sense when `gradients` is `None`.
232
233
    Attributes
234
    ----------
235
    gradients : OrderedDict
236
        The gradient dictionary.
237
    step_rule : instance of :class:`StepRule`
238
        The step rule.
239
240
    Notes
241
    -----
242
    Changing `updates` attribute or calling `add_updates` after
243
    the `initialize` method is called will have no effect.
244
245
    `gradients` must be an `OrderedDict` if `parameters` is unspecified
246
    because ordinary dictionaries have an unpredictable iteration
247
    order due to hash randomization (which is enabled by default since
248
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
249
    when combined with Theano's heuristic graph optimizations, can cause
250
    serious reproducibility issues.
251
252
    .. todo::
253
254
       Some shared variables are not parameters (e.g. those created by
255
       random streams).
256
257
    .. todo::
258
259
       Due to a rather premature status of the :class:`ComputationGraph`
260
       class the parameter used only inside scans are not fetched
261
       currently.
262
263
    """
264
    def __init__(self, cost=None, parameters=None, step_rule=None,
265
                 gradients=None, known_grads=None, consider_constant=None,
266
                 **kwargs):
267
        # Set initial values for cost, parameters, gradients.
268
        self.cost = cost
269
        self.parameters = parameters
270
        self.step_rule = step_rule if step_rule else Scale()
271
        self.gradients = gradients
272
        self.known_grads = known_grads
273
        self.consider_constant = consider_constant
274
        self._initialized = False
275
        super(GradientDescent, self).__init__(**kwargs)
276
277
    def initialize(self):
278
        if self._initialized:
279
            logger.warn('%s already initialized; subsequent calls to '
280
                        'initialize() have no effect', self.__class__.__name__)
281
            return
282
        # If we don't have gradients, we'll need to infer them from the
283
        # cost and the parameters, both of which must not be None.
284
        if not self.gradients:
285
            self.gradients = self._compute_gradients(self.known_grads,
286
                                                     self.consider_constant)
287
        else:
288
            if self.cost is not None:
289
                logger.warning(('{}: gradients already specified directly; '
290
                                'cost is unused.'
291
                                .format(self.__class__.__name__)))
292
            if self.parameters is None and isinstance(self.gradients,
293
                                                      OrderedDict):
294
                # If the dictionary is ordered, it's safe to use the keys
295
                # as they have a deterministic order.
296
                self.parameters = list(self.gradients.keys())
297
            elif self.parameters is not None:
298
                # If parameters and gradients.keys() don't match we can
299
                # try to recover if gradients is ordered.
300
                if set(self.parameters) != set(self.gradients.keys()):
301
                    logger.warn("Specified parameters list does not match "
302
                                "keys in provided gradient dictionary; "
303
                                "using parameters inferred from gradients")
304
                    if not isinstance(self.gradients, OrderedDict):
305
                        raise ValueError(determinism_error)
306
                    self.parameters = list(self.gradients.keys())
307
            else:
308
                # self.parameters is not None, and gradients isn't
309
                # an OrderedDict. We can't do anything safe.
310
                raise ValueError(determinism_error)
311
            if self.known_grads is not None:
312
                raise ValueError("known_grads has no effect when gradients "
313
                                 "are passed in")
314
            if self.consider_constant is not None:
315
                raise ValueError("consider_constant has no effect when "
316
                                 "gradients are passed in")
317
318
        # The order in which the different gradient terms appears
319
        # here matters, as floating point addition is non-commutative (and
320
        # Theano's graph optimizations are not order-independent).
321
        # This is why we do not use .values().
322
        gradient_values = [self.gradients[p] for p in self.parameters]
323
        self.total_gradient_norm = (l2_norm(gradient_values)
324
                                    .copy(name="total_gradient_norm"))
325
326
        logger.debug("Computing parameter steps...")
327
        self.steps, self.step_rule_updates = (
328
            self.step_rule.compute_steps(self.gradients))
329
330
        # Same as gradient_values above: the order may influence a
331
        # bunch of things, so enforce a consistent one (don't use
332
        # .values()).
333
        step_values = [self.steps[p] for p in self.parameters]
334
        self.total_step_norm = (l2_norm(step_values)
335
                                .copy(name="total_step_norm"))
336
337
        # Once again, iterating on gradients may not be deterministically
338
        # ordered if it is not an OrderedDict. We add the updates here in
339
        # the order specified in self.parameters. Keep it this way to
340
        # maintain reproducibility.
341
        self.updates.extend(
342
            itertools.chain(((parameter, parameter - self.steps[parameter])
343
                             for parameter in self.parameters),
344
                            self.step_rule_updates)
345
        )
346
        self._initialized = True
347
        super(GradientDescent, self).initialize()
348
349
    def _compute_gradients(self, known_grads, consider_constant):
350
        if self.cost is None:
351
            raise ValueError("can't infer gradients; no cost specified")
352
        elif self.parameters is None or len(self.parameters) == 0:
353
            raise ValueError("can't infer gradients; no parameters "
354
                             "specified")
355
        # While this strictly speaking could be a dict and not an
356
        # OrderedDict (because we iterate over it in the order of
357
        # self.parameters), this guards a little bit against
358
        # nondeterminism introduced by future refactoring.
359
        logger.info("Taking the cost gradient")
360
        gradients = OrderedDict(
361
            equizip(self.parameters, tensor.grad(
362
                self.cost, self.parameters,
363
                known_grads=known_grads,
364
                consider_constant=consider_constant)))
365
        logger.info("The cost gradient computation graph is built")
366
        return gradients
367
368
369
@add_metaclass(ABCMeta)
370
class StepRule(object):
371
    """A rule to compute steps for a gradient descent algorithm."""
372
    def compute_step(self, parameter, previous_step):
373
        """Build a Theano expression for the step for a parameter.
374
375
        This method is called by default implementation of
376
        :meth:`compute_steps`, it relieves from writing a loop each time.
377
378
        Parameters
379
        ----------
380
        parameter : :class:`~tensor.TensorSharedVariable`
381
            The parameter.
382
        previous_step : :class:`~tensor.TensorVariable`
383
            Some quantity related to the gradient of the cost with respect
384
            to the parameter, either the gradient itself or a step in a
385
            related direction.
386
387
        Returns
388
        -------
389
        step : :class:`~theano.Variable`
390
            Theano variable for the step to take.
391
        updates : list
392
            A list of tuples representing updates to be performed. This
393
            is useful for stateful rules such as :class:`Momentum` which
394
            need to update shared variables after itetations.
395
396
        """
397
        raise NotImplementedError
398
399
    def compute_steps(self, previous_steps):
400
        """Build a Theano expression for steps for all parameters.
401
402
        Override this method if you want to process the steps
403
        with respect to all parameters as a whole, not parameter-wise.
404
405
        Parameters
406
        ----------
407
        previous_steps : OrderedDict
408
            An :class:`~OrderedDict` of
409
            (:class:`~tensor.TensorSharedVariable`
410
            :class:`~tensor.TensorVariable`) pairs. The keys are the
411
            parameters being trained, the values are the expressions for
412
            quantities related to gradients of the cost with respect to
413
            the parameters, either the gradients themselves or steps in
414
            related directions.
415
416
        Returns
417
        -------
418
        steps : OrderedDict
419
            A dictionary of the proposed steps in the same form as
420
            `previous_steps`.
421
        updates : list
422
            A list of tuples representing updates to be performed.
423
424
        """
425
        parameter_wise = [self.compute_step(parameter,
426
                                            previous_steps[parameter])
427
                          for parameter in previous_steps]
428
        steps, updates = equizip(*parameter_wise)
429
        steps = OrderedDict((parameter, step) for parameter, step
430
                            in equizip(previous_steps.keys(), steps))
431
        updates = list(itertools.chain(*updates))
432
        return steps, updates
433
434
435
class CompositeRule(StepRule):
436
    """Chains several step rules.
437
438
    Parameters
439
    ----------
440
    components : list of :class:`StepRule`
441
        The learning rules to be chained. The rules will be applied in the
442
        order as given.
443
444
    """
445
    def __init__(self, components):
446
        self.components = components
447
448
    def compute_steps(self, previous_steps):
449
        steps = previous_steps
450
        updates = []
451
        for rule in self.components:
452
            steps, more_updates = rule.compute_steps(steps)
453
            updates += more_updates
454
        return steps, updates
455
456
457
class Scale(StepRule):
458
    """A step in the direction proportional to the previous step.
459
460
    If used in :class:`GradientDescent` alone, this step rule implements
461
    steepest descent.
462
463
    Parameters
464
    ----------
465
    learning_rate : float
466
        The learning rate by which the previous step is multiplied to
467
        produce the step.
468
469
    Attributes
470
    ----------
471
    learning_rate : :class:`~tensor.TensorSharedVariable`
472
        The shared variable storing the learning rate used.
473
474
    """
475
    def __init__(self, learning_rate=1.0):
476
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
477
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
478
479
    def compute_step(self, parameter, previous_step):
480
        return self.learning_rate * previous_step, []
481
482
483
class BasicMomentum(StepRule):
484
    """Accumulates step with exponential discount.
485
486
    Parameters
487
    ----------
488
    momentum : float, optional
489
        The momentum coefficient. Defaults to 0.
490
491
    Notes
492
    -----
493
    This step rule is intended to be used in conjunction with another
494
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
495
    experience, look at :class:`Momentum`.
496
497
    """
498
    def __init__(self, momentum=0.):
499
        self.momentum = shared_floatx(momentum, "momentum")
500
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)
501
502
    def compute_step(self, parameter, previous_step):
503
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
504
        step = self.momentum * velocity + previous_step
505
        updates = [(velocity, step)]
506
        return step, updates
507
508
509
class Momentum(CompositeRule):
510
    """Accumulates step with exponential discount.
511
512
    Combines :class:`BasicMomentum` and :class:`Scale` to form the
513
    usual momentum step rule.
514
515
    Parameters
516
    ----------
517
    learning_rate : float, optional
518
        The learning rate by which the previous step scaled. Defaults to 1.
519
    momentum : float, optional
520
        The momentum coefficient. Defaults to 0.
521
522 View Code Duplication
    Attributes
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
523
    ----------
524
    learning_rate : :class:`~tensor.SharedVariable`
525
        A variable for learning rate.
526
    momentum : :class:`~tensor.SharedVariable`
527
        A variable for momentum.
528
529
    See Also
530
    --------
531
    :class:`SharedVariableModifier`
532
533
    """
534
    def __init__(self, learning_rate=1.0, momentum=0.):
535
        scale = Scale(learning_rate=learning_rate)
536
        basic_momentum = BasicMomentum(momentum=momentum)
537
        self.learning_rate = scale.learning_rate
538
        self.momentum = basic_momentum.momentum
539
        self.components = [scale, basic_momentum]
540
541
542
class AdaDelta(StepRule):
543
    """Adapts the step size over time using only first order information.
544
545
    Parameters
546
    ----------
547
    decay_rate : float, optional
548
        Decay rate in [0, 1]. Defaults to 0.95.
549
    epsilon : float, optional
550
        Stabilizing constant for RMS. Defaults to 1e-6.
551
552
    Notes
553
    -----
554
    For more information, see [ADADELTA]_.
555
556
    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
557
       Rate Method*, arXiv:1212.5701.
558
559
    """
560
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
561
        if not 0.0 <= decay_rate <= 1.0:
562
            raise ValueError("decay rate needs to be in [0, 1]")
563
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
564
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
565
        self.epsilon = shared_floatx(epsilon, "epsilon")
566
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
567
568
    def compute_step(self, parameter, previous_step):
569
        mean_square_step_tm1 = _create_algorithm_buffer_for(
570
            parameter, "mean_square_step_tm1")
571
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
572
            parameter, "mean_square_delta_x_tm1")
573
574
        mean_square_step_t = (
575
            self.decay_rate * mean_square_step_tm1 +
576
            (1 - self.decay_rate) * tensor.sqr(previous_step)
577
        )
578
579
        rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon)
580
        rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon)
581
        delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step
582
583
        mean_square_delta_x_t = (
584
            self.decay_rate * mean_square_delta_x_tm1 +
585
            (1 - self.decay_rate) * tensor.sqr(delta_x_t)
586
        )
587
588
        step = delta_x_t
589
        updates = [(mean_square_step_tm1, mean_square_step_t),
590
                   (mean_square_delta_x_tm1, mean_square_delta_x_t)]
591
        return step, updates
592
593
594
class BasicRMSProp(StepRule):
595
    """Scales the step size by a running average of the recent step norms.
596
597
    Parameters
598
    ----------
599
    decay_rate : float, optional
600
        How fast the running average decays, value in [0, 1]
601
        (lower is faster).  Defaults to 0.9.
602
    max_scaling : float, optional
603
        Maximum scaling of the step size, in case the running average is
604
        really small. Needs to be greater than 0. Defaults to 1e5.
605
606
    Notes
607
    -----
608
    This step rule is intended to be used in conjunction with another
609
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
610
    experience, look at :class:`RMSProp`.
611
612
    In general, this step rule should be used _before_ other step rules,
613
    because it has normalization properties that may undo their work.
614
    For instance, it should be applied first when used in conjunction
615
    with :class:`Scale`.
616
617
    For more information, see [Hint2014]_.
618
619
    """
620
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
621
        if not 0.0 <= decay_rate <= 1.0:
622
            raise ValueError("decay rate needs to be in [0, 1]")
623
        if max_scaling <= 0:
624
            raise ValueError("max. scaling needs to be greater than 0")
625
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
626
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
627
        self.epsilon = 1. / max_scaling
628
629
    def compute_step(self, parameter, previous_step):
630
        mean_square_step_tm1 = _create_algorithm_buffer_for(
631
            parameter, "mean_square_step_tm1")
632
        mean_square_step_t = (
633
            self.decay_rate * mean_square_step_tm1 +
634
            (1 - self.decay_rate) * tensor.sqr(previous_step))
635
        rms_step_t = tensor.maximum(
636
            tensor.sqrt(mean_square_step_t), self.epsilon)
637
        step = previous_step / rms_step_t
638
        updates = [(mean_square_step_tm1, mean_square_step_t)]
639
        return step, updates
640
641
642
class RMSProp(CompositeRule):
643
    """Scales the step size by a running average of the recent step norms.
644
645
    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
646
    described in [Hint2014]_.
647
648
    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
649
       lecture 6a,
650
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
651
652
    Parameters
653
    ----------
654
    learning_rate : float, optional
655
        The learning rate by which the previous step scaled. Defaults to 1.
656
    decay_rate : float, optional
657
        How fast the running average decays (lower is faster).
658
        Defaults to 0.9.
659
    max_scaling : float, optional
660
        Maximum scaling of the step size, in case the running average is
661
        really small. Defaults to 1e5.
662
663
    Attributes
664
    ----------
665
    learning_rate : :class:`~tensor.SharedVariable`
666
        A variable for learning rate.
667
    decay_rate : :class:`~tensor.SharedVariable`
668
        A variable for decay rate.
669
670
    See Also
671
    --------
672
    :class:`SharedVariableModifier`
673
674
    """
675
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
676
        basic_rms_prop = BasicRMSProp(decay_rate=decay_rate,
677
                                      max_scaling=max_scaling)
678
        scale = Scale(learning_rate=learning_rate)
679
        self.learning_rate = scale.learning_rate
680
        self.decay_rate = basic_rms_prop.decay_rate
681
        self.components = [basic_rms_prop, scale]
682
683
684
class StepClipping(StepRule):
685
    """Rescales an entire step if its L2 norm exceeds a threshold.
686
687
    When the previous steps are the gradients, this step rule performs
688
    gradient clipping.
689
690
    Parameters
691
    ----------
692
    threshold : float, optional
693
        The maximum permitted L2 norm for the step. The step
694
        will be rescaled to be not higher than this quanity.
695
        If ``None``, no rescaling will be applied.
696
697
    Attributes
698
    ----------
699
    threshold : :class:`.tensor.TensorSharedVariable`
700
        The shared variable storing the clipping threshold used.
701
702
    """
703
    def __init__(self, threshold=None):
704
        if threshold:
705
            self.threshold = shared_floatx(threshold, "threshold")
706
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
707
708
    def compute_steps(self, previous_steps):
709
        if not hasattr(self, 'threshold'):
710
            return previous_steps
711
        norm = l2_norm(previous_steps.values())
712
        multiplier = tensor.switch(norm < self.threshold,
713
                                   1, self.threshold / norm)
714
        steps = OrderedDict(
715
            (parameter, step * multiplier)
716
            for parameter, step in previous_steps.items())
717
        return steps, []
718
719
720
class VariableClipping(StepRule):
721
    """Clip the maximum norm of individual variables along certain axes.
722
723
    This :class:`StepRule` can be used to implement L2 norm constraints on
724
    e.g. the weight vectors of individual hidden units, convolutional
725 View Code Duplication
    filters or entire weight tensors. Combine with :class:`Restrict`
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
726
    (and possibly :class:`CompositeRule`), to apply such constraints only
727
    to certain variables and/or apply different norm constraints to
728
    different variables.
729
730
    Parameters
731
    ----------
732
    threshold : float
733
        Maximum norm for a given (portion of a) tensor.
734
    axis : int or iterable, optional
735
        An integer single axis, or an iterable collection of integer
736
        axes over which to sum in order to calculate the L2 norm. If
737
        `None` (the default), the norm is computed over all elements
738
        of the tensor.
739
740
    Notes
741
    -----
742
    Because of the way the :class:`StepRule` API works, this particular
743
    rule implements norm clipping of the value *after* update in the
744
    following way: it computes ``parameter - previous_step``, scales it
745
    to have (possibly axes-wise) norm(s) of at most `threshold`,
746
    then subtracts *that* value from `parameter` to yield an 'equivalent
747
    step' that respects the desired norm constraints. This procedure
748
    implicitly assumes one is doing simple (stochastic) gradient descent,
749
    and so steps computed by this step rule may not make sense for use
750
    in other contexts.
751
752
    Investigations into max-norm regularization date from [Srebro2005]_.
753
    The first appearance of this technique as a regularization method
754
    for the weight vectors of individual hidden units in feed-forward
755
    neural networks may be [Hinton2012]_.
756
757
    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
758
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
759
       on Learning Theory (COLT)*, June 2005.
760
761
    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
762
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
763
       "Improving neural networks by preventing co-adaptation of
764
       feature detectors". arXiv:1207.0580.
765
766
    """
767
    def __init__(self, threshold, axis=None):
768
        axis = pack(axis) if axis is not None else ()
769
        self.axis = set(axis)
770
        self.threshold = shared_floatx(threshold, "threshold")
771
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
772
        if len(axis) != len(self.axis):
773
            raise ValueError("axis must be unique")
774
775
    def compute_step(self, parameter, previous_step):
776
        if any(ax >= previous_step.ndim for ax in self.axis):
777
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
778
                self.axis, parameter, previous_step.ndim))
779
        if len(self.axis) == 0:
780
            norms = l2_norm([parameter - previous_step])
781
        else:
782
            squares = tensor.sqr(parameter - previous_step)
783
            norms = tensor.sqrt(
784
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
785
                       sorted(self.axis), squares))
786
        # We want a step s* that is the same as scaling
787
        # (parameter - previous_step) by threshold / norm
788
        # when threshold < norm.
789
        shrinking_step = (parameter -
790
                          (self.threshold / norms) *
791
                          (parameter - previous_step))
792
        return tensor.switch(norms > self.threshold,
793
                             shrinking_step,
794
                             previous_step), ()
795
796
797
class AdaGrad(StepRule):
798
    """Implements the AdaGrad learning rule.
799
800
    Parameters
801
    ----------
802
    learning_rate : float, optional
803
        Step size.
804
        Default value is set to 0.0002.
805
    epsilon : float, optional
806
        Stabilizing constant for one over root of sum of squares.
807
        Defaults to 1e-6.
808
809
    Notes
810
    -----
811
    For more information, see [ADAGRAD]_.
812
813
    .. [ADADGRAD] Duchi J, Hazan E, Singer Y.,
814
       *Adaptive subgradient methods for online learning and
815
        stochastic optimization*,
816
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
817
818
    """
819
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
820
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
821
        self.epsilon = shared_floatx(epsilon, "epsilon")
822
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
823
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
824
825
    def compute_step(self, parameter, previous_step):
826
        name = 'adagrad_sqs'
827
        if parameter.name:
828
            name += '_' + parameter.name
829
        ssq = _create_algorithm_buffer_for(parameter, name=name)
830
831
        ssq_t = (tensor.sqr(previous_step) + ssq)
832
        step = (self.learning_rate * previous_step /
833
                (tensor.sqrt(ssq_t) + self.epsilon))
834
835
        updates = [(ssq, ssq_t)]
836
837
        return step, updates
838
839
840
class Adam(StepRule):
841
    """Adam optimizer as described in [King2014]_.
842
843
    .. [King2014] Diederik Kingma, Jimmy Ba,
844
       *Adam: A Method for Stochastic Optimization*,
845
       http://arxiv.org/abs/1412.6980
846
847
    Parameters
848
    ----------
849
    learning_rate : float, optional
850
        Step size.
851
        Default value is set to 0.002.
852
    beta1 : float, optional
853
        Exponential decay rate for the first moment estimates.
854
        Default value is set to 0.1.
855
    beta2 : float, optional
856
        Exponential decay rate for the second moment estimates.
857
        Default value is set to 0.001.
858
    epsilon : float, optional
859
        Default value is set to 1e-8.
860
    decay_factor : float, optional
861
        Default value is set to 1 - 1e-8.
862
863
    """
864
    def __init__(self, learning_rate=0.002,
865
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
866
                 decay_factor=(1 - 1e-8)):
867
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
868
        self.beta1 = shared_floatx(beta1, "beta1")
869
        self.beta2 = shared_floatx(beta2, "beta2")
870
        self.epsilon = shared_floatx(epsilon, "epsilon")
871
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
872
        for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon,
873
                      self.decay_factor]:
874
            add_role(param, ALGORITHM_HYPERPARAMETER)
875
876
    def compute_step(self, parameter, previous_step):
877
        mean = _create_algorithm_buffer_for(parameter, 'mean')
878
        variance = _create_algorithm_buffer_for(parameter, 'variance')
879
        time = shared_floatx(0., 'time')
880
        add_role(time, ALGORITHM_BUFFER)
881
882
        t1 = time + 1
883
        learning_rate = (self.learning_rate *
884
                         tensor.sqrt((1. - (1. - self.beta2)**t1)) /
885
                         (1. - (1. - self.beta1)**t1))
886
        beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1)
887
        mean_t = beta_1t * previous_step + (1. - beta_1t) * mean
888
        variance_t = (self.beta2 * tensor.sqr(previous_step) +
889
                      (1. - self.beta2) * variance)
890
        step = (learning_rate * mean_t /
891
                (tensor.sqrt(variance_t) + self.epsilon))
892
893
        updates = [(mean, mean_t),
894
                   (variance, variance_t),
895
                   (time, t1)]
896
897
        return step, updates
898
899
900
class RemoveNotFinite(StepRule):
901
    """A step rule that skips steps with non-finite elements.
902
903
    Replaces a step (the parameter update of a single shared variable)
904
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
905
    step rescaling the parameters.
906
907
    Parameters
908
    ----------
909
    scaler : float, optional
910
        The scaling applied to the parameter in case the step contains
911
        non-finite elements. Defaults to 1, which means that parameters
912
        will not be changed.
913
914
    Notes
915
    -----
916
    This rule should be applied last!
917
918
    This trick was originally used in the GroundHog_ framework.
919
920
    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog
921
922
    """
923
    def __init__(self, scaler=1):
924
        self.scaler = scaler
925
926
    def compute_step(self, parameter, previous_step):
927
        step_sum = tensor.sum(previous_step)
928
        not_finite = (tensor.isnan(step_sum) +
929
                      tensor.isinf(step_sum))
930
        step = tensor.switch(
931
            not_finite > 0, (1 - self.scaler) * parameter, previous_step)
932
        return step, []
933
934
935
class Restrict(StepRule):
936
    """Applies a given :class:`StepRule` only to certain variables.
937
938
    Example applications include clipping steps on only certain parameters,
939
    or scaling a certain kind of parameter's updates (e.g. adding an
940
    additional scalar multiplier to the steps taken on convolutional
941
    filters).
942
943
    Parameters
944
    ----------
945
    step_rule : :class:`StepRule`
946
        The :class:`StepRule` to be applied on the given variables.
947
    variables : iterable
948
        A collection of Theano variables on which to apply `step_rule`.
949
        Variables not appearing in this collection will not have
950
        `step_rule` applied to them.
951
952
    """
953
    def __init__(self, step_rule, variables):
954
        self.step_rule = step_rule
955
        self.variables = frozenset(variables)
956
957
    def compute_steps(self, previous_steps):
958
        filtered_previous_steps = dict_subset(previous_steps, self.variables)
959
        steps, updates = self.step_rule.compute_steps(filtered_previous_steps)
960
        actual = OrderedDict((parameter, steps[parameter])
961
                             if parameter in steps
962
                             else (parameter, previous_steps[parameter])
963
                             for parameter in previous_steps)
964
        return actual, updates
965