Completed
Push — master ( bf89d4...30ab15 )
by David
04:08
created

UpdatesAlgorithm._validate_source_names()   D

Complexity

Conditions 8

Size

Total Lines 26

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
c 1
b 0
f 0
dl 0
loc 26
rs 4
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create a shared buffer that an algorithm keeps for `param`.

    The buffer is built by :func:`shared_floatx_zeros_matching`, to which
    `param` and all extra positional and keyword arguments are forwarded.
    It is then tagged with the parameter it belongs to
    (``tag.for_parameter``) and given the ``ALGORITHM_BUFFER`` role.

    Returns
    -------
    The newly created shared variable.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Remember which parameter this buffer was created for.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm object is simple: its
    :meth:`initialize` method is called first (this is the stage at
    which, for instance, Theano functions can be compiled), and after
    that :meth:`process_batch` is called repeatedly, each time with a
    batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
# Used when a batch provides sources that no input variable of the graph
# consumes (see ``on_unused_sources``).
variable_mismatch_error = (
    "\n\n"
    "Blocks tried to match the sources ({sources}) of the training dataset "
    "to the names of the Theano variables ({variables}), but failed to do "
    "so. If you want to train on a subset of the sources that your dataset "
    "provides, pass the `sources` keyword argument to its constructor, use "
    "the FilterSources transformer provided by Fuel, or pass "
    "on_unused_sources='warn' or on_unused_sources='ignore' to the "
    "GradientDescent algorithm.")

# Used when some input variable of the graph has no matching source in the
# batch.
source_missing_error = (
    "\n\n"
    "Blocks didn't find all the sources ({sources}) of the training "
    "dataset that match the names of the Theano variables ({variables}).")

# Used when the parameter order cannot be inferred reproducibly from the
# gradients that were passed in.
determinism_error = (
    "Cannot infer parameter list in a fixed order.\n"
    "\n"
    "Because dictionaries are unordered (and Python uses randomized "
    "hashing, which can change the iteration order over the same "
    "dictionary from one interpreter session to the next), Blocks cannot "
    "infer the parameters list from a plain dictionary of gradients in an "
    "order that is reproducible across interpreter sessions; please either "
    "specify the parameters explicitly or pass gradients as an OrderedDict "
    "(though exercise care in constructing that OrderedDict, as an "
    "OrderedDict created by iterating over an unordered iterable (e.g. a "
    "dict) will still have an arbitrary and unpredictable order that could "
    "cause problems with reproducibility).")
86
87
88
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources in a batch are used
        (i.e. there is no variable with a matching name in the inputs
        of the computational graph of the updates).

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.
        An :class:`~collections.OrderedDict` assigned to this attribute
        is stored as the list of its (variable, new value) pairs.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 on_unused_sources='raise', **kwargs):
        self.updates = [] if updates is None else updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        self.on_unused_sources = on_unused_sources
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Infer the graph inputs and compile the training function."""
        logger.info("Initializing the training algorithm")
        update_values = [new_value for _, new_value in self.updates]
        logger.debug("Inferring graph inputs...")
        self.inputs = ComputationGraph(update_values).inputs
        logger.debug("Compiling training function...")
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    @property
    def updates(self):
        return self._updates

    @updates.setter
    def updates(self, value):
        # `initialize` unpacks (variable, new value) pairs and `add_updates`
        # extends a list, so an OrderedDict (which the constructor is
        # documented to accept) is stored as the list of its items.
        if isinstance(value, OrderedDict):
            value = list(value.items())
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError("updates must be a list of tuples or an "
                             "OrderedDict, got {}"
                             .format(type(updates).__name__))
        self.updates.extend(updates)

    def _validate_source_names(self, batch):
        """Check the sources of `batch` against the graph input names.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        Raises
        ------
        ValueError
            If some graph input has no source of the same name in `batch`;
            if `batch` has unused sources and `on_unused_sources` is
            'raise'; or if `batch` has unused sources and
            `on_unused_sources` is not one of the supported values.

        """
        in_names = [v.name for v in self.inputs]
        input_names = set(in_names)
        # A list is used in messages so that they read the same way on
        # Python 2 and Python 3 (no ``dict_keys(...)`` wrapper).
        sources = list(batch.keys())
        source_names = set(sources)

        if not input_names.issubset(source_names):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=sources,
                                 variables=in_names))
        if source_names.issubset(input_names):
            return

        # From here on the batch is known to contain unused sources.
        if self.on_unused_sources == 'ignore':
            return
        if self.on_unused_sources == 'warn':
            # Only warn for the first offending batch.
            if not hasattr(self, '_unused_source_warned'):
                logger.warning(variable_mismatch_error.format(
                    sources=sources,
                    variables=in_names))
            self._unused_source_warned = True
            return
        if self.on_unused_sources == 'raise':
            raise ValueError(
                "mismatch of variable names and data sources" +
                variable_mismatch_error.format(
                    sources=sources,
                    variables=in_names))
        raise ValueError("Wrong value of on_unused_sources: {}."
                         .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate `batch` and run the compiled function on it."""
        self._validate_source_names(batch)
        # The compiled function takes its arguments in the order of
        # `self.inputs`, so look each one up by variable name.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
190
191
192
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized. Unused if `gradients` is specified.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients` (in which case `gradients` *must* be an
        `OrderedDict`).
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : OrderedDict, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the gradients
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    If a cost and parameters are provided, gradients are taken immediately
    upon construction, and changes to these attributes after construction
    will have no effect.

    `gradients` must be an `OrderedDict` if `parameters` is unspecified
    because ordinary dictionaries have an unpredictable iteration
    order due to hash randomization (which is enabled by default since
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
    when combined with Theano's heuristic graph optimizations, can cause
    serious reproducibility issues.

    An `updates` keyword argument, if given, is copied rather than
    modified; the parameter updates and the step rule's updates are
    appended to that copy.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    logger.warning("Specified parameters list does not "
                                   "match keys in provided gradient "
                                   "dictionary; using parameters inferred "
                                   "from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-associative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule is not None else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Work on a private list so that the caller's `updates` object is
        # never modified, and so that ``None`` or an OrderedDict is
        # accepted as well as a list of tuples.
        updates = kwargs.pop('updates', None)
        if updates is None:
            updates = []
        elif isinstance(updates, OrderedDict):
            updates = list(updates.items())
        else:
            updates = list(updates)

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        updates.extend(
            itertools.chain(((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )
        super(GradientDescent, self).__init__(updates=updates, **kwargs)

    def _compute_gradients(self, known_grads, consider_constant):
        """Build the gradients of the cost with respect to the parameters.

        Parameters
        ----------
        known_grads : dict or None
            Passed to `tensor.grad` as its `known_grads` argument.
        consider_constant : list or None
            Passed to `tensor.grad` as its `consider_constant` argument.

        Returns
        -------
        gradients : OrderedDict
            Maps each parameter to its gradient expression, in the order
            of `self.parameters`.

        Raises
        ------
        ValueError
            If `self.cost` is ``None`` or `self.parameters` is ``None``
            or empty.

        """
        if self.cost is None:
            raise ValueError("can't infer gradients; no cost specified")
        elif self.parameters is None or len(self.parameters) == 0:
            raise ValueError("can't infer gradients; no parameters "
                             "specified")
        # While this strictly speaking could be a dict and not an
        # OrderedDict (because we iterate over it in the order of
        # self.parameters), this guards a little bit against
        # nondeterminism introduced by future refactoring.
        logger.info("Taking the cost gradient")
        gradients = OrderedDict(
            equizip(self.parameters, tensor.grad(
                self.cost, self.parameters,
                known_grads=known_grads,
                consider_constant=consider_constant)))
        logger.info("The cost gradient computation graph is built")
        return gradients
355
356
357
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # A single pass keeps the iteration order of `previous_steps` and
        # also works when it is empty (yielding no steps and no updates).
        steps = OrderedDict()
        updates = []
        for parameter in previous_steps:
            step, step_updates = self.compute_step(
                parameter, previous_steps[parameter])
            steps[parameter] = step
            updates.extend(step_updates)
        return steps, updates
421
422
423
class CompositeRule(StepRule):
    """Applies several step rules one after another.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        # Each component receives the steps produced by the one before it;
        # the updates of all components are collected in order.
        collected_updates = []
        current_steps = previous_steps
        for component in self.components:
            current_steps, component_updates = (
                component.compute_steps(current_steps))
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
443
444
445
class Scale(StepRule):
    """Multiplies the previous step by a learning rate.

    If used in :class:`GradientDescent` alone, this step rule implements
    steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # This rule keeps no state, hence the empty list of updates.
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
469
470
471
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        self.momentum = shared_floatx(momentum, "momentum")
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # One "velocity" buffer is created per parameter; after every
        # iteration it is overwritten with the step that was just taken.
        velocity_buffer = _create_algorithm_buffer_for(parameter, "velocity")
        new_step = self.momentum * velocity_buffer + previous_step
        return new_step, [(velocity_buffer, new_step)]
495
496
497
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled.
        Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # Expose the hyperparameters of the two sub-rules on this rule.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
        # The scaling is applied first, the momentum accumulation second.
        self.components = [scaling_rule, momentum_rule]
528
529
530
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Two per-parameter buffers hold the running averages from the
        # previous iteration: one of the squared incoming steps, one of
        # the squared steps this rule produced.
        avg_sq_step_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        avg_sq_delta_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Running average of the squared incoming step, current iteration.
        avg_sq_step = (
            self.decay_rate * avg_sq_step_prev +
            (1 - self.decay_rate) * tensor.sqr(previous_step)
        )

        # Rescale the incoming step by the ratio of the two RMS values.
        rms_delta_prev = tensor.sqrt(avg_sq_delta_prev + self.epsilon)
        rms_step = tensor.sqrt(avg_sq_step + self.epsilon)
        delta = rms_delta_prev / rms_step * previous_step

        # Running average of the squared produced step, current iteration.
        avg_sq_delta = (
            self.decay_rate * avg_sq_delta_prev +
            (1 - self.decay_rate) * tensor.sqr(delta)
        )

        buffer_updates = [(avg_sq_step_prev, avg_sq_step),
                          (avg_sq_delta_prev, avg_sq_delta)]
        return delta, buffer_updates
580
581
582
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Lower bound for the RMS; dividing by it caps the scaling of the
        # step at `max_scaling`.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        # Per-parameter running average of the squared incoming steps.
        avg_sq_step_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        avg_sq_step = (
            self.decay_rate * avg_sq_step_prev +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        # The RMS is bounded from below so the division stays bounded.
        rms_step = tensor.maximum(
            tensor.sqrt(avg_sq_step), self.epsilon)
        normalized_step = previous_step / rms_step
        return normalized_step, [(avg_sq_step_prev, avg_sq_step)]
628
629
630
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled.
        Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # Expose the hyperparameters of the two sub-rules on this rule.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalizing_rule.decay_rate
        # The normalization is applied first, the scaling second.
        self.components = [normalizing_rule, scaling_rule]
670
671
672
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used. The
        attribute is absent when no rescaling is applied.

    """
    def __init__(self, threshold=None):
        # A falsy threshold (``None`` or 0) leaves the attribute unset,
        # which `compute_steps` treats as "no rescaling".
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all steps jointly so their L2 norm is within threshold.

        Parameters
        ----------
        previous_steps : OrderedDict
            Maps each parameter to the expression of its incoming step.

        Returns
        -------
        steps : OrderedDict
            The (possibly rescaled) steps, in the order of
            `previous_steps`.
        updates : list
            Always empty; this rule keeps no state.

        """
        if not hasattr(self, 'threshold'):
            # Callers unpack a (steps, updates) pair, so the pass-through
            # case has to return one as well.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
706
707
708
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axes = () if axis is None else pack(axis)
        self.axis = set(axes)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # A set that is smaller than the input means an axis was repeated.
        if len(self.axis) != len(axes):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        """Return an equivalent step whose result respects the norm limit.

        Returns a pair of the new step expression and an empty tuple of
        updates.

        """
        for ax in self.axis:
            if ax >= previous_step.ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, previous_step.ndim))
        # Value the parameter would take if `previous_step` were applied.
        updated_value = parameter - previous_step
        if not self.axis:
            # No axes given: one norm over every element.
            norms = l2_norm([updated_value])
        else:
            # Sum the squares over each requested axis in turn, keeping
            # the reduced dimensions so the norms broadcast below.
            summed_squares = tensor.sqr(updated_value)
            for ax in sorted(self.axis):
                summed_squares = summed_squares.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(summed_squares)
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        rescaled_value = (self.threshold / norms) * updated_value
        shrinking_step = parameter - rescaled_value
        clipped_step = tensor.switch(norms > self.threshold,
                                     shrinking_step,
                                     previous_step)
        return clipped_step, ()
783
784
785
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
        stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        for hyperparameter in (self.learning_rate, self.epsilon):
            add_role(hyperparameter, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Scale the step by the root of the accumulated squared steps.

        Returns the new step together with the update that stores the
        new accumulated sum of squares in its buffer.

        """
        # The buffer name carries the parameter's name when it has one.
        suffix = '_' + parameter.name if parameter.name else ''
        sum_of_squares = _create_algorithm_buffer_for(
            parameter, name='adagrad_sqs' + suffix)

        new_sum_of_squares = tensor.sqr(previous_step) + sum_of_squares
        denominator = tensor.sqrt(new_sum_of_squares) + self.epsilon
        scaled_step = self.learning_rate * previous_step / denominator

        return scaled_step, [(sum_of_squares, new_sum_of_squares)]
826
827
828
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Default value is set to 1e-8.
    decay_factor : float, optional
        Default value is set to 1 - 1e-8.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        # Each hyperparameter becomes a shared variable stored under its
        # own name and tagged with the hyperparameter role.
        settings = [("learning_rate", learning_rate),
                    ("beta1", beta1),
                    ("beta2", beta2),
                    ("epsilon", epsilon),
                    ("decay_factor", decay_factor)]
        for attribute, value in settings:
            variable = shared_floatx(value, attribute)
            add_role(variable, ALGORITHM_HYPERPARAMETER)
            setattr(self, attribute, variable)

    def compute_step(self, parameter, previous_step):
        """Compute the Adam step and the updates of its buffers.

        Returns the new step together with the updates for the mean,
        variance and time buffers created for `parameter`.

        """
        first_moment = _create_algorithm_buffer_for(parameter, 'mean')
        second_moment = _create_algorithm_buffer_for(parameter, 'variance')
        step_counter = shared_floatx(0., 'time')
        add_role(step_counter, ALGORITHM_BUFFER)

        next_count = step_counter + 1
        # Learning rate rescaled by factors that depend on the step count.
        rate_numerator = (self.learning_rate *
                          tensor.sqrt((1. - (1. - self.beta2)**next_count)))
        effective_rate = (rate_numerator /
                          (1. - (1. - self.beta1)**next_count))
        # First-moment mixing weight, modified by powers of decay_factor.
        mixing_weight = (1 - (1 - self.beta1) *
                         self.decay_factor ** (next_count - 1))
        new_first_moment = (mixing_weight * previous_step +
                            (1. - mixing_weight) * first_moment)
        new_second_moment = (self.beta2 * tensor.sqr(previous_step) +
                             (1. - self.beta2) * second_moment)
        adam_step = (effective_rate * new_first_moment /
                     (tensor.sqrt(new_second_moment) + self.epsilon))

        buffer_updates = [(first_moment, new_first_moment),
                          (second_moment, new_second_moment),
                          (step_counter, next_count)]

        return adam_step, buffer_updates
886
887
888
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    Replaces a step (the parameter update of a single shared variable)
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
    step rescaling the parameters.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Swap in a parameter-rescaling step when the step sum is not finite.

        Returns the resulting step and an empty list of updates.

        """
        # The sum of the step is tested rather than every element.
        total = tensor.sum(previous_step)
        non_finite_flags = (tensor.isnan(total) +
                            tensor.isinf(total))
        # With the default scaler of 1 this replacement step is zero.
        replacement = (1 - self.scaler) * parameter
        checked_step = tensor.switch(
            non_finite_flags > 0, replacement, previous_step)
        return checked_step, []
921
922
923
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Example applications include clipping steps on only certain parameters,
    or scaling a certain kind of parameter's updates (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Apply the wrapped rule to the selected variables only.

        Returns a mapping with the same keys, in the same order, as
        `previous_steps`, together with the wrapped rule's updates.

        """
        selected_steps = dict_subset(previous_steps, self.variables)
        restricted_steps, updates = self.step_rule.compute_steps(
            selected_steps)
        # Variables the wrapped rule did not return keep their old step.
        combined = OrderedDict()
        for parameter in previous_steps:
            if parameter in restricted_steps:
                combined[parameter] = restricted_steps[parameter]
            else:
                combined[parameter] = previous_steps[parameter]
        return combined, updates
953