Test Failed
Pull Request — master (#1191)
by
unknown
16:36
created

UpdatesAlgorithm   A

Complexity

Total Complexity 21

Size/Duplication

Total Lines 106
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 106
rs 10
wmc 21

7 Methods

Rating   Name   Duplication   Size   Complexity  
A __init__() 0 7 3
D _validate_source_names() 0 26 8
A check_sanity() 0 3 1
A initialize() 0 9 2
A updates() 0 3 1
A process_batch() 0 4 2
A add_updates() 0 16 3
1
"""Training algorithms."""
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from collections import Mapping
7
from six.moves import reduce
8
9
from picklable_itertools.extras import equizip
10
11
import theano
12
from six import add_metaclass
13
from theano import tensor
14
15
from blocks.graph import ComputationGraph
16
from blocks.model import Model
17
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
18
from blocks.theano_expressions import l2_norm
19
from blocks.utils import dict_subset, pack, shared_floatx
20
from blocks.utils.theano_utils import shared_floatx_zeros_matching
21
22
logger = logging.getLogger(__name__)
23
24
25
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create an algorithm buffer associated with `param`.

    The buffer is built by :func:`shared_floatx_zeros_matching`, which
    receives `param` followed by any extra positional and keyword
    arguments. The parameter is recorded on the buffer's tag as
    ``for_parameter`` and the buffer is given the ``ALGORITHM_BUFFER``
    role before being returned.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
30
31
32
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm is simple: its
    :meth:`initialize` method is called first (this is where, for
    instance, Theano functions can be compiled), after which
    :meth:`process_batch` is called repeatedly, each time with one
    batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Prepare the training algorithm for use."""

    @abstractmethod
    def process_batch(self, batch):
        """Run the algorithm on a single batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
59
60
61
# Message template for the case where a batch provides sources that do not
# match the names of the Theano input variables. It is filled in with
# `.format(sources=..., variables=...)` by
# `UpdatesAlgorithm._validate_source_names`.
# NOTE(review): the bare numbers inside these literals look like line-number
# residue from a rendered code listing -- confirm before relying on the text.
variable_mismatch_error = """
62
63
Blocks tried to match the sources ({sources}) of the training dataset to \
64
the names of the Theano variables ({variables}), but failed to do so. \
65
If you want to train on a subset of the sources that your dataset provides, \
66
pass the `sources` keyword argument to its constructor, use the \
67
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
68
or on_unused_sources='ignore' to the GradientDescent algorithm."""
69
70
# Message template for the case where some Theano input variable has no
# source of the same name in the batch; formatted with `sources` and
# `variables` like the template above.
source_missing_error = """
71
72
Blocks didn't find all the sources ({sources}) of the training dataset \
73
that match the names of the Theano variables ({variables})."""
74
75
76
# Message raised (as a ValueError) by `GradientDescent.__init__` when the
# parameter list would have to be inferred from a gradients mapping that is
# not an OrderedDict. It has no format placeholders.
determinism_error = """Cannot infer parameter list in a fixed order.
77
78
Because dictionaries are unordered (and Python uses randomized hashing, \
79
which can change the iteration order over the same dictionary from one \
80
interpreter session to the next), Blocks cannot infer the parameters list \
81
from a plain dictionary of gradients in an order that is reproducible \
82
across interpreter sessions; please either specify the parameters \
83
explicitly or pass gradients as an OrderedDict (though exercise care in \
84
constructing that OrderedDict, as an OrderedDict created by iterating \
85
over an unordered iterable (e.g. a dict) will still have an arbitrary \
86
and unpredictable order that could cause problems with \
87
reproducibility)."""
88
89
90
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed. An `OrderedDict` is
        converted to a list of ``(variable, new value)`` pairs.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources in a batch are used
        (i.e. there is no variable with a matching name in the inputs
        of the computational graph of the updates).

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 on_unused_sources='raise', **kwargs):
        self.updates = [] if updates is None else updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        self.on_unused_sources = on_unused_sources
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Infer the graph inputs and compile the training function.

        The inputs are taken from the computation graph of the new
        values of all updates; the compiled function has no outputs and
        only performs the updates.

        """
        logger.info("Initializing the training algorithm")
        update_values = [new_value for _, new_value in self.updates]
        logger.debug("Inferring graph inputs...")
        self.inputs = ComputationGraph(update_values).inputs
        logger.debug("Compiling training function...")
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    @property
    def updates(self):
        """The updates performed for every batch."""
        return self._updates

    @updates.setter
    def updates(self, value):
        # An OrderedDict is a documented input, but `initialize` unpacks
        # (variable, new value) pairs and `add_updates` calls `.extend`,
        # both of which need a list of pairs rather than a mapping.
        if isinstance(value, OrderedDict):
            value = list(value.items())
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an `OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError("updates must be a list of (variable, new "
                             "value) tuples or an OrderedDict, got {}"
                             .format(type(updates).__name__))
        self.updates.extend(updates)

    def _validate_source_names(self, batch):
        """Check the batch sources against the names of the graph inputs.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        Raises
        ------
        ValueError
            If an input variable has no source of the same name, if
            there are unused sources and `on_unused_sources` is
            'raise', or if `on_unused_sources` has an unknown value.

        """
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Only warn for the first offending batch.
                if not hasattr(self, '_unused_source_warned'):
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch and run the compiled function on it.

        The batch entries are passed to the function in the order of
        `self.inputs`, looked up by variable name.

        """
        self._validate_source_names(batch)
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)

    def check_sanity(self, model):
        """Do nothing; subclasses may override to check `model`."""
        pass
196
197
198
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized. Unused if `gradients` is specified.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients` (in which case `gradients` *must* be an
        `OrderedDict`).
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : OrderedDict or list of 2-tuples, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter, or equivalently, a list of
        (parameter, gradient) tuples. If ``None``, the gradients
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    If a cost and parameters are provided, gradients are taken immediately
    upon construction, and changes to these attributes after construction
    will have no effect.

    `gradients` must be an `OrderedDict` if `parameters` is unspecified
    because ordinary dictionaries have an unpredictable iteration
    order due to hash randomization (which is enabled by default since
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
    when combined with Theano's heuristic graph optimizations, can cause
    serious reproducibility issues.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
        # as we don't want to convert dict -> OrderedDict and give it
        # an arbitrary, non-deterministic order.
        if gradients is not None and not isinstance(gradients, Mapping):
            gradients = OrderedDict(gradients)
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    logger.warning("Specified parameters list does not "
                                   "match keys in provided gradient "
                                   "dictionary; using parameters inferred "
                                   "from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-associative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        kwargs.setdefault('updates', []).extend(
            itertools.chain(((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )
        super(GradientDescent, self).__init__(**kwargs)

    def _compute_gradients(self, known_grads, consider_constant):
        """Take the gradient of the cost with respect to the parameters.

        Parameters
        ----------
        known_grads : dict or None
            Passed through to `tensor.grad`.
        consider_constant : list or None
            Passed through to `tensor.grad`.

        Returns
        -------
        gradients : OrderedDict
            Maps each parameter to its gradient expression, in the
            order of `self.parameters`.

        Raises
        ------
        ValueError
            If no cost or no parameters were specified.

        """
        if self.cost is None:
            raise ValueError("can't infer gradients; no cost specified")
        elif self.parameters is None or len(self.parameters) == 0:
            raise ValueError("can't infer gradients; no parameters "
                             "specified")
        # While this strictly speaking could be a dict and not an
        # OrderedDict (because we iterate over it in the order of
        # self.parameters), this guards a little bit against
        # nondeterminism introduced by future refactoring.
        logger.info("Taking the cost gradient")
        gradients = OrderedDict(
            equizip(self.parameters, tensor.grad(
                self.cost, self.parameters,
                known_grads=known_grads,
                consider_constant=consider_constant)))
        logger.info("The cost gradient computation graph is built")
        return gradients

    def check_sanity(self, model):
        """Warn if the model's parameters differ from the optimized ones.

        Parameters
        ----------
        model : object
            The object to check against. The comparison is only made
            when it is an instance of :class:`~blocks.model.Model`;
            anything else is ignored.

        """
        # Sanity check for the most common case. The check has to use the
        # `model` argument and this algorithm's own parameter list: this
        # class never sets `_model` or `algorithm` attributes.
        if isinstance(model, Model):
            if not (set(model.get_parameter_dict().values()) ==
                    set(self.parameters)):
                logger.warning("different parameters for model and algorithm")
375
376
377
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        This method is called by default implementation of
        :meth:`compute_steps`, it relieves from writing a loop each time.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`. Empty if `previous_steps` is empty.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # An explicit loop (rather than transposing the per-parameter
        # results with zip) keeps the iteration order of `previous_steps`
        # and also works when there are no parameters at all.
        steps = OrderedDict()
        updates = []
        for parameter, previous_step in previous_steps.items():
            step, step_updates = self.compute_step(parameter, previous_step)
            steps[parameter] = step
            updates.extend(step_updates)
        return steps, updates
441
442
443
class CompositeRule(StepRule):
    """Applies a sequence of step rules one after another.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The step rules to chain. They are applied in the given order,
        each one receiving the steps produced by the one before it.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Feed the steps through every component and gather all updates."""
        collected_updates = []
        current_steps = previous_steps
        for component in self.components:
            current_steps, component_updates = (
                component.compute_steps(current_steps))
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
463
464
465
class Scale(StepRule):
    """Multiplies the previous step by a learning rate.

    Used on its own in :class:`GradientDescent`, this step rule
    implements steepest descent.

    Parameters
    ----------
    learning_rate : float
        The factor by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        self.learning_rate = rate
        add_role(rate, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Return the scaled step; this rule has no updates of its own."""
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
489
490
491
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included experience,
    look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        self.momentum = coefficient
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity buffer to the previous step.

        The resulting step is also stored back into the velocity
        buffer through the returned update.

        """
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        step = self.momentum * velocity + previous_step
        return step, [(velocity, step)]
515
516
517
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    A composite of :class:`Scale` followed by :class:`BasicMomentum`,
    which together form the usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # The scaling is applied first, then the momentum accumulation.
        self.components = [scale_rule, momentum_rule]
        # Expose the hyperparameters of the two components directly.
        self.learning_rate = scale_rule.learning_rate
        self.momentum = momentum_rule.momentum
548
549
550
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        # Both hyperparameters are stored as shared variables.
        shared_decay = shared_floatx(decay_rate, "decay_rate")
        self.decay_rate = shared_decay
        add_role(shared_decay, ALGORITHM_HYPERPARAMETER)
        shared_epsilon = shared_floatx(epsilon, "epsilon")
        self.epsilon = shared_epsilon
        add_role(shared_epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Build the AdaDelta step and the updates of its two buffers.

        One buffer tracks the decayed mean of squared previous steps,
        the other the decayed mean of squared produced steps.

        """
        decay = self.decay_rate
        eps = self.epsilon
        ms_step_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        ms_delta_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Decayed running mean of the squared incoming steps.
        ms_step_now = (decay * ms_step_prev +
                       (1 - decay) * tensor.sqr(previous_step))

        rms_delta_prev = tensor.sqrt(ms_delta_prev + eps)
        rms_step_now = tensor.sqrt(ms_step_now + eps)
        delta_x = rms_delta_prev / rms_step_now * previous_step

        # Decayed running mean of the squared produced steps.
        ms_delta_now = (decay * ms_delta_prev +
                        (1 - decay) * tensor.sqr(delta_x))

        buffer_updates = [(ms_step_prev, ms_step_now),
                          (ms_delta_prev, ms_delta_now)]
        return delta_x, buffer_updates
600
601
602
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is meant to be combined with another step rule,
    _e.g._ :class:`Scale`. For an all-batteries-included experience,
    look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        shared_decay = shared_floatx(decay_rate, "decay_rate")
        self.decay_rate = shared_decay
        add_role(shared_decay, ALGORITHM_HYPERPARAMETER)
        # Lower bound for the RMS, so the scaling never exceeds max_scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the step by the bounded RMS of recent steps."""
        decay = self.decay_rate
        ms_step_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        ms_step_now = (decay * ms_step_prev +
                       (1 - decay) * tensor.sqr(previous_step))
        bounded_rms = tensor.maximum(tensor.sqrt(ms_step_now), self.epsilon)
        normalized_step = previous_step / bounded_rms
        return normalized_step, [(ms_step_prev, ms_step_now)]
648
649
650
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    A composite of :class:`BasicRMSProp` followed by :class:`Scale`,
    forming the step rule described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        rms_rule = BasicRMSProp(decay_rate=decay_rate,
                                max_scaling=max_scaling)
        scale_rule = Scale(learning_rate=learning_rate)
        # The normalization is applied first, then the scaling.
        self.components = [rms_rule, scale_rule]
        # Expose the hyperparameters of the two components directly.
        self.learning_rate = scale_rule.learning_rate
        self.decay_rate = rms_rule.decay_rate
690
691
692
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used.

    """
    def __init__(self, threshold=None):
        if threshold is None:
            self.threshold = None
        else:
            shared_threshold = shared_floatx(threshold, "threshold")
            add_role(shared_threshold, ALGORITHM_HYPERPARAMETER)
            self.threshold = shared_threshold

    def compute_steps(self, previous_steps):
        """Scale all steps by a common factor so the joint norm is bounded.

        Without a threshold the previous steps are returned unchanged.
        This rule never produces updates.

        """
        if self.threshold is None:
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        # Leave the steps alone while the norm is below the threshold.
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        rescaled_steps = OrderedDict()
        for parameter, step in previous_steps.items():
            rescaled_steps[parameter] = step * multiplier
        return rescaled_steps, []
728
729
730
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Raises
    ------
    ValueError
        At construction, if `axis` contains the same axis more than
        once; in :meth:`compute_step`, if one of the axes is not
        smaller than the number of dimensions of the step.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        if axis is None:
            axes = ()
        else:
            axes = pack(axis)
        unique_axes = set(axes)
        self.axis = unique_axes

        limit = shared_floatx(threshold, "threshold")
        add_role(limit, ALGORITHM_HYPERPARAMETER)
        self.threshold = limit

        # A size mismatch means that some axis was listed twice.
        if len(unique_axes) != len(axes):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        ndim = previous_step.ndim
        for ax in self.axis:
            if ax >= ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, ndim))

        # Value the parameter would take if `previous_step` was applied.
        candidate = parameter - previous_step
        if not self.axis:
            # No axes given: a single norm over the whole tensor.
            norms = l2_norm([candidate])
        else:
            # Sum the squares over each requested axis in turn, keeping
            # the dimensions so that the norms broadcast against the
            # tensor below.
            summed = tensor.sqr(candidate)
            for ax in sorted(self.axis):
                summed = summed.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(summed)

        # We want a step s* such that parameter - s* equals the
        # candidate value scaled by threshold / norm whenever
        # threshold < norm.
        rescaled_candidate = (self.threshold / norms) * candidate
        shrinking_step = parameter - rescaled_candidate
        clipped_step = tensor.switch(norms > self.threshold,
                                     shrinking_step,
                                     previous_step)
        return clipped_step, ()
805
806
807
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Each step is divided by the square root of the running sum of the
    squared previous steps seen so far for that parameter.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Attributes
    ----------
    learning_rate : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the step size.
    epsilon : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the stabilizing constant.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Name the accumulator after the parameter when it has a name.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        # Running sum of squared steps, one buffer per parameter.
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # The current step is included in the sum before scaling.
        ssq_t = tensor.sqr(previous_step) + ssq
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
848
849
850
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.9.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.999.
    epsilon : float, optional
        Constant added to the square root of the second moment
        estimate in the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Factor by which `beta1` is decayed over time: at time step
        ``t`` (starting from 1) the rate used is
        ``beta1 * decay_factor ** (t - 1)``.
        Default value is set to 1.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.9, beta2=0.999, epsilon=1e-8,
                 decay_factor=1):
        # Every hyperparameter becomes a named shared variable tagged
        # with the hyperparameter role and stored under its own name.
        hyperparameters = (("learning_rate", learning_rate),
                           ("beta1", beta1),
                           ("beta2", beta2),
                           ("epsilon", epsilon),
                           ("decay_factor", decay_factor))
        for name, value in hyperparameters:
            shared = shared_floatx(value, name)
            add_role(shared, ALGORITHM_HYPERPARAMETER)
            setattr(self, name, shared)

    def compute_step(self, parameter, previous_step):
        # Buffers for the first and second moment estimates, plus a
        # step counter created anew for each parameter.
        mean = _create_algorithm_buffer_for(parameter, 'mean')
        variance = _create_algorithm_buffer_for(parameter, 'variance')
        time = shared_floatx(0., 'time')
        add_role(time, ALGORITHM_BUFFER)

        next_time = time + 1
        beta_1_decayed = self.beta1 * self.decay_factor ** (next_time - 1)

        # Learning rate including the bias-correction terms.
        correction = (tensor.sqrt(1. - self.beta2**next_time) /
                      (1. - beta_1_decayed**next_time))
        learning_rate = self.learning_rate * correction

        # Exponential moving averages of the step and of its square.
        mean_t = beta_1_decayed * mean + (1. - beta_1_decayed) * previous_step
        variance_t = (self.beta2 * variance +
                      (1. - self.beta2) * tensor.sqr(previous_step))

        denominator = tensor.sqrt(variance_t) + self.epsilon
        step = learning_rate * mean_t / denominator

        updates = [(mean, mean_t),
                   (variance, variance_t),
                   (time, next_time)]

        return step, updates
908
909
910
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    Replaces a step (the parameter update of a single shared variable)
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
    step rescaling the parameters.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    Non-finite values are detected on the sum of the step's elements
    rather than on each element individually.

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        # Check the sum of the elements for NaN or infinity.
        total = tensor.sum(previous_step)
        nan_flag = tensor.isnan(total)
        inf_flag = tensor.isinf(total)
        not_finite = nan_flag + inf_flag

        # Replacement step: (1 - scaler) * parameter, computed from the
        # parameter alone.
        rescaling_step = (1 - self.scaler) * parameter
        safe_step = tensor.switch(
            not_finite > 0, rescaling_step, previous_step)
        return safe_step, []
943
944
945
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Example applications include clipping steps on only certain parameters,
    or scaling a certain kind of parameter's updates (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        # Run the wrapped rule on the selected variables only.
        selected = dict_subset(previous_steps, self.variables)
        steps, updates = self.step_rule.compute_steps(selected)

        # Rebuild the full mapping in the order of `previous_steps`:
        # take the wrapped rule's step where it produced one and keep
        # the incoming step otherwise.
        actual = OrderedDict()
        for parameter, untouched_step in previous_steps.items():
            if parameter in steps:
                actual[parameter] = steps[parameter]
            else:
                actual[parameter] = untouched_step
        return actual, updates
975