Completed
Push — master ( 6b68ea...3db89c )
by Dmitry
64:40 queued 09:28
created

blocks/algorithms/__init__.py (9 issues)

Labels
Severity
1
"""Training algorithms."""
0 ignored issues
show
There seems to be a cyclic import (blocks.bricks.base -> blocks.graph -> blocks.graph.bn -> blocks.filter).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.recurrent -> blocks.bricks.recurrent.misc -> blocks.bricks.recurrent.base -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.recurrent -> blocks.bricks.recurrent.misc -> blocks.bricks.parallel -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.recurrent -> blocks.bricks.recurrent.architectures -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from collections import Mapping
7
from six.moves import reduce
8
9
from picklable_itertools.extras import equizip
10
11
import theano
12
from six import add_metaclass
13
from theano import tensor
14
15
from blocks.graph import ComputationGraph
16
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
17
from blocks.theano_expressions import l2_norm
18
from blocks.utils import (dict_subset, pack, shared_floatx,
19
                          shared_floatx_zeros_matching)
20
21
logger = logging.getLogger(__name__)
22
23
24
def _create_algorithm_buffer_for(param, *args, **kwargs):
25
    buf = shared_floatx_zeros_matching(param, *args, **kwargs)
26
    buf.tag.for_parameter = param
27
    add_role(buf, ALGORITHM_BUFFER)
28
    return buf
29
30
31
@add_metaclass(ABCMeta)
32
class TrainingAlgorithm(object):
33
    """Base class for training algorithms.
34
35
    A training algorithm object has a simple life-cycle.
36
    First it is initialized by calling its :meth:`initialize` method.
37
    At this stage, for instance, Theano functions can be compiled.
38
    After that the :meth:`process_batch` method is repeatedly
39
    called with a batch of training data as a parameter.
40
41
    """
42
    @abstractmethod
43
    def initialize(self, **kwargs):
44
        """Initialize the training algorithm."""
45
        pass
46
47
    @abstractmethod
48
    def process_batch(self, batch):
49
        """Process a batch of training data.
50
51
        Attributes
52
        ----------
53
        batch : dict
54
            A dictionary of (source name, data) pairs.
55
56
        """
57
        pass
58
59
60
variable_mismatch_error = """
61
62
Blocks tried to match the sources ({sources}) of the training dataset to \
63
the names of the Theano variables ({variables}), but failed to do so. \
64
If you want to train on a subset of the sources that your dataset provides, \
65
pass the `sources` keyword argument to its constructor, use the \
66
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
67
or on_unused_sources='ignore' to the GradientDescent algorithm."""
68
69
source_missing_error = """
70
71
Blocks didn't find all the sources ({sources}) of the training dataset \
72
that match the names of the Theano variables ({variables})."""
73
74
75
determinism_error = """Cannot infer parameter list in a fixed order.
76
77
Because dictionaries are unordered (and Python uses randomized hashing, \
78
which can change the iteration order over the same dictionary from one \
79
interpreter session to the next), Blocks cannot infer the parameters list \
80
from a plain dictionary of gradients in an order that is reproducible \
81
across interpreter sessions; please either specify the parameters \
82
explicitly or pass gradients as an OrderedDict (though exercise care in \
83
constructing that OrderedDict, as an OrderedDict created by iterating \
84
over an unordered iterable (e.g. a dict) will still have an arbitrary \
85
and unpredictable order that could cause problems with \
86
reproducibility)."""
87
88
89
class UpdatesAlgorithm(TrainingAlgorithm):
90
    """Base class for algorithms that use Theano functions with updates.
91
92
    Parameters
93
    ----------
94
    updates : list of tuples or :class:`~collections.OrderedDict`
95
        The updates that should be performed.
96
    theano_func_kwargs : dict, optional
97
        A passthrough to `theano.function` for additional arguments.
98
        Useful for passing `profile` or `mode` arguments to the theano
99
        function that will be compiled for the algorithm.
100
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
101
        Controls behavior when not all sources in a batch are used
102
        (i.e. there is no variable with a matching name in the inputs
103
        of the computational graph of the updates).
104
105
    Attributes
106
    ----------
107
    updates : list of :class:`~tensor.TensorSharedVariable` updates
108
        Updates to be done for every batch. It is required that the
109
        updates are done using the old values of optimized parameters.
110
111
    Notes
112
    -----
113
    Changing `updates` attribute or calling `add_updates` after
114
    the `initialize` method is called will have no effect.
115
116
    """
117
    def __init__(self, updates=None, theano_func_kwargs=None,
118
                 on_unused_sources='raise', **kwargs):
119
        self.updates = [] if updates is None else updates
120
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
121
                                   is not None else dict())
122
        self.on_unused_sources = on_unused_sources
123
        super(UpdatesAlgorithm, self).__init__(**kwargs)
124
125
    def initialize(self):
126
        logger.info("Initializing the training algorithm")
127
        update_values = [new_value for _, new_value in self.updates]
128
        logger.debug("Inferring graph inputs...")
129
        self.inputs = ComputationGraph(update_values).inputs
130
        logger.debug("Compiling training function...")
131
        self._function = theano.function(
132
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
133
        logger.info("The training algorithm is initialized")
134
135
    @property
136
    def updates(self):
137
        return self._updates
138
139
    @updates.setter
140
    def updates(self, value):
141
        self._updates = value
142
143
    def add_updates(self, updates):
144
        """Add updates to the training process.
145
146
        The updates will be done _before_ the parameters are changed.
147
148
        Parameters
149
        ----------
150
        updates : list of tuples or :class:`~collections.OrderedDict`
151
            The updates to add.
152
153
        """
154
        if isinstance(updates, OrderedDict):
155
            updates = list(updates.items())
156
        if not isinstance(updates, list):
157
            raise ValueError
158
        self.updates.extend(updates)
159
160
    def _validate_source_names(self, batch):
161
        in_names = [v.name for v in self.inputs]
162
163
        if not set(in_names).issubset(set(batch.keys())):
164
            raise ValueError("Didn't find all sources: " +
165
                             source_missing_error.format(
166
                                 sources=batch.keys(),
167
                                 variables=in_names))
168
        if not set(batch.keys()).issubset(set(in_names)):
169
            if self.on_unused_sources == 'ignore':
170
                pass
171
            elif self.on_unused_sources == 'warn':
172
                if not hasattr(self, '_unused_source_warned'):
173
                    logger.warn(variable_mismatch_error.format(
174
                        sources=batch.keys(),
175
                        variables=in_names))
176
                self._unused_source_warned = True
177
            elif self.on_unused_sources == 'raise':
178
                raise ValueError(
179
                    "mismatch of variable names and data sources" +
180
                    variable_mismatch_error.format(
181
                        sources=batch.keys(),
182
                        variables=in_names))
183
            else:
184
                raise ValueError("Wrong value of on_unused_sources: {}."
185
                                 .format(self.on_unused_sources))
186
187
    def process_batch(self, batch):
188
        self._validate_source_names(batch)
189
        ordered_batch = [batch[v.name] for v in self.inputs]
190
        self._function(*ordered_batch)
191
192
193
class GradientDescent(UpdatesAlgorithm):
194
    """A base class for all gradient descent algorithms.
195
196
    By "gradient descent" we mean a training algorithm of the following
197
    form:
198
199
    .. code-block::  python
200
201
        for batch in data:
202
            steps = step_rule.compute_steps(parameters,
203
                                            gradients_wr_parameters)
204
            for parameter in parameters:
205
                parameter -= steps[parameter]
206
207
    Note, that the step is *subtracted, not added*! This is done in order
208
    to make step rule chaining possible.
209
210
    Parameters
211
    ----------
212
    cost : :class:`~tensor.TensorVariable`, optional
213
        The objective to be minimized. Unused if `gradients` is specified.
214
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
215
        The parameters to be tuned. If not provided, inferred from the
216
        keys of `gradients` (in which case `gradients` *must* be an
217
        `OrderedDict`).
218
    step_rule : instance of :class:`StepRule`, optional
219
        An object encapsulating most of the algorithm's logic. Its
220
        `compute_steps` method is called to get Theano expression for
221
        steps.  Note, that the step rule might have a state, e.g. to
222
        remember a weighted sum of gradients from previous steps like it is
223
        done in gradient descent with momentum. If ``None``, an instance of
224
        :class:`Scale` is created.
225
    gradients : OrderedDict or list of 2-tuples, optional
226
        A dictionary mapping a parameter to an expression for the cost's
227
        gradient with respect to the parameter, or equivalently, a list of
228
        (parameter, gradient) tuples. If ``None``, the gradient
229
        are taken automatically using :func:`theano.gradient.grad`.
230
    known_grads : dict, optional
231
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
232
        Useful when you know the [approximate] gradients of some
233
        sub-expressions and would like Theano to use that information
234
        to compute parameter gradients. Only makes sense when `gradients`
235
        is `None`.
236
    consider_constant : list, optional
237
        A passthrough to `theano.tensor.grad`'s `consider_constant`
238
        argument.  A list of expressions through which gradients will not
239
        be backpropagated. Only makes sense when `gradients` is `None`.
240
241
    Attributes
242
    ----------
243
    gradients : OrderedDict
244
        The gradient dictionary.
245
    step_rule : instance of :class:`StepRule`
246
        The step rule.
247
248
    Notes
249
    -----
250
    Changing `updates` attribute or calling `add_updates` after
251
    the `initialize` method is called will have no effect.
252
253
    If a cost and parameters are provided, gradients are taken immediately
254
    upon construction, and changes to these attributes after construction
255
    will have no effect.
256
257
    `gradients` must be an `OrderedDict` if `parameters` is unspecified
258
    because ordinary dictionaries have an unpredictable iteration
259
    order due to hash randomization (which is enabled by default since
260
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
261
    when combined with Theano's heuristic graph optimizations, can cause
262
    serious reproducibility issues.
263
264
    """
265
    def __init__(self, cost=None, parameters=None, step_rule=None,
266
                 gradients=None, known_grads=None, consider_constant=None,
267
                 **kwargs):
268
        # Set initial values for cost, parameters, gradients.
269
        self.cost = cost
270
        self.parameters = parameters
271
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
272
        # as we don't want to convert dict -> OrderedDict and give it
273
        # an arbitrary, non-deterministic order.
274
        if gradients is not None and not isinstance(gradients, Mapping):
275
            gradients = OrderedDict(gradients)
276
        self.gradients = gradients
277
278
        # If we don't have gradients, we'll need to infer them from the
279
        # cost and the parameters, both of which must not be None.
280
        if not self.gradients:
281
            self.gradients = self._compute_gradients(known_grads,
282
                                                     consider_constant)
283
        else:
284
            if cost is not None:
285
                logger.warning(('{}: gradients already specified directly; '
286
                                'cost is unused.'
287
                                .format(self.__class__.__name__)))
288
            if self.parameters is None and isinstance(gradients, OrderedDict):
289
                # If the dictionary is ordered, it's safe to use the keys
290
                # as they have a deterministic order.
291
                self.parameters = list(self.gradients.keys())
292
            elif self.parameters is not None:
293
                # If parameters and gradients.keys() don't match we can
294
                # try to recover if gradients is ordered.
295
                if set(self.parameters) != set(self.gradients.keys()):
296
                    logger.warn("Specified parameters list does not match "
297
                                "keys in provided gradient dictionary; "
298
                                "using parameters inferred from gradients")
299
                    if not isinstance(self.gradients, OrderedDict):
300
                        raise ValueError(determinism_error)
301
                    self.parameters = list(self.gradients.keys())
302
            else:
303
                # self.parameters is not None, and gradients isn't
304
                # an OrderedDict. We can't do anything safe.
305
                raise ValueError(determinism_error)
306
            if known_grads:
307
                raise ValueError("known_grads has no effect when gradients "
308
                                 "are passed in")
309
            if consider_constant is not None:
310
                raise ValueError("consider_constant has no effect when "
311
                                 "gradients are passed in")
312
313
        # The order in which the different gradient terms appears
314
        # here matters, as floating point addition is non-commutative (and
315
        # Theano's graph optimizations are not order-independent).
316
        # This is why we do not use .values().
317
        gradient_values = [self.gradients[p] for p in self.parameters]
318
        self.total_gradient_norm = (l2_norm(gradient_values)
319
                                    .copy(name="total_gradient_norm"))
320
321
        self.step_rule = step_rule if step_rule else Scale()
322
        logger.debug("Computing parameter steps...")
323
        self.steps, self.step_rule_updates = (
324
            self.step_rule.compute_steps(self.gradients))
325
326
        # Same as gradient_values above: the order may influence a
327
        # bunch of things, so enforce a consistent one (don't use
328
        # .values()).
329
        step_values = [self.steps[p] for p in self.parameters]
330
        self.total_step_norm = (l2_norm(step_values)
331
                                .copy(name="total_step_norm"))
332
333
        # Once again, iterating on gradients may not be deterministically
334
        # ordered if it is not an OrderedDict. We add the updates here in
335
        # the order specified in self.parameters. Keep it this way to
336
        # maintain reproducibility.
337
        kwargs.setdefault('updates', []).extend(
338
            itertools.chain(((parameter, parameter - self.steps[parameter])
339
                             for parameter in self.parameters),
340
                            self.step_rule_updates)
341
        )
342
        super(GradientDescent, self).__init__(**kwargs)
343
344
    def _compute_gradients(self, known_grads, consider_constant):
345
        if self.cost is None:
346
            raise ValueError("can't infer gradients; no cost specified")
347
        elif self.parameters is None or len(self.parameters) == 0:
348
            raise ValueError("can't infer gradients; no parameters "
349
                             "specified")
350
        # While this strictly speaking could be a dict and not an
351
        # OrderedDict (because we iterate over it in the order of
352
        # self.parameters), this guards a little bit against
353
        # nondeterminism introduced by future refactoring.
354
        logger.info("Taking the cost gradient")
355
        gradients = OrderedDict(
356
            equizip(self.parameters, tensor.grad(
357
                self.cost, self.parameters,
358
                known_grads=known_grads,
359
                consider_constant=consider_constant)))
360
        logger.info("The cost gradient computation graph is built")
361
        return gradients
362
363
364
@add_metaclass(ABCMeta)
365
class StepRule(object):
366
    """A rule to compute steps for a gradient descent algorithm."""
367
    def compute_step(self, parameter, previous_step):
368
        """Build a Theano expression for the step for a parameter.
369
370
        This method is called by default implementation of
371
        :meth:`compute_steps`, it relieves from writing a loop each time.
372
373
        Parameters
374
        ----------
375
        parameter : :class:`~tensor.TensorSharedVariable`
376
            The parameter.
377
        previous_step : :class:`~tensor.TensorVariable`
378
            Some quantity related to the gradient of the cost with respect
379
            to the parameter, either the gradient itself or a step in a
380
            related direction.
381
382
        Returns
383
        -------
384
        step : :class:`~theano.Variable`
385
            Theano variable for the step to take.
386
        updates : list
387
            A list of tuples representing updates to be performed. This
388
            is useful for stateful rules such as :class:`Momentum` which
389
            need to update shared variables after itetations.
390
391
        """
392
        raise NotImplementedError
393
394
    def compute_steps(self, previous_steps):
395
        """Build a Theano expression for steps for all parameters.
396
397
        Override this method if you want to process the steps
398
        with respect to all parameters as a whole, not parameter-wise.
399
400
        Parameters
401
        ----------
402
        previous_steps : OrderedDict
403
            An :class:`~OrderedDict` of
404
            (:class:`~tensor.TensorSharedVariable`
405
            :class:`~tensor.TensorVariable`) pairs. The keys are the
406
            parameters being trained, the values are the expressions for
407
            quantities related to gradients of the cost with respect to
408
            the parameters, either the gradients themselves or steps in
409
            related directions.
410
411
        Returns
412
        -------
413
        steps : OrderedDict
414
            A dictionary of the proposed steps in the same form as
415
            `previous_steps`.
416
        updates : list
417
            A list of tuples representing updates to be performed.
418
419
        """
420
        parameter_wise = [self.compute_step(parameter,
421
                                            previous_steps[parameter])
422
                          for parameter in previous_steps]
423
        steps, updates = equizip(*parameter_wise)
424
        steps = OrderedDict((parameter, step) for parameter, step
425
                            in equizip(previous_steps.keys(), steps))
426
        updates = list(itertools.chain(*updates))
427
        return steps, updates
428
429
430
class CompositeRule(StepRule):
431
    """Chains several step rules.
432
433
    Parameters
434
    ----------
435
    components : list of :class:`StepRule`
436
        The learning rules to be chained. The rules will be applied in the
437
        order as given.
438
439
    """
440
    def __init__(self, components):
441
        self.components = components
442
443
    def compute_steps(self, previous_steps):
444
        steps = previous_steps
445
        updates = []
446
        for rule in self.components:
447
            steps, more_updates = rule.compute_steps(steps)
448
            updates += more_updates
449
        return steps, updates
450
451
452
class Scale(StepRule):
453
    """A step in the direction proportional to the previous step.
454
455
    If used in :class:`GradientDescent` alone, this step rule implements
456
    steepest descent.
457
458
    Parameters
459
    ----------
460
    learning_rate : float
461
        The learning rate by which the previous step is multiplied to
462
        produce the step.
463
464
    Attributes
465
    ----------
466
    learning_rate : :class:`~tensor.TensorSharedVariable`
467
        The shared variable storing the learning rate used.
468
469
    """
470
    def __init__(self, learning_rate=1.0):
471
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
472
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
473
474
    def compute_step(self, parameter, previous_step):
475
        return self.learning_rate * previous_step, []
476
477
478
class BasicMomentum(StepRule):
479
    """Accumulates step with exponential discount.
480
481
    Parameters
482
    ----------
483
    momentum : float, optional
484
        The momentum coefficient. Defaults to 0.
485
486
    Notes
487
    -----
488
    This step rule is intended to be used in conjunction with another
489
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
490
    experience, look at :class:`Momentum`.
491
492
    """
493
    def __init__(self, momentum=0.):
494
        self.momentum = shared_floatx(momentum, "momentum")
495
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)
496
497
    def compute_step(self, parameter, previous_step):
498
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
499
        step = self.momentum * velocity + previous_step
500
        updates = [(velocity, step)]
501
        return step, updates
502
503
504
class Momentum(CompositeRule):
505
    """Accumulates step with exponential discount.
506
507
    Combines :class:`BasicMomentum` and :class:`Scale` to form the
508
    usual momentum step rule.
509
510
    Parameters
511
    ----------
512
    learning_rate : float, optional
513
        The learning rate by which the previous step scaled. Defaults to 1.
514
    momentum : float, optional
515
        The momentum coefficient. Defaults to 0.
516
517
    Attributes
518
    ----------
519
    learning_rate : :class:`~tensor.SharedVariable`
520
        A variable for learning rate.
521
    momentum : :class:`~tensor.SharedVariable`
522 View Code Duplication
        A variable for momentum.
523
524
    See Also
525
    --------
526
    :class:`SharedVariableModifier`
527
528
    """
529
    def __init__(self, learning_rate=1.0, momentum=0.):
530
        scale = Scale(learning_rate=learning_rate)
531
        basic_momentum = BasicMomentum(momentum=momentum)
532
        self.learning_rate = scale.learning_rate
533
        self.momentum = basic_momentum.momentum
534
        self.components = [scale, basic_momentum]
535
536
537
class AdaDelta(StepRule):
538
    """Adapts the step size over time using only first order information.
539
540
    Parameters
541
    ----------
542
    decay_rate : float, optional
543
        Decay rate in [0, 1]. Defaults to 0.95.
544
    epsilon : float, optional
545
        Stabilizing constant for RMS. Defaults to 1e-6.
546
547
    Notes
548
    -----
549
    For more information, see [ADADELTA]_.
550
551
    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
552
       Rate Method*, arXiv:1212.5701.
553
554
    """
555
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
556
        if not 0.0 <= decay_rate <= 1.0:
557
            raise ValueError("decay rate needs to be in [0, 1]")
558
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
559
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
560
        self.epsilon = shared_floatx(epsilon, "epsilon")
561
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
562
563
    def compute_step(self, parameter, previous_step):
564
        mean_square_step_tm1 = _create_algorithm_buffer_for(
565
            parameter, "mean_square_step_tm1")
566
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
567
            parameter, "mean_square_delta_x_tm1")
568
569
        mean_square_step_t = (
570
            self.decay_rate * mean_square_step_tm1 +
571
            (1 - self.decay_rate) * tensor.sqr(previous_step)
572
        )
573
574
        rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon)
575
        rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon)
576
        delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step
577
578
        mean_square_delta_x_t = (
579
            self.decay_rate * mean_square_delta_x_tm1 +
580
            (1 - self.decay_rate) * tensor.sqr(delta_x_t)
581
        )
582
583
        step = delta_x_t
584
        updates = [(mean_square_step_tm1, mean_square_step_t),
585
                   (mean_square_delta_x_tm1, mean_square_delta_x_t)]
586
        return step, updates
587
588
589
class BasicRMSProp(StepRule):
590
    """Scales the step size by a running average of the recent step norms.
591
592
    Parameters
593
    ----------
594
    decay_rate : float, optional
595
        How fast the running average decays, value in [0, 1]
596
        (lower is faster).  Defaults to 0.9.
597
    max_scaling : float, optional
598
        Maximum scaling of the step size, in case the running average is
599
        really small. Needs to be greater than 0. Defaults to 1e5.
600
601
    Notes
602
    -----
603
    This step rule is intended to be used in conjunction with another
604
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
605
    experience, look at :class:`RMSProp`.
606
607
    In general, this step rule should be used _before_ other step rules,
608
    because it has normalization properties that may undo their work.
609
    For instance, it should be applied first when used in conjunction
610
    with :class:`Scale`.
611
612
    For more information, see [Hint2014]_.
613
614
    """
615
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
616
        if not 0.0 <= decay_rate <= 1.0:
617
            raise ValueError("decay rate needs to be in [0, 1]")
618
        if max_scaling <= 0:
619
            raise ValueError("max. scaling needs to be greater than 0")
620
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
621
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
622
        self.epsilon = 1. / max_scaling
623
624
    def compute_step(self, parameter, previous_step):
625
        mean_square_step_tm1 = _create_algorithm_buffer_for(
626
            parameter, "mean_square_step_tm1")
627
        mean_square_step_t = (
628
            self.decay_rate * mean_square_step_tm1 +
629
            (1 - self.decay_rate) * tensor.sqr(previous_step))
630
        rms_step_t = tensor.maximum(
631
            tensor.sqrt(mean_square_step_t), self.epsilon)
632
        step = previous_step / rms_step_t
633
        updates = [(mean_square_step_tm1, mean_square_step_t)]
634
        return step, updates
635
636
637
class RMSProp(CompositeRule):
638
    """Scales the step size by a running average of the recent step norms.
639
640
    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
641
    described in [Hint2014]_.
642
643
    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
644
       lecture 6a,
645
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
646
647
    Parameters
648
    ----------
649
    learning_rate : float, optional
650
        The learning rate by which the previous step scaled. Defaults to 1.
651
    decay_rate : float, optional
652
        How fast the running average decays (lower is faster).
653
        Defaults to 0.9.
654
    max_scaling : float, optional
655
        Maximum scaling of the step size, in case the running average is
656
        really small. Defaults to 1e5.
657
658
    Attributes
659
    ----------
660
    learning_rate : :class:`~tensor.SharedVariable`
661
        A variable for learning rate.
662
    decay_rate : :class:`~tensor.SharedVariable`
663
        A variable for decay rate.
664
665
    See Also
666
    --------
667
    :class:`SharedVariableModifier`
668
669
    """
670
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
671
        basic_rms_prop = BasicRMSProp(decay_rate=decay_rate,
672
                                      max_scaling=max_scaling)
673
        scale = Scale(learning_rate=learning_rate)
674
        self.learning_rate = scale.learning_rate
675
        self.decay_rate = basic_rms_prop.decay_rate
676
        self.components = [basic_rms_prop, scale]
677
678
679
class StepClipping(StepRule):
680
    """Rescales an entire step if its L2 norm exceeds a threshold.
681
682
    When the previous steps are the gradients, this step rule performs
683
    gradient clipping.
684
685
    Parameters
686
    ----------
687
    threshold : float, optional
688
        The maximum permitted L2 norm for the step. The step
689
        will be rescaled to be not higher than this quanity.
690
        If ``None``, no rescaling will be applied.
691
692
    Attributes
693
    ----------
694
    threshold : :class:`.tensor.TensorSharedVariable`
695
        The shared variable storing the clipping threshold used.
696
697
    """
698
    def __init__(self, threshold=None):
699
        if threshold is not None:
700
            threshold = shared_floatx(threshold, "threshold")
701
            add_role(threshold, ALGORITHM_HYPERPARAMETER)
702
        self.threshold = threshold
703
704
    def compute_steps(self, previous_steps):
705
        if self.threshold is None:
706
            steps = previous_steps
707
        else:
708
            norm = l2_norm(previous_steps.values())
709
            multiplier = tensor.switch(norm < self.threshold,
710
                                       1, self.threshold / norm)
711
            steps = OrderedDict(
712
                (parameter, step * multiplier)
713
                for parameter, step in previous_steps.items())
714
        return steps, []
715
716
717
class VariableClipping(StepRule):
718
    """Clip the maximum norm of individual variables along certain axes.
719
720
    This :class:`StepRule` can be used to implement L2 norm constraints on
721
    e.g. the weight vectors of individual hidden units, convolutional
722
    filters or entire weight tensors. Combine with :class:`Restrict`
723
    (and possibly :class:`CompositeRule`), to apply such constraints only
724
    to certain variables and/or apply different norm constraints to
725 View Code Duplication
    different variables.
726
727
    Parameters
728
    ----------
729
    threshold : float
730
        Maximum norm for a given (portion of a) tensor.
731
    axis : int or iterable, optional
732
        An integer single axis, or an iterable collection of integer
733
        axes over which to sum in order to calculate the L2 norm. If
734
        `None` (the default), the norm is computed over all elements
735
        of the tensor.
736
737
    Notes
738
    -----
739
    Because of the way the :class:`StepRule` API works, this particular
740
    rule implements norm clipping of the value *after* update in the
741
    following way: it computes ``parameter - previous_step``, scales it
742
    to have (possibly axes-wise) norm(s) of at most `threshold`,
743
    then subtracts *that* value from `parameter` to yield an 'equivalent
744
    step' that respects the desired norm constraints. This procedure
745
    implicitly assumes one is doing simple (stochastic) gradient descent,
746
    and so steps computed by this step rule may not make sense for use
747
    in other contexts.
748
749
    Investigations into max-norm regularization date from [Srebro2005]_.
750
    The first appearance of this technique as a regularization method
751
    for the weight vectors of individual hidden units in feed-forward
752
    neural networks may be [Hinton2012]_.
753
754
    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
755
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
756
       on Learning Theory (COLT)*, June 2005.
757
758
    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
759
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
760
       "Improving neural networks by preventing co-adaptation of
761
       feature detectors". arXiv:1207.0580.
762
763
    """
764
    def __init__(self, threshold, axis=None):
765
        axis = pack(axis) if axis is not None else ()
766
        self.axis = set(axis)
767
        self.threshold = shared_floatx(threshold, "threshold")
768
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
769
        if len(axis) != len(self.axis):
770
            raise ValueError("axis must be unique")
771
772
    def compute_step(self, parameter, previous_step):
773
        if any(ax >= previous_step.ndim for ax in self.axis):
774
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
775
                self.axis, parameter, previous_step.ndim))
776
        if len(self.axis) == 0:
777
            norms = l2_norm([parameter - previous_step])
778
        else:
779
            squares = tensor.sqr(parameter - previous_step)
780
            norms = tensor.sqrt(
781
                reduce(lambda t, a: t.sum(axis=a, keepdims=True),
782
                       sorted(self.axis), squares))
783
        # We want a step s* that is the same as scaling
784
        # (parameter - previous_step) by threshold / norm
785
        # when threshold < norm.
786
        shrinking_step = (parameter -
787
                          (self.threshold / norms) *
788
                          (parameter - previous_step))
789
        return tensor.switch(norms > self.threshold,
790
                             shrinking_step,
791
                             previous_step), ()
792
793
794
class AdaGrad(StepRule):
795
    """Implements the AdaGrad learning rule.
796
797
    Parameters
798
    ----------
799
    learning_rate : float, optional
800
        Step size.
801
        Default value is set to 0.0002.
802
    epsilon : float, optional
803
        Stabilizing constant for one over root of sum of squares.
804
        Defaults to 1e-6.
805
806
    Notes
807
    -----
808
    For more information, see [ADAGRAD]_.
809
810
    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
811
       *Adaptive subgradient methods for online learning and
812
       stochastic optimization*,
813
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
814
815
    """
816
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
817
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
818
        self.epsilon = shared_floatx(epsilon, "epsilon")
819
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
820
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
821
822
    def compute_step(self, parameter, previous_step):
823
        name = 'adagrad_sqs'
824
        if parameter.name:
825
            name += '_' + parameter.name
826
        ssq = _create_algorithm_buffer_for(parameter, name=name)
827
828
        ssq_t = (tensor.sqr(previous_step) + ssq)
829
        step = (self.learning_rate * previous_step /
830
                (tensor.sqrt(ssq_t) + self.epsilon))
831
832
        updates = [(ssq, ssq_t)]
833
834
        return step, updates
835
836
837
class Adam(StepRule):
838
    """Adam optimizer as described in [King2014]_.
839
840
    .. [King2014] Diederik Kingma, Jimmy Ba,
841
       *Adam: A Method for Stochastic Optimization*,
842
       http://arxiv.org/abs/1412.6980
843
844
    Parameters
845
    ----------
846
    learning_rate : float, optional
847
        Step size.
848
        Default value is set to 0.002.
849
    beta1 : float, optional
850
        Exponential decay rate for the first moment estimates.
851
        Default value is set to 0.1.
852
    beta2 : float, optional
853
        Exponential decay rate for the second moment estimates.
854
        Default value is set to 0.001.
855
    epsilon : float, optional
856
        Default value is set to 1e-8.
857
    decay_factor : float, optional
858
        Default value is set to 1 - 1e-8.
859
860
    """
861
    def __init__(self, learning_rate=0.002,
862
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
863
                 decay_factor=(1 - 1e-8)):
864
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
865
        self.beta1 = shared_floatx(beta1, "beta1")
866
        self.beta2 = shared_floatx(beta2, "beta2")
867
        self.epsilon = shared_floatx(epsilon, "epsilon")
868
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
869
        for param in [self.learning_rate, self.beta1, self.beta2, self.epsilon,
870
                      self.decay_factor]:
871
            add_role(param, ALGORITHM_HYPERPARAMETER)
872
873
    def compute_step(self, parameter, previous_step):
874
        mean = _create_algorithm_buffer_for(parameter, 'mean')
875
        variance = _create_algorithm_buffer_for(parameter, 'variance')
876
        time = shared_floatx(0., 'time')
877
        add_role(time, ALGORITHM_BUFFER)
878
879
        t1 = time + 1
880
        learning_rate = (self.learning_rate *
881
                         tensor.sqrt((1. - (1. - self.beta2)**t1)) /
882
                         (1. - (1. - self.beta1)**t1))
883
        beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1)
884
        mean_t = beta_1t * previous_step + (1. - beta_1t) * mean
885
        variance_t = (self.beta2 * tensor.sqr(previous_step) +
886
                      (1. - self.beta2) * variance)
887
        step = (learning_rate * mean_t /
888
                (tensor.sqrt(variance_t) + self.epsilon))
889
890
        updates = [(mean, mean_t),
891
                   (variance, variance_t),
892
                   (time, t1)]
893
894
        return step, updates
895
896
897
class RemoveNotFinite(StepRule):
898
    """A step rule that skips steps with non-finite elements.
899
900
    Replaces a step (the parameter update of a single shared variable)
901
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
902
    step rescaling the parameters.
903
904
    Parameters
905
    ----------
906
    scaler : float, optional
907
        The scaling applied to the parameter in case the step contains
908
        non-finite elements. Defaults to 1, which means that parameters
909
        will not be changed.
910
911
    Notes
912
    -----
913
    This rule should be applied last!
914
915
    This trick was originally used in the GroundHog_ framework.
916
917
    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog
918
919
    """
920
    def __init__(self, scaler=1):
921
        self.scaler = scaler
922
923
    def compute_step(self, parameter, previous_step):
924
        step_sum = tensor.sum(previous_step)
925
        not_finite = (tensor.isnan(step_sum) +
926
                      tensor.isinf(step_sum))
927
        step = tensor.switch(
928
            not_finite > 0, (1 - self.scaler) * parameter, previous_step)
929
        return step, []
930
931
932
class Restrict(StepRule):
933
    """Applies a given :class:`StepRule` only to certain variables.
934
935
    Example applications include clipping steps on only certain parameters,
936
    or scaling a certain kind of parameter's updates (e.g. adding an
937
    additional scalar multiplier to the steps taken on convolutional
938
    filters).
939
940
    Parameters
941
    ----------
942
    step_rule : :class:`StepRule`
943
        The :class:`StepRule` to be applied on the given variables.
944
    variables : iterable
945
        A collection of Theano variables on which to apply `step_rule`.
946
        Variables not appearing in this collection will not have
947
        `step_rule` applied to them.
948
949
    """
950
    def __init__(self, step_rule, variables):
951
        self.step_rule = step_rule
952
        self.variables = frozenset(variables)
953
954
    def compute_steps(self, previous_steps):
955
        filtered_previous_steps = dict_subset(previous_steps, self.variables)
956
        steps, updates = self.step_rule.compute_steps(filtered_previous_steps)
957
        actual = OrderedDict((parameter, steps[parameter])
958
                             if parameter in steps
959
                             else (parameter, previous_steps[parameter])
960
                             for parameter in previous_steps)
961
        return actual, updates
962