Completed
Pull Request — master (#1079)
by David
05:09
created

GradientDescent.inputs()   A

Complexity

Conditions 1

Size

Total Lines 11

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 11
rs 9.4285
1
"""Training algorithms."""
0 ignored issues
show
Bug introduced by
There seems to be a cyclic import (blocks.bricks.base -> blocks.graph -> blocks.graph.bn -> blocks.filter).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
Bug introduced by
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).

Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.

Loading history...
2
import logging
3
import itertools
4
from abc import ABCMeta, abstractmethod
5
from collections import OrderedDict
6
from six.moves import reduce
7
8
from picklable_itertools.extras import equizip
9
10
import theano
11
from six import add_metaclass
12
from theano import tensor
13
14
from blocks.graph import ComputationGraph
15
from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER
16
from blocks.theano_expressions import l2_norm
17
from blocks.utils import (dict_subset, pack, shared_floatx,
18
                          shared_floatx_zeros_matching)
19
20
logger = logging.getLogger(__name__)
21
22
23
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Create a shared buffer matching `param` for use by a step rule.

    The buffer is built by :func:`shared_floatx_zeros_matching`, tagged
    with the parameter it belongs to and given the ``ALGORITHM_BUFFER``
    role.

    Parameters
    ----------
    param : :class:`~tensor.TensorSharedVariable`
        The parameter the buffer is created for.
    \*args
        Forwarded unchanged to :func:`shared_floatx_zeros_matching`.
    \*\*kwargs
        Forwarded unchanged to :func:`shared_floatx_zeros_matching`.

    Returns
    -------
    The newly created shared variable.

    """
    algorithm_buffer = shared_floatx_zeros_matching(param, *args, **kwargs)
    # Remember which parameter this buffer serves.
    algorithm_buffer.tag.for_parameter = param
    add_role(algorithm_buffer, ALGORITHM_BUFFER)
    return algorithm_buffer
28
29
30
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Abstract interface shared by all training algorithms.

    The life-cycle of a training algorithm is simple: :meth:`initialize`
    is called once (this is, for instance, where Theano functions can be
    compiled), and afterwards :meth:`process_batch` is called repeatedly,
    each time with one batch of training data.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
57
58
59
# Message templates used when the sources of a batch do not line up with
# the names of the Theano input variables. Both start with a blank line
# so that they read well when appended to a short error summary.
variable_mismatch_error = (
    "\n\n"
    "Blocks tried to match the sources ({sources}) of the training "
    "dataset to "
    "the names of the Theano variables ({variables}), but failed to "
    "do so. "
    "If you want to train on a subset of the sources that your dataset "
    "provides, "
    "pass the `sources` keyword argument to its constructor. Or pass "
    "on_unused_sources='warn' or on_unused_sources='ignore' to "
    "the GradientDescent algorithm."
)

source_missing_error = (
    "\n\n"
    "Blocks didn't find all the sources ({sources}) of the training "
    "dataset "
    "that match the names of the Theano variables ({variables})."
)
72
73
74
class GradientDescent(TrainingAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block::  python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients`.
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps.  Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : dict or list of pairs, optional
        A mapping from a parameter to an expression for the cost's
        gradient with respect to the parameter. If ``None``, the
        gradients are taken automatically using
        :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument.  A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources are used.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Raises
    ------
    ValueError
        If gradients have to be inferred but `cost` or `parameters` is
        missing, or if `known_grads` / `consider_constant` is given
        together with explicit `gradients`.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    .. todo::

       Some shared variables are not parameters (e.g. those created by
       random streams).

    .. todo::

       Due to a rather premature status of the :class:`ComputationGraph`
       class the parameter used only inside scans are not fetched
       currently.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 on_unused_sources='raise', theano_func_kwargs=None, **kwargs):
        super(GradientDescent, self).__init__(**kwargs)
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self._updates = []
        self.parameters = parameters
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            if self.cost is None:
                raise ValueError("can't infer gradients; no cost specified")
            if not self.parameters:
                raise ValueError("can't infer gradients; "
                                 "no parameters specified")
            # `inputs` is a read-only property backed by this graph, so
            # the graph (not its inputs) is what has to be stored.
            self._cost_computation_graph = ComputationGraph(cost)
            logger.info("Taking the cost gradient")
            # An OrderedDict keeps the gradients in the order in which the
            # parameters were given, which is needed for reproducibility.
            self.gradients = OrderedDict(
                equizip(self.parameters, tensor.grad(
                    self.cost, self.parameters,
                    known_grads=known_grads,
                    consider_constant=consider_constant)))
            logger.info("The cost gradient computation graph is built")
        else:
            # If we have gradients, we get parameters from that.
            # If you're specifying both then something is screwy.
            if self.parameters is not None:
                logger.warning('%s received both gradients and parameters '
                               'arguments; using parameters deduced from '
                               'gradients', self.__class__.__name__)
            # Coerce to a mapping so that both dicts and lists of
            # (parameter, gradient) pairs are supported; an OrderedDict
            # preserves whatever order the caller supplied.
            self.gradients = OrderedDict(gradients)
            self.parameters = list(self.gradients.keys())
            self._cost_computation_graph = ComputationGraph(
                list(self.gradients.values()))
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")
        self.step_rule = step_rule if step_rule else Scale()

        self.total_gradient_norm = l2_norm(
            self.gradients.values()).copy(name="total_gradient_norm")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))
        self.total_step_norm = l2_norm(
            self.steps.values()).copy(name="total_step_norm")
        self.on_unused_sources = on_unused_sources
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())

    def initialize(self):
        """Assemble all updates and compile the training function."""
        logger.info("Initializing the training algorithm")
        # Note: the gradients are computed in the same order in which
        # the parameters were given. Keep it like that to ensure
        # reproducibility.
        for parameter in self.parameters:
            self.updates.append((parameter, parameter - self.steps[parameter]))
        self.updates.extend(self.step_rule_updates)
        super(GradientDescent, self).initialize()
        # `process_batch` calls `self._function`, so it has to be compiled
        # here; it has no outputs and only applies the updates.
        self._function = theano.function(
            self.inputs, [], updates=self.updates,
            **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    def _validate_source_names(self, batch):
        """Check that the batch sources match the input variable names.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        Raises
        ------
        ValueError
            If a required source is missing from the batch, if the batch
            has unused sources and `on_unused_sources` is ``'raise'``, or
            if `on_unused_sources` has an unsupported value.

        """
        in_names = [v.name for v in self.inputs]
        variable_names = set(in_names)
        source_names = set(batch.keys())

        if not variable_names.issubset(source_names):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if source_names.issubset(variable_names):
            return

        # The batch provides sources that no input variable consumes.
        if self.on_unused_sources == 'ignore':
            pass
        elif self.on_unused_sources == 'warn':
            # Warn only once per algorithm instance.
            if not hasattr(self, '_unused_source_warned'):
                logger.warning(variable_mismatch_error.format(
                    sources=batch.keys(),
                    variables=in_names))
            self._unused_source_warned = True
        elif self.on_unused_sources == 'raise':
            raise ValueError(
                "mismatch of variable names and data sources" +
                variable_mismatch_error.format(
                    sources=batch.keys(),
                    variables=in_names))
        else:
            raise ValueError("Wrong value of on_unused_sources: {}."
                             .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch and run one step of the compiled function.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
        self._validate_source_names(batch)
        # Feed the data in the order of the function's input variables.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)

    @property
    def inputs(self):
        """Return inputs of the cost computation graph.

        Returns
        -------
        inputs : list of :class:`~tensor.TensorVariable`
            Inputs to this graph.

        """
        return self._cost_computation_graph.inputs

    @property
    def updates(self):
        """The list of updates applied for every batch."""
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an
            :class:`~collections.OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            raise ValueError("updates must be a list of tuples or an "
                             "OrderedDict, got {}".format(type(updates)))
        self.updates.extend(updates)
286
287
288
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a single parameter.

        The default implementation of :meth:`compute_steps` calls this
        method once per parameter, so that subclasses working
        parameter-wise do not have to write the loop themselves.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build Theano expressions for the steps of all parameters.

        Override this method if the steps have to be processed jointly
        for all parameters rather than one parameter at a time.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # One (step, updates) pair per parameter, in iteration order.
        per_parameter = [self.compute_step(parameter, previous_step)
                         for parameter, previous_step
                         in previous_steps.items()]
        # Transpose the list of pairs into all steps and all update lists.
        step_expressions, update_lists = equizip(*per_parameter)
        steps = OrderedDict(
            equizip(previous_steps.keys(), step_expressions))
        updates = list(itertools.chain.from_iterable(update_lists))
        return steps, updates
352
353
354
class CompositeRule(StepRule):
    """Chains several step rules.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        """Apply every component in turn, feeding each the previous output.

        Returns the steps produced by the last component together with
        the updates of all components, collected in application order.

        """
        current_steps = previous_steps
        collected_updates = []
        for component in self.components:
            current_steps, component_updates = component.compute_steps(
                current_steps)
            collected_updates.extend(component_updates)
        return current_steps, collected_updates
374
375
376
class Scale(StepRule):
    """A step in the direction proportional to the previous step.

    Used on its own in :class:`GradientDescent`, this step rule
    implements steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        rate = shared_floatx(learning_rate, "learning_rate")
        self.learning_rate = rate
        add_role(rate, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Return the previous step scaled by the learning rate.

        This rule is stateless, so the list of updates is empty.

        """
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
400
401
402
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        coefficient = shared_floatx(momentum, "momentum")
        self.momentum = coefficient
        add_role(coefficient, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Add the discounted velocity to the previous step.

        The resulting step is also stored back into the velocity buffer
        through the returned update.

        """
        velocity = _create_algorithm_buffer_for(parameter, "velocity")
        new_velocity = self.momentum * velocity + previous_step
        return new_velocity, [(velocity, new_velocity)]
426
427
428
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scaling_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # The step is scaled first and then accumulated into the velocity.
        self.components = [scaling_rule, momentum_rule]
        # Expose the hyperparameters of the components on the composite.
        self.learning_rate = scaling_rule.learning_rate
        self.momentum = momentum_rule.momentum
459
460
461
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Raises
    ------
    ValueError
        If `decay_rate` lies outside of [0, 1].

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if decay_rate < 0.0 or decay_rate > 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Rescale the previous step by a ratio of two running RMS values.

        Two buffers are kept per parameter: a decaying average of the
        squared incoming steps and a decaying average of the squared
        outgoing steps. Both are refreshed through the returned updates.

        """
        step_sq_avg_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        delta_sq_avg_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Decaying average of the squared incoming steps.
        step_sq_avg = (
            self.decay_rate * step_sq_avg_prev +
            (1 - self.decay_rate) * tensor.sqr(previous_step)
        )

        # The numerator uses the *old* average of the outgoing steps,
        # the denominator the freshly updated average of incoming ones.
        rms_delta_prev = tensor.sqrt(delta_sq_avg_prev + self.epsilon)
        rms_step = tensor.sqrt(step_sq_avg + self.epsilon)
        delta = rms_delta_prev / rms_step * previous_step

        # Decaying average of the squared outgoing steps.
        delta_sq_avg = (
            self.decay_rate * delta_sq_avg_prev +
            (1 - self.decay_rate) * tensor.sqr(delta)
        )

        buffer_updates = [(step_sq_avg_prev, step_sq_avg),
                          (delta_sq_avg_prev, delta_sq_avg)]
        return delta, buffer_updates
511
512
513
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Raises
    ------
    ValueError
        If `decay_rate` lies outside of [0, 1] or `max_scaling` is not
        greater than 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if decay_rate < 0.0 or decay_rate > 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if not max_scaling > 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Lower bound for the RMS, which caps the scaling at `max_scaling`.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        """Divide the previous step by a running RMS of recent steps.

        The RMS is bounded from below by ``1 / max_scaling``; the running
        average of squared steps is refreshed through the returned update.

        """
        sq_avg_prev = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        sq_avg = (
            self.decay_rate * sq_avg_prev +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        rms = tensor.maximum(tensor.sqrt(sq_avg), self.epsilon)
        normalized_step = previous_step / rms
        return normalized_step, [(sq_avg_prev, sq_avg)]
559
560
561
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalizing_rule = BasicRMSProp(decay_rate=decay_rate,
                                        max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # Normalization goes first, the learning rate is applied after it.
        self.components = [normalizing_rule, scaling_rule]
        # Expose the hyperparameters of the components on the composite.
        self.decay_rate = normalizing_rule.decay_rate
        self.learning_rate = scaling_rule.learning_rate
601
602
603
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None`` (or any other falsy value), no rescaling will be
        applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used. The
        attribute only exists when a threshold was given.

    """
    def __init__(self, threshold=None):
        if threshold:
            self.threshold = shared_floatx(threshold, "threshold")
            add_role(self.threshold, ALGORITHM_HYPERPARAMETER)

    def compute_steps(self, previous_steps):
        """Rescale all steps jointly so their total L2 norm is bounded.

        Parameters
        ----------
        previous_steps : OrderedDict
            The steps proposed so far, keyed by parameter.

        Returns
        -------
        steps : OrderedDict
            The (possibly rescaled) steps; `previous_steps` itself when
            no threshold is set.
        updates : list
            Always empty, this rule keeps no state.

        """
        if not hasattr(self, 'threshold'):
            # Callers always unpack a (steps, updates) pair, so the
            # pass-through case must return one as well.
            return previous_steps, []
        norm = l2_norm(previous_steps.values())
        # Leave the steps alone while the norm is below the threshold,
        # otherwise shrink them so the norm equals the threshold.
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []
637
638
639
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Raises
    ------
    ValueError
        If the same axis is given more than once.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        if axis is None:
            axis = ()
        else:
            axis = pack(axis)
        self.axis = set(axis)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # The set is smaller than the input iff some axis was repeated.
        if len(self.axis) != len(axis):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        """Return a step after which the parameter norm is bounded.

        Raises
        ------
        ValueError
            If one of the configured axes is not smaller than the number
            of dimensions of `previous_step`.

        """
        for ax in self.axis:
            if ax >= previous_step.ndim:
                raise ValueError("Invalid axis {} for {}, ndim={}".format(
                    self.axis, parameter, previous_step.ndim))
        # The value the parameter would take after an unclipped update.
        updated_value = parameter - previous_step
        if not self.axis:
            norms = l2_norm([updated_value])
        else:
            # Sum the squares over each requested axis in turn, keeping
            # the dimensions so the norms broadcast against the tensor.
            sum_of_squares = tensor.sqr(updated_value)
            for ax in sorted(self.axis):
                sum_of_squares = sum_of_squares.sum(axis=ax, keepdims=True)
            norms = tensor.sqrt(sum_of_squares)
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = (parameter -
                          (self.threshold / norms) * updated_value)
        clipped_step = tensor.switch(norms > self.threshold,
                                     shrinking_step,
                                     previous_step)
        return clipped_step, ()
714
715
716
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        # Both hyperparameters live in shared variables so they can be
        # located through their ALGORITHM_HYPERPARAMETER role.
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Scale `previous_step` by the root of its accumulated squares.

        Returns the scaled step together with the update that adds the
        current squared step to the per-parameter accumulator.

        """
        # The accumulator is named after the parameter when it has a name.
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
757
758
759
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Constant added to the root of the second moment estimate in
        the denominator of the step.
        Default value is set to 1e-8.
    decay_factor : float, optional
        Factor raised to the number of elapsed steps when computing
        the first moment mixing weight.
        Default value is set to 1 - 1e-8.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        hyperparameters = (
            ("learning_rate", learning_rate),
            ("beta1", beta1),
            ("beta2", beta2),
            ("epsilon", epsilon),
            ("decay_factor", decay_factor),
        )
        # Each value becomes a shared variable named after its attribute
        # and tagged as an algorithm hyperparameter.
        for attribute, value in hyperparameters:
            shared = shared_floatx(value, attribute)
            add_role(shared, ALGORITHM_HYPERPARAMETER)
            setattr(self, attribute, shared)

    def compute_step(self, parameter, previous_step):
        """Return the Adam step for `parameter` and its buffer updates.

        Three buffers are created per call: running first and second
        moment estimates shaped after `parameter`, and a scalar step
        counter starting at zero.

        """
        mean = _create_algorithm_buffer_for(parameter, 'mean')
        variance = _create_algorithm_buffer_for(parameter, 'variance')
        time = shared_floatx(0., 'time')
        add_role(time, ALGORITHM_BUFFER)

        t1 = time + 1

        # Bias-corrected step size.
        second_correction = tensor.sqrt((1. - (1. - self.beta2)**t1))
        first_correction = (1. - (1. - self.beta1)**t1)
        learning_rate = (self.learning_rate * second_correction /
                         first_correction)

        # Mixing weight of the new step in the first moment estimate.
        beta_1t = 1 - (1 - self.beta1) * self.decay_factor ** (t1 - 1)

        mean_t = beta_1t * previous_step + (1. - beta_1t) * mean
        variance_t = (self.beta2 * tensor.sqr(previous_step) +
                      (1. - self.beta2) * variance)

        denominator = tensor.sqrt(variance_t) + self.epsilon
        step = learning_rate * mean_t / denominator

        return step, [(mean, mean_t),
                      (variance, variance_t),
                      (time, t1)]
817
818
819
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    When the sum of a step (the parameter update of a single shared
    variable) is ``NaN`` or ``inf``, the step is replaced with one that
    rescales the parameter instead.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Return `previous_step`, or a rescaling step if it is not finite.

        The second element of the returned pair is always an empty list.

        """
        # A single NaN or inf element makes the total non-finite, so the
        # scalar sum is enough to detect a bad step.
        total = tensor.sum(previous_step)
        not_finite = tensor.isnan(total) + tensor.isinf(total)
        # Subtracting this step leaves ``scaler * parameter``.
        rescaling_step = (1 - self.scaler) * parameter
        return tensor.switch(not_finite > 0,
                             rescaling_step,
                             previous_step), []
852
853
854
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Typical uses are clipping the steps of only some parameters, or
    scaling the updates of one kind of parameter (e.g. an additional
    scalar multiplier on the steps taken on convolutional filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Run the wrapped rule on the selected variables only.

        Returns a mapping with the same keys, in the same order, as
        `previous_steps`, together with the updates produced by the
        wrapped rule. Entries the wrapped rule returned a step for carry
        that step; all others keep their previous step.

        """
        selected = dict_subset(previous_steps, self.variables)
        steps, updates = self.step_rule.compute_steps(selected)
        actual = OrderedDict()
        for parameter in previous_steps:
            if parameter in steps:
                actual[parameter] = steps[parameter]
            else:
                actual[parameter] = previous_steps[parameter]
        return actual, updates
884