Code Duplication    Length = 41-46 lines in 2 locations

blocks/algorithms/__init__.py 2 locations

@@ 522-567 (lines=46) @@
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale = Scale(learning_rate=learning_rate)
        basic_momentum = BasicMomentum(momentum=momentum)
        self.learning_rate = scale.learning_rate
        self.momentum = basic_momentum.momentum
        self.components = [scale, basic_momentum]


class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
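For context, step rules like the Momentum and AdaDelta classes quoted in this first location are normally composed and handed to a GradientDescent training algorithm. A minimal usage sketch follows; the toy cost, the shared variable W, and the exact keyword names are assumptions based on the Blocks/Theano API as I read it and may differ between versions:

import numpy
import theano
from theano import tensor
from blocks.algorithms import (GradientDescent, CompositeRule,
                               StepClipping, Momentum)

# Toy quadratic cost over a single shared parameter (illustration only).
x = tensor.vector('x')
W = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='W')
cost = ((x - W) ** 2).sum()

# Compose step rules: clip the step norm first, then apply momentum.
step_rule = CompositeRule([StepClipping(threshold=1.0),
                           Momentum(learning_rate=0.01, momentum=0.9)])
algorithm = GradientDescent(cost=cost, parameters=[W],
                            step_rule=step_rule)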
@@ 725-765 (lines=41) @@
            norm = l2_norm(previous_steps.values())
            multiplier = tensor.switch(norm < self.threshold,
                                       1, self.threshold / norm)
            steps = OrderedDict(
                (parameter, step * multiplier)
                for parameter, step in previous_steps.items())
        return steps, []


class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
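The 'clip the value after update' procedure described in the VariableClipping notes can be illustrated with a small self-contained sketch; the helper name clipped_step, the NumPy arrays, and the whole-tensor norm (the axis=None case) are illustrative assumptions, while the real rule operates on Theano expressions and supports per-axis norms:

import numpy

def clipped_step(parameter, previous_step, threshold):
    # Value the parameter would take after applying the raw step.
    updated = parameter - previous_step
    # L2 norm over all elements (the axis=None case described above).
    norm = numpy.sqrt((updated ** 2).sum())
    if norm > threshold:
        # Rescale the updated value so its norm equals the threshold.
        updated = updated * (threshold / norm)
    # Equivalent step: subtracting it from the parameter yields the
    # clipped value, so the norm constraint holds after the update.
    return parameter - updated

parameter = numpy.array([3.0, 4.0])
previous_step = numpy.array([-1.0, -2.0])
print(clipped_step(parameter, previous_step, threshold=2.0))

Applying the returned step leaves the parameter with L2 norm at most 2.0 in this example.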