Code Duplication    Length = 41-46 lines in 2 locations

blocks/algorithms/__init__.py 2 locations

@@ 522-567 (lines=46) @@
    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for the learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for the momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale = Scale(learning_rate=learning_rate)
        basic_momentum = BasicMomentum(momentum=momentum)
        self.learning_rate = scale.learning_rate
        self.momentum = basic_momentum.momentum
        self.components = [scale, basic_momentum]


class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
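
The hunk above is mostly docstring: Momentum only wires a Scale and a BasicMomentum together through self.components, and AdaDelta.__init__ does nothing beyond validating its hyperparameters. A minimal usage sketch follows; the GradientDescent keyword names (cost, parameters, step_rule) are an assumption about the surrounding Blocks API and do not appear in this excerpt.

    import numpy
    import theano
    from theano import tensor
    from blocks.algorithms import GradientDescent, Momentum, AdaDelta

    # Tiny quadratic cost so the sketch is self-contained.
    x = tensor.vector('x')
    w = theano.shared(numpy.zeros(3, dtype=theano.config.floatX), name='w')
    cost = ((x - w) ** 2).sum()

    # Classic momentum SGD: per the docstring, Momentum is just Scale
    # (the learning rate) composed with BasicMomentum (the velocity term).
    momentum_sgd = GradientDescent(cost=cost, parameters=[w],
                                   step_rule=Momentum(learning_rate=0.01,
                                                      momentum=0.9))

    # AdaDelta adapts the step size itself, so no learning rate is given;
    # a decay_rate outside [0, 1] raises the ValueError shown above.
    adadelta = GradientDescent(cost=cost, parameters=[w],
                               step_rule=AdaDelta(decay_rate=0.95,
                                                  epsilon=1e-6))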
@@ 725-765 (lines=41) @@
            multiplier = tensor.switch(norm < self.threshold,
                                       1, self.threshold / norm)
            steps = OrderedDict(
                (parameter, step * multiplier)
                for parameter, step in previous_steps.items())
        return steps, []


class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`) to apply such constraints only
    to certain variables and/or to apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        A single integer axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference