Code Duplication    Length = 41-46 lines in 2 locations

blocks/algorithms/__init__.py 2 locations

@@ 522-567 (lines=46) @@
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale = Scale(learning_rate=learning_rate)
        basic_momentum = BasicMomentum(momentum=momentum)
        self.learning_rate = scale.learning_rate
        self.momentum = basic_momentum.momentum
        self.components = [scale, basic_momentum]


class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        mean_square_step_tm1 = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
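
The AdaDelta docstring in this clone only points to the [ADADELTA] paper for the actual update rule, and the quoted compute_step is cut off by the hunk boundary. For orientation, here is a minimal NumPy sketch of the update that reference describes, assuming steps are subtracted from the parameter as in plain gradient descent; the name `adadelta_step` and the explicit buffer arguments are hypothetical and do not reproduce the blocks implementation.

    import numpy as np

    def adadelta_step(gradient, mean_square_grad, mean_square_dx,
                      decay_rate=0.95, epsilon=1e-6):
        # Decaying average of squared gradients (E[g^2]).
        mean_square_grad = (decay_rate * mean_square_grad
                            + (1 - decay_rate) * gradient ** 2)
        # Scale the gradient by the ratio of RMS values; only first-order
        # information (gradients and past steps) is used.
        rms_dx = np.sqrt(mean_square_dx + epsilon)
        rms_grad = np.sqrt(mean_square_grad + epsilon)
        step = (rms_dx / rms_grad) * gradient
        # Decaying average of squared steps (E[dx^2]) for the next call.
        mean_square_dx = (decay_rate * mean_square_dx
                          + (1 - decay_rate) * step ** 2)
        # The caller would update the parameter as: parameter -= step.
        return step, mean_square_grad, mean_square_dx
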
@@ 725-765 (lines=41) @@
    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        axis = pack(axis) if axis is not None else ()
        self.axis = set(axis)
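
The Notes section of this second clone describes the clipping procedure only in prose. A minimal NumPy sketch of that procedure, under the same assumption of a plain SGD step, is given below; the helper name `clipped_step` and the small stabilizing constant are hypothetical additions for illustration and are not part of blocks.

    import numpy as np

    def clipped_step(parameter, previous_step, threshold, axis=None):
        # Value the parameter would take after a plain SGD update.
        updated = parameter - previous_step
        # L2 norm over the requested axes (all elements when axis is None).
        norm = np.sqrt(np.sum(updated ** 2, axis=axis, keepdims=True))
        # Rescale only where the norm exceeds the threshold; the tiny
        # constant guards against division by zero in this sketch.
        scale = np.minimum(1.0, threshold / (norm + 1e-7))
        clipped = updated * scale
        # Return the 'equivalent step' so that parameter - step equals
        # the norm-constrained updated value.
        return parameter - clipped
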