Code Duplication    Length = 41-46 lines in 2 locations

blocks/algorithms/__init__.py 2 locations

@@ 522-567 (lines=46) @@
519
    learning_rate : :class:`~tensor.SharedVariable`
520
        A variable for learning rate.
521
    momentum : :class:`~tensor.SharedVariable`
522
        A variable for momentum.
523
524
    See Also
525
    --------
526
    :class:`SharedVariableModifier`
527
528
    """
529
    def __init__(self, learning_rate=1.0, momentum=0.):
530
        # Compose this rule from two sub-rules: a Scale built from
        # `learning_rate` and a BasicMomentum built from `momentum`.
        scale = Scale(learning_rate=learning_rate)
531
        basic_momentum = BasicMomentum(momentum=momentum)
532
        # Expose the sub-rules' own attributes rather than the raw
        # arguments, so `self.learning_rate` / `self.momentum` are the very
        # same objects the sub-rules hold (the class docstring describes
        # them as shared variables).
        self.learning_rate = scale.learning_rate
533
        self.momentum = basic_momentum.momentum
534
        # The list order is scale first, then momentum.
        # NOTE(review): presumably the base class applies `components` in
        # list order -- the base class is not visible here; confirm.
        self.components = [scale, basic_momentum]
535
536
537
class AdaDelta(StepRule):
538
    """Adapts the step size over time using only first order information.
539
540
    Parameters
541
    ----------
542
    decay_rate : float, optional
543
        Decay rate in [0, 1]. Defaults to 0.95.
544
    epsilon : float, optional
545
        Stabilizing constant for RMS. Defaults to 1e-6.
546
547
    Notes
548
    -----
549
    For more information, see [ADADELTA]_.
550
551
    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
552
       Rate Method*, arXiv:1212.5701.
553
554
    """
555
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
556
        # Reject a decay rate outside the closed interval [0, 1] before any
        # attribute is created. `epsilon` is not range-checked here.
        if not 0.0 <= decay_rate <= 1.0:
557
            raise ValueError("decay rate needs to be in [0, 1]")
558
        # Each hyperparameter is wrapped with shared_floatx under an
        # explicit name and then tagged with the ALGORITHM_HYPERPARAMETER
        # role.
        # NOTE(review): presumably shared_floatx returns a Theano shared
        # variable in floatX precision -- helper not visible here; confirm.
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
559
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
560
        self.epsilon = shared_floatx(epsilon, "epsilon")
561
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)
562
563
    def compute_step(self, parameter, previous_step):
564
        mean_square_step_tm1 = _create_algorithm_buffer_for(
565
            parameter, "mean_square_step_tm1")
566
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
567
            parameter, "mean_square_delta_x_tm1")
568
569
        mean_square_step_t = (
570
            self.decay_rate * mean_square_step_tm1 +
@@ 725-765 (lines=41) @@
722
    filters or entire weight tensors. Combine with :class:`Restrict`
723
    (and possibly :class:`CompositeRule`), to apply such constraints only
724
    to certain variables and/or apply different norm constraints to
725
    different variables.
726
727
    Parameters
728
    ----------
729
    threshold : float
730
        Maximum norm for a given (portion of a) tensor.
731
    axis : int or iterable, optional
732
        An integer single axis, or an iterable collection of integer
733
        axes over which to sum in order to calculate the L2 norm. If
734
        `None` (the default), the norm is computed over all elements
735
        of the tensor.
736
737
    Notes
738
    -----
739
    Because of the way the :class:`StepRule` API works, this particular
740
    rule implements norm clipping of the value *after* update in the
741
    following way: it computes ``parameter - previous_step``, scales it
742
    to have (possibly axes-wise) norm(s) of at most `threshold`,
743
    then subtracts *that* value from `parameter` to yield an 'equivalent
744
    step' that respects the desired norm constraints. This procedure
745
    implicitly assumes one is doing simple (stochastic) gradient descent,
746
    and so steps computed by this step rule may not make sense for use
747
    in other contexts.
748
749
    Investigations into max-norm regularization date from [Srebro2005]_.
750
    The first appearance of this technique as a regularization method
751
    for the weight vectors of individual hidden units in feed-forward
752
    neural networks may be [Hinton2012]_.
753
754
    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
755
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
756
       on Learning Theory (COLT)*, June 2005.
757
758
    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
759
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
760
       "Improving neural networks by preventing co-adaptation of
761
       feature detectors". arXiv:1207.0580.
762
763
    """
764
    def __init__(self, threshold, axis=None):
765
        axis = pack(axis) if axis is not None else ()
766
        self.axis = set(axis)
767
        self.threshold = shared_floatx(threshold, "threshold")
768
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)