Code Duplication    Length = 41-46 lines in 2 locations

blocks/algorithms/__init__.py: 2 locations

@@ 522-567 (lines=46) @@
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster).  Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        mean_square_step_tm1 = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        mean_square_step_t = (
            self.decay_rate * mean_square_step_tm1 +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        rms_step_t = tensor.maximum(
            tensor.sqrt(mean_square_step_t), self.epsilon)
        step = previous_step / rms_step_t
        updates = [(mean_square_step_tm1, mean_square_step_t)]
        return step, updates


class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.
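
The first hunk is cut off inside RMSProp's docstring. For context, the composition that docstring describes would look roughly like the sketch below; this is not quoted from the report, and the learning_rate default and the learning_rate/components attributes are assumptions about how a Blocks CompositeRule is typically wired up.

# Sketch only, not the file's actual code: BasicRMSProp followed by Scale,
# as the RMSProp docstring above describes.
class RMSProp(CompositeRule):
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        basic_rms_prop = BasicRMSProp(decay_rate=decay_rate,
                                      max_scaling=max_scaling)
        scale = Scale(learning_rate=learning_rate)
        # Expose the scaling factor so training code can anneal it.
        self.learning_rate = scale.learning_rate
        self.components = [basic_rms_prop, scale]
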
@@ 725-765 (lines=41) @@
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
        stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates


class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
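
Both flagged regions share the same compute_step shape: allocate a per-parameter accumulator with _create_algorithm_buffer_for, fold the squared previous_step into it, and rescale the step by a function of that accumulator. That overlap is what this report measures. Below is a minimal sketch of one way the common part could be factored out; the helper name _accumulate_squared_step is hypothetical, and the code assumes the module's existing imports (in particular theano.tensor imported as tensor).

# Hypothetical helper, not part of blocks/algorithms/__init__.py.
def _accumulate_squared_step(parameter, previous_step, name, decay_rate=None):
    """Fold the squared step into a per-parameter accumulator.

    With decay_rate=None this is AdaGrad's running sum of squares;
    with a decay_rate it is BasicRMSProp's exponential moving average.
    """
    acc_tm1 = _create_algorithm_buffer_for(parameter, name)
    if decay_rate is None:
        acc_t = acc_tm1 + tensor.sqr(previous_step)
    else:
        acc_t = (decay_rate * acc_tm1 +
                 (1 - decay_rate) * tensor.sqr(previous_step))
    return acc_tm1, acc_t

With such a helper, each compute_step keeps only its rule-specific rescaling; for instance, BasicRMSProp's body could shrink to roughly:

    def compute_step(self, parameter, previous_step):
        ms_tm1, ms_t = _accumulate_squared_step(
            parameter, previous_step, "mean_square_step_tm1",
            decay_rate=self.decay_rate)
        step = previous_step / tensor.maximum(tensor.sqrt(ms_t), self.epsilon)
        return step, [(ms_tm1, ms_t)]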