@@ 522-567 (lines=46) @@ | ||
519 | learning_rate : :class:`~tensor.SharedVariable` |
|
520 | A variable for learning rate. |
|
521 | momentum : :class:`~tensor.SharedVariable` |
|
522 | A variable for momentum. |
|
523 | ||
524 | See Also |
|
525 | -------- |
|
526 | :class:`SharedVariableModifier` |
|
527 | ||
528 | """ |
|
def __init__(self, learning_rate=1.0, momentum=0.):
    """Assemble the rule from a scaling rule and a momentum rule.

    Parameters
    ----------
    learning_rate : float, optional
        Forwarded to :class:`Scale`. Defaults to 1.0.
    momentum : float, optional
        Forwarded to :class:`BasicMomentum`. Defaults to 0.

    """
    scaling_rule = Scale(learning_rate=learning_rate)
    momentum_rule = BasicMomentum(momentum=momentum)
    # Re-export the sub-rules' variables so they can be reached
    # directly on this object.
    self.learning_rate = scaling_rule.learning_rate
    self.momentum = momentum_rule.momentum
    # The scaling rule comes first in the component list, the momentum
    # rule second.
    self.components = [scaling_rule, momentum_rule]
|
535 | ||
536 | ||
537 | class AdaDelta(StepRule): |
|
538 | """Adapts the step size over time using only first order information. |
|
539 | ||
540 | Parameters |
|
541 | ---------- |
|
542 | decay_rate : float, optional |
|
543 | Decay rate in [0, 1]. Defaults to 0.95. |
|
544 | epsilon : float, optional |
|
545 | Stabilizing constant for RMS. Defaults to 1e-6. |
|
546 | ||
547 | Notes |
|
548 | ----- |
|
549 | For more information, see [ADADELTA]_. |
|
550 | ||
551 | .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning |
|
552 | Rate Method*, arXiv:1212.5701. |
|
553 | ||
554 | """ |
|
def __init__(self, decay_rate=0.95, epsilon=1e-6):
    """Check the decay rate and create the hyperparameter variables.

    Both values are wrapped with ``shared_floatx`` and tagged with the
    ``ALGORITHM_HYPERPARAMETER`` role.

    Raises
    ------
    ValueError
        If `decay_rate` is not within [0, 1].

    """
    # Keep the chained comparison: it also rejects NaN, which a
    # "< 0 or > 1" test would silently let through.
    decay_rate_is_valid = 0.0 <= decay_rate <= 1.0
    if not decay_rate_is_valid:
        raise ValueError("decay rate needs to be in [0, 1]")

    hyperparameters = (("decay_rate", decay_rate), ("epsilon", epsilon))
    for name, value in hyperparameters:
        # The attribute name doubles as the variable's name.
        setattr(self, name, shared_floatx(value, name))
        add_role(getattr(self, name), ALGORITHM_HYPERPARAMETER)
|
562 | ||
563 | def compute_step(self, parameter, previous_step): |
|
564 | mean_square_step_tm1 = _create_algorithm_buffer_for( |
|
565 | parameter, "mean_square_step_tm1") |
|
566 | mean_square_delta_x_tm1 = _create_algorithm_buffer_for( |
|
567 | parameter, "mean_square_delta_x_tm1") |
|
568 | ||
569 | mean_square_step_t = ( |
|
570 | self.decay_rate * mean_square_step_tm1 + |
|
@@ 725-765 (lines=41) @@ | ||
722 | filters or entire weight tensors. Combine with :class:`Restrict` |
|
723 | (and possibly :class:`CompositeRule`), to apply such constraints only |
|
724 | to certain variables and/or apply different norm constraints to |
|
725 | different variables. |
|
726 | ||
727 | Parameters |
|
728 | ---------- |
|
729 | threshold : float |
|
730 | Maximum norm for a given (portion of a) tensor. |
|
731 | axis : int or iterable, optional |
|
732 | An integer single axis, or an iterable collection of integer |
|
733 | axes over which to sum in order to calculate the L2 norm. If |
|
734 | `None` (the default), the norm is computed over all elements |
|
735 | of the tensor. |
|
736 | ||
737 | Notes |
|
738 | ----- |
|
739 | Because of the way the :class:`StepRule` API works, this particular |
|
740 | rule implements norm clipping of the value *after* update in the |
|
741 | following way: it computes ``parameter - previous_step``, scales it |
|
742 | to have (possibly axes-wise) norm(s) of at most `threshold`, |
|
743 | then subtracts *that* value from `parameter` to yield an 'equivalent |
|
744 | step' that respects the desired norm constraints. This procedure |
|
745 | implicitly assumes one is doing simple (stochastic) gradient descent, |
|
746 | and so steps computed by this step rule may not make sense for use |
|
747 | in other contexts. |
|
748 | ||
749 | Investigations into max-norm regularization date from [Srebro2005]_. |
|
750 | The first appearance of this technique as a regularization method |
|
751 | for the weight vectors of individual hidden units in feed-forward |
|
752 | neural networks may be [Hinton2012]_. |
|
753 | ||
754 | .. [Srebro2005] Nathan Srebro and Adi Shraibman. |
|
755 | "Rank, Trace-Norm and Max-Norm". *18th Annual Conference |
|
756 | on Learning Theory (COLT)*, June 2005. |
|
757 | ||
758 | .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava, |
|
759 | Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov. |
|
760 | "Improving neural networks by preventing co-adaptation of |
|
761 | feature detectors". arXiv:1207.0580. |
|
762 | ||
763 | """ |
|
764 | def __init__(self, threshold, axis=None): |
|
765 | axis = pack(axis) if axis is not None else () |
|
766 | self.axis = set(axis) |
|
767 | self.threshold = shared_floatx(threshold, "threshold") |
|
768 | add_role(self.threshold, ALGORITHM_HYPERPARAMETER) |