@@ 522-567 (lines=46) @@

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step is scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for the learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for the momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale = Scale(learning_rate=learning_rate)
        basic_momentum = BasicMomentum(momentum=momentum)
        self.learning_rate = scale.learning_rate
        self.momentum = basic_momentum.momentum
        self.components = [scale, basic_momentum]

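
# A minimal sketch of the update this composite rule yields, assuming the
# components are applied in the listed order (Scale, then BasicMomentum).
# Plain Python pseudocode rather than the Theano graph the class builds;
# the names below are hypothetical and for illustration only.
def _momentum_step_sketch(gradient, velocity, learning_rate=1.0, momentum=0.):
    # Scale: multiply the raw gradient by the learning rate.
    scaled = learning_rate * gradient
    # BasicMomentum: fold the scaled gradient into the velocity and use the
    # velocity itself as the step, which is then subtracted from the parameter.
    new_velocity = momentum * velocity + scaled
    return new_velocity, new_velocity  # (step, updated velocity)
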
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")

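
# A rough NumPy sketch of the ADADELTA update described in Zeiler's paper
# (arXiv:1212.5701), shown for orientation only; the class builds the
# equivalent Theano updates, and the names below are hypothetical.
import numpy

def _adadelta_step_sketch(gradient, mean_square_grad, mean_square_step,
                          decay_rate=0.95, epsilon=1e-6):
    # Decaying average of squared gradients.
    mean_square_grad = (decay_rate * mean_square_grad
                        + (1 - decay_rate) * gradient ** 2)
    # Rescale the gradient by the ratio of the two RMS estimates; the
    # returned step is meant to be subtracted from the parameter.
    step = (numpy.sqrt(mean_square_step + epsilon)
            / numpy.sqrt(mean_square_grad + epsilon)) * gradient
    # Decaying average of squared steps, used for the next update.
    mean_square_step = (decay_rate * mean_square_step
                        + (1 - decay_rate) * step ** 2)
    return step, mean_square_grad, mean_square_step
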
@@ 725-765 (lines=41) @@

        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        steps = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return steps, []

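
# For intuition, a scalar walk-through of the rescaling above (hypothetical
# numbers, not taken from the module): with a threshold of 5 and a combined
# step norm of 10, the multiplier is 5 / 10 = 0.5, so every step is halved
# and the rescaled joint norm equals the threshold; a norm already below the
# threshold gives a multiplier of 1 and leaves the steps unchanged.
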
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on,
    e.g., the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`) to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        A single integer axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* the update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`, then
    subtracts *that* value from `parameter` to yield an 'equivalent step'
    that respects the desired norm constraints. This procedure implicitly
    assumes one is doing simple (stochastic) gradient descent, and so
    steps computed by this step rule may not make sense for use in other
    contexts.

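    As a rough sketch of that procedure for a single tensor with
    ``axis=None`` (illustrative pseudocode; the names below are
    hypothetical, and the class itself builds the corresponding Theano
    expressions)::

        updated = parameter - previous_step
        norm = ((updated ** 2).sum()) ** 0.5
        if norm > threshold:
            updated *= threshold / norm
        equivalent_step = parameter - updated
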
    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference