@@ 522-567 (lines=46) @@

    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster). Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        mean_square_step_tm1 = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        mean_square_step_t = (
            self.decay_rate * mean_square_step_tm1 +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        rms_step_t = tensor.maximum(
            tensor.sqrt(mean_square_step_t), self.epsilon)
        step = previous_step / rms_step_t
        updates = [(mean_square_step_tm1, mean_square_step_t)]
        return step, updates
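
The compute_step above keeps an exponential moving average of the squared
step and divides the step by its root, clamped from below so the scaling
never exceeds max_scaling. A minimal NumPy sketch of the same arithmetic,
on plain arrays instead of Theano shared buffers (function and variable
names here are illustrative, not part of the library):

import numpy as np

def basic_rmsprop_step(previous_step, mean_square_tm1,
                       decay_rate=0.9, max_scaling=1e5):
    # Exponential moving average of the squared step.
    mean_square_t = (decay_rate * mean_square_tm1 +
                     (1 - decay_rate) * np.square(previous_step))
    # Clamp the RMS from below so the scaling never exceeds max_scaling.
    rms = np.maximum(np.sqrt(mean_square_t), 1.0 / max_scaling)
    return previous_step / rms, mean_square_t

# After a few iterations the scaled step has roughly unit magnitude
# per coordinate, regardless of the raw step size.
step = np.array([0.5, -0.02])
accum = np.zeros_like(step)
for _ in range(10):
    scaled, accum = basic_rmsprop_step(step, accum)
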
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.
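
As the Notes for BasicRMSProp indicate, the normalization is meant to be
applied before a plain scaling. A hedged sketch of how the two rules from
this module might be composed by hand, and of the packaged composite (the
exact RMSProp constructor arguments are an assumption, not taken from this
fragment):

# Normalize first, then scale by the learning rate.
step_rule = CompositeRule([BasicRMSProp(decay_rate=0.9, max_scaling=1e5),
                           Scale(0.001)])

# The RMSProp class is intended as the packaged equivalent of the above.
step_rule = RMSProp(learning_rate=0.001, decay_rate=0.9)
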
@@ 725-765 (lines=41) @@

    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
       stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
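
For comparison with BasicRMSProp above, AdaGrad accumulates the full sum of
squared steps instead of a decaying average, so coordinates that have moved
a lot receive progressively smaller steps. A small NumPy sketch of the same
accumulation (illustrative only; the rule itself builds Theano expressions
and shared buffers):

import numpy as np

def adagrad_step(previous_step, ssq, learning_rate=0.002, epsilon=1e-6):
    # Running sum of squared steps seen so far (never decays).
    ssq_t = np.square(previous_step) + ssq
    # Per-coordinate scaling: larger accumulated history -> smaller step.
    step = learning_rate * previous_step / (np.sqrt(ssq_t) + epsilon)
    return step, ssq_t

step = np.array([1.0, 0.1])
ssq = np.zeros_like(step)
for _ in range(3):
    scaled, ssq = adagrad_step(step, ssq)
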
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,