1 | """Training algorithms.""" |
||
0 ignored issues
–
show
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.bn -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.
Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.
Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.sequences -> blocks.bricks.simple -> blocks.bricks.interfaces -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.
Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.recurrent -> blocks.bricks.recurrent.misc -> blocks.bricks.recurrent.base -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.
Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.recurrent -> blocks.bricks.recurrent.misc -> blocks.bricks.parallel -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.
Loading history...
There seems to be a cyclic import (blocks.bricks -> blocks.bricks.recurrent -> blocks.bricks.recurrent.architectures -> blocks.bricks.simple -> blocks.bricks.wrappers -> blocks.bricks.base -> blocks.graph -> blocks.graph.bn).
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.
Loading history...
|
|||
2 | import logging |
||
3 | import itertools |
||
4 | from abc import ABCMeta, abstractmethod |
||
5 | from collections import OrderedDict |
||
6 | from collections import Mapping |
||
7 | from six.moves import reduce |
||
8 | |||
9 | from picklable_itertools.extras import equizip |
||
10 | |||
11 | import theano |
||
12 | from six import add_metaclass |
||
13 | from theano import tensor |
||
14 | |||
15 | from blocks.graph import ComputationGraph |
||
16 | from blocks.roles import add_role, ALGORITHM_HYPERPARAMETER, ALGORITHM_BUFFER |
||
17 | from blocks.theano_expressions import l2_norm |
||
18 | from blocks.utils import (dict_subset, pack, shared_floatx, |
||
19 | shared_floatx_zeros_matching) |
||
20 | |||
21 | logger = logging.getLogger(__name__) |
||
22 | |||
23 | |||
def _create_algorithm_buffer_for(param, *args, **kwargs):
    """Allocate a zeroed shared buffer matching ``param``.

    The buffer is tagged with the parameter it belongs to (via
    ``tag.for_parameter``) and given the ``ALGORITHM_BUFFER`` role so that
    it can be identified among the algorithm's shared variables later.
    """
    buffer_ = shared_floatx_zeros_matching(param, *args, **kwargs)
    buffer_.tag.for_parameter = param
    add_role(buffer_, ALGORITHM_BUFFER)
    return buffer_
||
29 | |||
30 | |||
@add_metaclass(ABCMeta)
class TrainingAlgorithm(object):
    """Base class for training algorithms.

    A training algorithm object has a simple life-cycle.
    First it is initialized by calling its :meth:`initialize` method.
    At this stage, for instance, Theano functions can be compiled.
    After that the :meth:`process_batch` method is repeatedly
    called with a batch of training data as a parameter.

    """
    @abstractmethod
    def initialize(self, **kwargs):
        """Initialize the training algorithm."""
        pass

    @abstractmethod
    def process_batch(self, batch):
        """Process a batch of training data.

        Parameters
        ----------
        batch : dict
            A dictionary of (source name, data) pairs.

        """
        pass
||
58 | |||
59 | |||
# Message used when a batch contains sources that no graph input matches.
# Formatted with the batch's source names and the graph's input names.
variable_mismatch_error = """

Blocks tried to match the sources ({sources}) of the training dataset to \
the names of the Theano variables ({variables}), but failed to do so. \
If you want to train on a subset of the sources that your dataset provides, \
pass the `sources` keyword argument to its constructor, use the \
FilterSources transformer provided by Fuel, or pass on_unused_sources='warn' \
or on_unused_sources='ignore' to the GradientDescent algorithm."""

# Message used when the batch is missing data for some graph inputs.
source_missing_error = """

Blocks didn't find all the sources ({sources}) of the training dataset \
that match the names of the Theano variables ({variables})."""


# Message used when a parameter order cannot be inferred reproducibly
# (plain dict of gradients with no explicit parameter list).
determinism_error = """Cannot infer parameter list in a fixed order.

Because dictionaries are unordered (and Python uses randomized hashing, \
which can change the iteration order over the same dictionary from one \
interpreter session to the next), Blocks cannot infer the parameters list \
from a plain dictionary of gradients in an order that is reproducible \
across interpreter sessions; please either specify the parameters \
explicitly or pass gradients as an OrderedDict (though exercise care in \
constructing that OrderedDict, as an OrderedDict created by iterating \
over an unordered iterable (e.g. a dict) will still have an arbitrary \
and unpredictable order that could cause problems with \
reproducibility)."""
||
87 | |||
88 | |||
class UpdatesAlgorithm(TrainingAlgorithm):
    """Base class for algorithms that use Theano functions with updates.

    Parameters
    ----------
    updates : list of tuples or :class:`~collections.OrderedDict`
        The updates that should be performed.
    theano_func_kwargs : dict, optional
        A passthrough to `theano.function` for additional arguments.
        Useful for passing `profile` or `mode` arguments to the theano
        function that will be compiled for the algorithm.
    on_unused_sources : str, one of 'raise' (default), 'ignore', 'warn'
        Controls behavior when not all sources in a batch are used
        (i.e. there is no variable with a matching name in the inputs
        of the computational graph of the updates).

    Attributes
    ----------
    updates : list of :class:`~tensor.TensorSharedVariable` updates
        Updates to be done for every batch. It is required that the
        updates are done using the old values of optimized parameters.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    """
    def __init__(self, updates=None, theano_func_kwargs=None,
                 on_unused_sources='raise', **kwargs):
        self.updates = [] if updates is None else updates
        self.theano_func_kwargs = (theano_func_kwargs if theano_func_kwargs
                                   is not None else dict())
        self.on_unused_sources = on_unused_sources
        super(UpdatesAlgorithm, self).__init__(**kwargs)

    def initialize(self):
        """Compile the Theano training function from the updates."""
        logger.info("Initializing the training algorithm")
        update_values = [new_value for _, new_value in self.updates]
        logger.debug("Inferring graph inputs...")
        self.inputs = ComputationGraph(update_values).inputs
        logger.debug("Compiling training function...")
        self._function = theano.function(
            self.inputs, [], updates=self.updates, **self.theano_func_kwargs)
        logger.info("The training algorithm is initialized")

    @property
    def updates(self):
        return self._updates

    @updates.setter
    def updates(self, value):
        self._updates = value

    def add_updates(self, updates):
        """Add updates to the training process.

        The updates will be done _before_ the parameters are changed.

        Parameters
        ----------
        updates : list of tuples or :class:`~collections.OrderedDict`
            The updates to add.

        Raises
        ------
        ValueError
            If `updates` is neither a list nor an `OrderedDict`.

        """
        if isinstance(updates, OrderedDict):
            updates = list(updates.items())
        if not isinstance(updates, list):
            # Previously a bare ValueError with no message; be explicit
            # about what the caller passed and what was expected.
            raise ValueError(
                "updates must be a list of tuples or an OrderedDict, "
                "got {}".format(type(updates)))
        self.updates.extend(updates)

    def _validate_source_names(self, batch):
        # Check that the batch provides data for every graph input; react
        # to extra (unused) batch sources according to `on_unused_sources`.
        in_names = [v.name for v in self.inputs]

        if not set(in_names).issubset(set(batch.keys())):
            raise ValueError("Didn't find all sources: " +
                             source_missing_error.format(
                                 sources=batch.keys(),
                                 variables=in_names))
        if not set(batch.keys()).issubset(set(in_names)):
            if self.on_unused_sources == 'ignore':
                pass
            elif self.on_unused_sources == 'warn':
                # Only warn once per algorithm instance.
                if not hasattr(self, '_unused_source_warned'):
                    # Logger.warn is deprecated; use Logger.warning.
                    logger.warning(variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
                self._unused_source_warned = True
            elif self.on_unused_sources == 'raise':
                raise ValueError(
                    "mismatch of variable names and data sources" +
                    variable_mismatch_error.format(
                        sources=batch.keys(),
                        variables=in_names))
            else:
                raise ValueError("Wrong value of on_unused_sources: {}."
                                 .format(self.on_unused_sources))

    def process_batch(self, batch):
        """Validate the batch's sources and run the compiled function."""
        self._validate_source_names(batch)
        # Arguments must be passed in the order of self.inputs.
        ordered_batch = [batch[v.name] for v in self.inputs]
        self._function(*ordered_batch)
||
191 | |||
192 | |||
class GradientDescent(UpdatesAlgorithm):
    """A base class for all gradient descent algorithms.

    By "gradient descent" we mean a training algorithm of the following
    form:

    .. code-block:: python

        for batch in data:
            steps = step_rule.compute_steps(parameters,
                                            gradients_wr_parameters)
            for parameter in parameters:
                parameter -= steps[parameter]

    Note, that the step is *subtracted, not added*! This is done in order
    to make step rule chaining possible.

    Parameters
    ----------
    cost : :class:`~tensor.TensorVariable`, optional
        The objective to be minimized. Unused if `gradients` is specified.
    parameters : list of :class:`~tensor.TensorSharedVariable`, optional
        The parameters to be tuned. If not provided, inferred from the
        keys of `gradients` (in which case `gradients` *must* be an
        `OrderedDict`).
    step_rule : instance of :class:`StepRule`, optional
        An object encapsulating most of the algorithm's logic. Its
        `compute_steps` method is called to get Theano expression for
        steps. Note, that the step rule might have a state, e.g. to
        remember a weighted sum of gradients from previous steps like it is
        done in gradient descent with momentum. If ``None``, an instance of
        :class:`Scale` is created.
    gradients : OrderedDict or list of 2-tuples, optional
        A dictionary mapping a parameter to an expression for the cost's
        gradient with respect to the parameter, or equivalently, a list of
        (parameter, gradient) tuples. If ``None``, the gradients
        are taken automatically using :func:`theano.gradient.grad`.
    known_grads : dict, optional
        A passthrough to `theano.tensor.grad`'s `known_grads` argument.
        Useful when you know the [approximate] gradients of some
        sub-expressions and would like Theano to use that information
        to compute parameter gradients. Only makes sense when `gradients`
        is `None`.
    consider_constant : list, optional
        A passthrough to `theano.tensor.grad`'s `consider_constant`
        argument. A list of expressions through which gradients will not
        be backpropagated. Only makes sense when `gradients` is `None`.

    Attributes
    ----------
    gradients : OrderedDict
        The gradient dictionary.
    step_rule : instance of :class:`StepRule`
        The step rule.

    Notes
    -----
    Changing `updates` attribute or calling `add_updates` after
    the `initialize` method is called will have no effect.

    If a cost and parameters are provided, gradients are taken immediately
    upon construction, and changes to these attributes after construction
    will have no effect.

    `gradients` must be an `OrderedDict` if `parameters` is unspecified
    because ordinary dictionaries have an unpredictable iteration
    order due to hash randomization (which is enabled by default since
    versions 2.7.3 and 3.2.3 of Python). This source of variability,
    when combined with Theano's heuristic graph optimizations, can cause
    serious reproducibility issues.

    """
    def __init__(self, cost=None, parameters=None, step_rule=None,
                 gradients=None, known_grads=None, consider_constant=None,
                 **kwargs):
        # Set initial values for cost, parameters, gradients.
        self.cost = cost
        self.parameters = parameters
        # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
        # as we don't want to convert dict -> OrderedDict and give it
        # an arbitrary, non-deterministic order.
        if gradients is not None and not isinstance(gradients, Mapping):
            gradients = OrderedDict(gradients)
        self.gradients = gradients

        # If we don't have gradients, we'll need to infer them from the
        # cost and the parameters, both of which must not be None.
        if not self.gradients:
            self.gradients = self._compute_gradients(known_grads,
                                                     consider_constant)
        else:
            if cost is not None:
                logger.warning(('{}: gradients already specified directly; '
                                'cost is unused.'
                                .format(self.__class__.__name__)))
            if self.parameters is None and isinstance(gradients, OrderedDict):
                # If the dictionary is ordered, it's safe to use the keys
                # as they have a deterministic order.
                self.parameters = list(self.gradients.keys())
            elif self.parameters is not None:
                # If parameters and gradients.keys() don't match we can
                # try to recover if gradients is ordered.
                if set(self.parameters) != set(self.gradients.keys()):
                    # Logger.warn is deprecated; use Logger.warning.
                    logger.warning("Specified parameters list does not match "
                                   "keys in provided gradient dictionary; "
                                   "using parameters inferred from gradients")
                    if not isinstance(self.gradients, OrderedDict):
                        raise ValueError(determinism_error)
                    self.parameters = list(self.gradients.keys())
            else:
                # self.parameters is not None, and gradients isn't
                # an OrderedDict. We can't do anything safe.
                raise ValueError(determinism_error)
            if known_grads:
                raise ValueError("known_grads has no effect when gradients "
                                 "are passed in")
            if consider_constant is not None:
                raise ValueError("consider_constant has no effect when "
                                 "gradients are passed in")

        # The order in which the different gradient terms appears
        # here matters, as floating point addition is non-commutative (and
        # Theano's graph optimizations are not order-independent).
        # This is why we do not use .values().
        gradient_values = [self.gradients[p] for p in self.parameters]
        self.total_gradient_norm = (l2_norm(gradient_values)
                                    .copy(name="total_gradient_norm"))

        self.step_rule = step_rule if step_rule else Scale()
        logger.debug("Computing parameter steps...")
        self.steps, self.step_rule_updates = (
            self.step_rule.compute_steps(self.gradients))

        # Same as gradient_values above: the order may influence a
        # bunch of things, so enforce a consistent one (don't use
        # .values()).
        step_values = [self.steps[p] for p in self.parameters]
        self.total_step_norm = (l2_norm(step_values)
                                .copy(name="total_step_norm"))

        # Once again, iterating on gradients may not be deterministically
        # ordered if it is not an OrderedDict. We add the updates here in
        # the order specified in self.parameters. Keep it this way to
        # maintain reproducibility.
        kwargs.setdefault('updates', []).extend(
            itertools.chain(((parameter, parameter - self.steps[parameter])
                             for parameter in self.parameters),
                            self.step_rule_updates)
        )
        super(GradientDescent, self).__init__(**kwargs)

    def _compute_gradients(self, known_grads, consider_constant):
        """Take gradients of the cost with respect to the parameters.

        Both `self.cost` and `self.parameters` must be set.
        """
        if self.cost is None:
            raise ValueError("can't infer gradients; no cost specified")
        elif self.parameters is None or len(self.parameters) == 0:
            raise ValueError("can't infer gradients; no parameters "
                             "specified")
        # While this strictly speaking could be a dict and not an
        # OrderedDict (because we iterate over it in the order of
        # self.parameters), this guards a little bit against
        # nondeterminism introduced by future refactoring.
        logger.info("Taking the cost gradient")
        gradients = OrderedDict(
            equizip(self.parameters, tensor.grad(
                self.cost, self.parameters,
                known_grads=known_grads,
                consider_constant=consider_constant)))
        logger.info("The cost gradient computation graph is built")
        return gradients
||
362 | |||
363 | |||
@add_metaclass(ABCMeta)
class StepRule(object):
    """A rule to compute steps for a gradient descent algorithm."""
    def compute_step(self, parameter, previous_step):
        """Build a Theano expression for the step for a parameter.

        The default implementation of :meth:`compute_steps` calls this
        method for every parameter, which relieves subclasses from
        writing the loop themselves.

        Parameters
        ----------
        parameter : :class:`~tensor.TensorSharedVariable`
            The parameter.
        previous_step : :class:`~tensor.TensorVariable`
            Some quantity related to the gradient of the cost with respect
            to the parameter, either the gradient itself or a step in a
            related direction.

        Returns
        -------
        step : :class:`~theano.Variable`
            Theano variable for the step to take.
        updates : list
            A list of tuples representing updates to be performed. This
            is useful for stateful rules such as :class:`Momentum` which
            need to update shared variables after iterations.

        """
        raise NotImplementedError

    def compute_steps(self, previous_steps):
        """Build a Theano expression for steps for all parameters.

        Override this method if you want to process the steps
        with respect to all parameters as a whole, not parameter-wise.

        Parameters
        ----------
        previous_steps : OrderedDict
            An :class:`~OrderedDict` of
            (:class:`~tensor.TensorSharedVariable`
            :class:`~tensor.TensorVariable`) pairs. The keys are the
            parameters being trained, the values are the expressions for
            quantities related to gradients of the cost with respect to
            the parameters, either the gradients themselves or steps in
            related directions.

        Returns
        -------
        steps : OrderedDict
            A dictionary of the proposed steps in the same form as
            `previous_steps`.
        updates : list
            A list of tuples representing updates to be performed.

        """
        # Delegate to compute_step() per parameter, then regroup the
        # (step, updates) pairs into one OrderedDict and one flat list.
        per_parameter = [self.compute_step(parameter,
                                           previous_steps[parameter])
                         for parameter in previous_steps]
        all_steps, all_updates = equizip(*per_parameter)
        steps = OrderedDict(equizip(previous_steps.keys(), all_steps))
        updates = list(itertools.chain(*all_updates))
        return steps, updates
||
428 | |||
429 | |||
class CompositeRule(StepRule):
    """Chains several step rules.

    Parameters
    ----------
    components : list of :class:`StepRule`
        The learning rules to be chained. The rules will be applied in the
        order as given.

    """
    def __init__(self, components):
        self.components = components

    def compute_steps(self, previous_steps):
        # Feed the output steps of each component into the next one,
        # collecting every component's updates along the way.
        current_steps = previous_steps
        all_updates = []
        for component in self.components:
            current_steps, component_updates = (
                component.compute_steps(current_steps))
            all_updates.extend(component_updates)
        return current_steps, all_updates
||
450 | |||
451 | |||
class Scale(StepRule):
    """A step in the direction proportional to the previous step.

    If used in :class:`GradientDescent` alone, this step rule implements
    steepest descent.

    Parameters
    ----------
    learning_rate : float
        The learning rate by which the previous step is multiplied to
        produce the step.

    Attributes
    ----------
    learning_rate : :class:`~tensor.TensorSharedVariable`
        The shared variable storing the learning rate used.

    """
    def __init__(self, learning_rate=1.0):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Stateless rule: scale the incoming step, no updates needed.
        scaled_step = self.learning_rate * previous_step
        return scaled_step, []
||
476 | |||
477 | |||
class BasicMomentum(StepRule):
    """Accumulates step with exponential discount.

    Parameters
    ----------
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`Momentum`.

    """
    def __init__(self, momentum=0.):
        self.momentum = shared_floatx(momentum, "momentum")
        add_role(self.momentum, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # The buffer carries the exponentially discounted sum of previous
        # steps across iterations; it is updated to the newly emitted step.
        accumulated_velocity = _create_algorithm_buffer_for(
            parameter, "velocity")
        new_step = self.momentum * accumulated_velocity + previous_step
        return new_step, [(accumulated_velocity, new_step)]
||
502 | |||
503 | |||
class Momentum(CompositeRule):
    """Accumulates step with exponential discount.

    Combines :class:`BasicMomentum` and :class:`Scale` to form the
    usual momentum step rule.

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    momentum : float, optional
        The momentum coefficient. Defaults to 0.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    momentum : :class:`~tensor.SharedVariable`
        A variable for momentum.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, momentum=0.):
        scale_rule = Scale(learning_rate=learning_rate)
        momentum_rule = BasicMomentum(momentum=momentum)
        # Expose the component hyperparameters so callers (e.g.
        # SharedVariableModifier) can adjust them during training.
        self.learning_rate = scale_rule.learning_rate
        self.momentum = momentum_rule.momentum
        self.components = [scale_rule, momentum_rule]
||
535 | |||
536 | |||
class AdaDelta(StepRule):
    """Adapts the step size over time using only first order information.

    Parameters
    ----------
    decay_rate : float, optional
        Decay rate in [0, 1]. Defaults to 0.95.
    epsilon : float, optional
        Stabilizing constant for RMS. Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.

    """
    def __init__(self, decay_rate=0.95, epsilon=1e-6):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        # Running average of the squared incoming steps.
        mean_square_step_tm1 = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        # Running average of the squared steps this rule has produced.
        mean_square_delta_x_tm1 = _create_algorithm_buffer_for(
            parameter, "mean_square_delta_x_tm1")

        # Fold the new squared step into the running average.
        mean_square_step_t = (
            self.decay_rate * mean_square_step_tm1 +
            (1 - self.decay_rate) * tensor.sqr(previous_step)
        )

        # RMS of both accumulators; epsilon keeps the ratio finite when
        # the averages are near zero.
        rms_delta_x_tm1 = tensor.sqrt(mean_square_delta_x_tm1 + self.epsilon)
        rms_step_t = tensor.sqrt(mean_square_step_t + self.epsilon)
        # Rescale the incoming step by the ratio of the two RMS values.
        delta_x_t = rms_delta_x_tm1 / rms_step_t * previous_step

        # Fold the squared produced step into its running average.
        mean_square_delta_x_t = (
            self.decay_rate * mean_square_delta_x_tm1 +
            (1 - self.decay_rate) * tensor.sqr(delta_x_t)
        )

        step = delta_x_t
        updates = [(mean_square_step_tm1, mean_square_step_t),
                   (mean_square_delta_x_tm1, mean_square_delta_x_t)]
        return step, updates
||
587 | |||
588 | |||
class BasicRMSProp(StepRule):
    """Scales the step size by a running average of the recent step norms.

    Parameters
    ----------
    decay_rate : float, optional
        How fast the running average decays, value in [0, 1]
        (lower is faster). Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Needs to be greater than 0. Defaults to 1e5.

    Notes
    -----
    This step rule is intended to be used in conjunction with another
    step rule, _e.g._ :class:`Scale`. For an all-batteries-included
    experience, look at :class:`RMSProp`.

    In general, this step rule should be used _before_ other step rules,
    because it has normalization properties that may undo their work.
    For instance, it should be applied first when used in conjunction
    with :class:`Scale`.

    For more information, see [Hint2014]_.

    """
    def __init__(self, decay_rate=0.9, max_scaling=1e5):
        if not 0.0 <= decay_rate <= 1.0:
            raise ValueError("decay rate needs to be in [0, 1]")
        if max_scaling <= 0:
            raise ValueError("max. scaling needs to be greater than 0")
        self.decay_rate = shared_floatx(decay_rate, "decay_rate")
        add_role(self.decay_rate, ALGORITHM_HYPERPARAMETER)
        # Floor for the RMS denominator; equivalently caps the step
        # scaling at max_scaling.
        self.epsilon = 1. / max_scaling

    def compute_step(self, parameter, previous_step):
        # Running average of the squared incoming steps, kept across
        # iterations in a shared buffer.
        mean_square_step_tm1 = _create_algorithm_buffer_for(
            parameter, "mean_square_step_tm1")
        mean_square_step_t = (
            self.decay_rate * mean_square_step_tm1 +
            (1 - self.decay_rate) * tensor.sqr(previous_step))
        # Clamp the RMS from below so the division cannot blow up.
        rms_step_t = tensor.maximum(
            tensor.sqrt(mean_square_step_t), self.epsilon)
        step = previous_step / rms_step_t
        updates = [(mean_square_step_tm1, mean_square_step_t)]
        return step, updates
||
635 | |||
636 | |||
class RMSProp(CompositeRule):
    """Scales the step size by a running average of the recent step norms.

    Combines :class:`BasicRMSProp` and :class:`Scale` to form the step rule
    described in [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf

    Parameters
    ----------
    learning_rate : float, optional
        The learning rate by which the previous step scaled. Defaults to 1.
    decay_rate : float, optional
        How fast the running average decays (lower is faster).
        Defaults to 0.9.
    max_scaling : float, optional
        Maximum scaling of the step size, in case the running average is
        really small. Defaults to 1e5.

    Attributes
    ----------
    learning_rate : :class:`~tensor.SharedVariable`
        A variable for learning rate.
    decay_rate : :class:`~tensor.SharedVariable`
        A variable for decay rate.

    See Also
    --------
    :class:`SharedVariableModifier`

    """
    def __init__(self, learning_rate=1.0, decay_rate=0.9, max_scaling=1e5):
        normalization_rule = BasicRMSProp(decay_rate=decay_rate,
                                          max_scaling=max_scaling)
        scaling_rule = Scale(learning_rate=learning_rate)
        # Expose the component hyperparameters for runtime adjustment.
        self.learning_rate = scaling_rule.learning_rate
        self.decay_rate = normalization_rule.decay_rate
        # Normalization must come first; Scale would otherwise be undone.
        self.components = [normalization_rule, scaling_rule]
||
677 | |||
678 | |||
class StepClipping(StepRule):
    """Rescales an entire step if its L2 norm exceeds a threshold.

    When the previous steps are the gradients, this step rule performs
    gradient clipping.

    Parameters
    ----------
    threshold : float, optional
        The maximum permitted L2 norm for the step. The step
        will be rescaled to be not higher than this quantity.
        If ``None``, no rescaling will be applied.

    Attributes
    ----------
    threshold : :class:`.tensor.TensorSharedVariable`
        The shared variable storing the clipping threshold used.

    """
    def __init__(self, threshold=None):
        if threshold is not None:
            threshold = shared_floatx(threshold, "threshold")
            add_role(threshold, ALGORITHM_HYPERPARAMETER)
        self.threshold = threshold

    def compute_steps(self, previous_steps):
        # No threshold configured: pass the steps through untouched.
        if self.threshold is None:
            return previous_steps, []
        # Compute one global norm over all steps and a common rescaling
        # factor (1 when the norm is already below the threshold).
        norm = l2_norm(previous_steps.values())
        multiplier = tensor.switch(norm < self.threshold,
                                   1, self.threshold / norm)
        clipped = OrderedDict(
            (parameter, step * multiplier)
            for parameter, step in previous_steps.items())
        return clipped, []
||
715 | |||
716 | |||
class VariableClipping(StepRule):
    """Clip the maximum norm of individual variables along certain axes.

    This :class:`StepRule` can be used to implement L2 norm constraints on
    e.g. the weight vectors of individual hidden units, convolutional
    filters or entire weight tensors. Combine with :class:`Restrict`
    (and possibly :class:`CompositeRule`), to apply such constraints only
    to certain variables and/or apply different norm constraints to
    different variables.

    Parameters
    ----------
    threshold : float
        Maximum norm for a given (portion of a) tensor.
    axis : int or iterable, optional
        An integer single axis, or an iterable collection of integer
        axes over which to sum in order to calculate the L2 norm. If
        `None` (the default), the norm is computed over all elements
        of the tensor.

    Notes
    -----
    Because of the way the :class:`StepRule` API works, this particular
    rule implements norm clipping of the value *after* update in the
    following way: it computes ``parameter - previous_step``, scales it
    to have (possibly axes-wise) norm(s) of at most `threshold`,
    then subtracts *that* value from `parameter` to yield an 'equivalent
    step' that respects the desired norm constraints. This procedure
    implicitly assumes one is doing simple (stochastic) gradient descent,
    and so steps computed by this step rule may not make sense for use
    in other contexts.

    Investigations into max-norm regularization date from [Srebro2005]_.
    The first appearance of this technique as a regularization method
    for the weight vectors of individual hidden units in feed-forward
    neural networks may be [Hinton2012]_.

    .. [Srebro2005] Nathan Srebro and Adi Shraibman.
       "Rank, Trace-Norm and Max-Norm". *18th Annual Conference
       on Learning Theory (COLT)*, June 2005.

    .. [Hinton2012] Geoffrey E. Hinton, Nitish Srivastava,
       Alex Krizhevsky, Ilya Sutskever, Ruslan R. Salakhutdinov.
       "Improving neural networks by preventing co-adaptation of
       feature detectors". arXiv:1207.0580.

    """
    def __init__(self, threshold, axis=None):
        packed_axes = () if axis is None else pack(axis)
        self.axis = set(packed_axes)
        self.threshold = shared_floatx(threshold, "threshold")
        add_role(self.threshold, ALGORITHM_HYPERPARAMETER)
        # A repeated axis would silently collapse when converted to a
        # set above, so reject it explicitly.
        if len(packed_axes) != len(self.axis):
            raise ValueError("axis must be unique")

    def compute_step(self, parameter, previous_step):
        """Return an 'equivalent step' whose post-update value is norm-clipped."""
        if any(ax >= previous_step.ndim for ax in self.axis):
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
                self.axis, parameter, previous_step.ndim))
        # The value the parameter would take after the plain update.
        updated_value = parameter - previous_step
        if self.axis:
            # Axes-wise norms, with broadcasting preserved via keepdims.
            summed_squares = reduce(
                lambda t, a: t.sum(axis=a, keepdims=True),
                sorted(self.axis), tensor.sqr(updated_value))
            norms = tensor.sqrt(summed_squares)
        else:
            # Single scalar norm over all elements.
            norms = l2_norm([updated_value])
        # We want a step s* that is the same as scaling
        # (parameter - previous_step) by threshold / norm
        # when threshold < norm.
        shrinking_step = parameter - (self.threshold / norms) * updated_value
        return tensor.switch(norms > self.threshold,
                             shrinking_step,
                             previous_step), ()
||
792 | |||
793 | |||
class AdaGrad(StepRule):
    """Implements the AdaGrad learning rule.

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    epsilon : float, optional
        Stabilizing constant for one over root of sum of squares.
        Defaults to 1e-6.

    Notes
    -----
    For more information, see [ADAGRAD]_.

    .. [ADAGRAD] Duchi J, Hazan E, Singer Y.,
       *Adaptive subgradient methods for online learning and
        stochastic optimization*,
       http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf

    """
    def __init__(self, learning_rate=0.002, epsilon=1e-6):
        # Fix: the docstring previously claimed a default of 0.0002,
        # contradicting the actual default of 0.002 used here.
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        add_role(self.learning_rate, ALGORITHM_HYPERPARAMETER)
        add_role(self.epsilon, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Scale the step by one over the root of the accumulated squares.

        A per-parameter buffer accumulates the sum of squared previous
        steps across iterations.
        """
        name = 'adagrad_sqs'
        if parameter.name:
            name += '_' + parameter.name
        ssq = _create_algorithm_buffer_for(parameter, name=name)

        # Accumulate squared gradients; epsilon guards against division
        # by (near-)zero early in training.
        ssq_t = (tensor.sqr(previous_step) + ssq)
        step = (self.learning_rate * previous_step /
                (tensor.sqrt(ssq_t) + self.epsilon))

        updates = [(ssq, ssq_t)]

        return step, updates
||
835 | |||
836 | |||
class Adam(StepRule):
    """Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.002.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.1.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.001.
    epsilon : float, optional
        Default value is set to 1e-8.
    decay_factor : float, optional
        Default value is set to 1 - 1e-8.

    """
    def __init__(self, learning_rate=0.002,
                 beta1=0.1, beta2=0.001, epsilon=1e-8,
                 decay_factor=(1 - 1e-8)):
        self.learning_rate = shared_floatx(learning_rate, "learning_rate")
        self.beta1 = shared_floatx(beta1, "beta1")
        self.beta2 = shared_floatx(beta2, "beta2")
        self.epsilon = shared_floatx(epsilon, "epsilon")
        self.decay_factor = shared_floatx(decay_factor, "decay_factor")
        hyperparameters = (self.learning_rate, self.beta1, self.beta2,
                           self.epsilon, self.decay_factor)
        for hyperparameter in hyperparameters:
            add_role(hyperparameter, ALGORITHM_HYPERPARAMETER)

    def compute_step(self, parameter, previous_step):
        """Compute a bias-corrected, moment-scaled step for one parameter.

        Maintains per-parameter running estimates of the first and second
        moments of the step, plus a shared iteration counter.
        """
        mean = _create_algorithm_buffer_for(parameter, 'mean')
        variance = _create_algorithm_buffer_for(parameter, 'variance')
        time = shared_floatx(0., 'time')
        add_role(time, ALGORITHM_BUFFER)

        next_time = time + 1
        # Learning rate with the bias corrections for both moment
        # estimates folded in.  NOTE(review): this code follows the
        # convention where beta1/beta2 are *one minus* the paper's decay
        # rates.
        corrected_rate = (self.learning_rate *
                          tensor.sqrt((1. - (1. - self.beta2) ** next_time)) /
                          (1. - (1. - self.beta1) ** next_time))
        # First-moment decay is itself annealed by decay_factor over time.
        decayed_beta1 = 1 - (1 - self.beta1) * self.decay_factor ** (next_time - 1)
        new_mean = decayed_beta1 * previous_step + (1. - decayed_beta1) * mean
        new_variance = (self.beta2 * tensor.sqr(previous_step) +
                        (1. - self.beta2) * variance)
        step = (corrected_rate * new_mean /
                (tensor.sqrt(new_variance) + self.epsilon))

        updates = [(mean, new_mean),
                   (variance, new_variance),
                   (time, next_time)]

        return step, updates
||
895 | |||
896 | |||
class RemoveNotFinite(StepRule):
    """A step rule that skips steps with non-finite elements.

    Replaces a step (the parameter update of a single shared variable)
    which contains non-finite elements (such as ``inf`` or ``NaN``) with a
    step rescaling the parameters.

    Parameters
    ----------
    scaler : float, optional
        The scaling applied to the parameter in case the step contains
        non-finite elements. Defaults to 1, which means that parameters
        will not be changed.

    Notes
    -----
    This rule should be applied last!

    This trick was originally used in the GroundHog_ framework.

    .. _GroundHog: https://github.com/lisa-groundhog/GroundHog

    """
    def __init__(self, scaler=1):
        self.scaler = scaler

    def compute_step(self, parameter, previous_step):
        """Swap a non-finite step for a parameter-shrinking fallback."""
        # Summing first lets a single isnan/isinf test detect a bad
        # element anywhere in the step.
        total = tensor.sum(previous_step)
        has_bad_values = tensor.isnan(total) + tensor.isinf(total)
        # With scaler == 1 the fallback step is zero, i.e. the parameter
        # is simply left unchanged for this update.
        fallback_step = (1 - self.scaler) * parameter
        chosen_step = tensor.switch(
            has_bad_values > 0, fallback_step, previous_step)
        return chosen_step, []
||
930 | |||
931 | |||
class Restrict(StepRule):
    """Applies a given :class:`StepRule` only to certain variables.

    Example applications include clipping steps on only certain parameters,
    or scaling a certain kind of parameter's updates (e.g. adding an
    additional scalar multiplier to the steps taken on convolutional
    filters).

    Parameters
    ----------
    step_rule : :class:`StepRule`
        The :class:`StepRule` to be applied on the given variables.
    variables : iterable
        A collection of Theano variables on which to apply `step_rule`.
        Variables not appearing in this collection will not have
        `step_rule` applied to them.

    """
    def __init__(self, step_rule, variables):
        self.step_rule = step_rule
        self.variables = frozenset(variables)

    def compute_steps(self, previous_steps):
        """Delegate to the wrapped rule for the selected variables only."""
        restricted_steps = dict_subset(previous_steps, self.variables)
        modified_steps, updates = self.step_rule.compute_steps(
            restricted_steps)
        # Preserve the original iteration order: modified steps where the
        # wrapped rule produced one, the untouched step otherwise.
        combined = OrderedDict(
            (parameter, modified_steps.get(parameter,
                                           previous_steps[parameter]))
            for parameter in previous_steps)
        return combined, updates
||
962 |
Cyclic imports may cause partly loaded modules to be returned. This might lead to unexpected runtime behavior which is hard to debug.