# -*- coding: utf-8 -*-
import numpy
from theano import tensor

from ..base import application, lazy
from ..simple import Initializable, Logistic, Tanh
from ...roles import add_role, WEIGHT, INITIAL_STATE
from ...utils import shared_floatx_nans, shared_floatx_zeros
from .base import BaseRecurrent, recurrent


class SimpleRecurrent(BaseRecurrent, Initializable):
    """The traditional recurrent transition.

    The most well-known recurrent transition: a matrix multiplication,
    optionally followed by a non-linearity.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`~.bricks.Brick`
        The brick to apply as activation.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

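    Examples
    --------
    A minimal usage sketch; the dimension, activation and initializer
    below are illustrative only:

    >>> from theano import tensor
    >>> from blocks.bricks import Tanh
    >>> from blocks.initialization import IsotropicGaussian
    >>> x = tensor.tensor3('x')  # (time, batch, dim)
    >>> rnn = SimpleRecurrent(dim=3, activation=Tanh(),
    ...                       weights_init=IsotropicGaussian(0.01))
    >>> rnn.initialize()
    >>> states = rnn.apply(inputs=x)  # iterates over the time axis
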
29 | """ |
||
30 | @lazy(allocation=['dim']) |
||
31 | def __init__(self, dim, activation, **kwargs): |
||
32 | self.dim = dim |
||
33 | children = [activation] |
||
34 | kwargs.setdefault('children', []).extend(children) |
||
35 | super(SimpleRecurrent, self).__init__(**kwargs) |
||
36 | |||
37 | @property |
||
38 | def W(self): |
||
39 | return self.parameters[0] |
||
40 | |||
41 | def get_dim(self, name): |
||
42 | if name == 'mask': |
||
43 | return 0 |
||
44 | if name in (SimpleRecurrent.apply.sequences + |
||
45 | SimpleRecurrent.apply.states): |
||
46 | return self.dim |
||
47 | return super(SimpleRecurrent, self).get_dim(name) |
||
48 | |||
49 | def _allocate(self): |
||
50 | self.parameters.append(shared_floatx_nans((self.dim, self.dim), |
||
51 | name="W")) |
||
52 | add_role(self.parameters[0], WEIGHT) |
||
53 | self.parameters.append(shared_floatx_zeros((self.dim,), |
||
54 | name="initial_state")) |
||
55 | add_role(self.parameters[1], INITIAL_STATE) |
||
56 | |||
57 | def _initialize(self): |
||
58 | self.weights_init.initialize(self.W, self.rng) |
||
59 | |||
    @recurrent(sequences=['inputs', 'mask'], states=['states'],
               outputs=['states'], contexts=[])
    def apply(self, inputs, states, mask=None):
        """Apply the simple transition.

        Parameters
        ----------
        inputs : :class:`~tensor.TensorVariable`
            The 2D inputs, in the shape (batch, features).
        states : :class:`~tensor.TensorVariable`
            The 2D states, in the shape (batch, features).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if
            there is data available and 0 if not. If not given, the
            mask is assumed to be all ones.

        """
        next_states = inputs + tensor.dot(states, self.W)
        next_states = self.children[0].apply(next_states)
        if mask is not None:
            # Carry over the previous states where the mask is 0.
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return tensor.repeat(self.parameters[1][None, :], batch_size, 0)


class LSTM(BaseRecurrent, Initializable):
    u"""Long Short-Term Memory.

    Every unit of an LSTM is equipped with input, forget and output gates.
    This implementation is based on code by Mohammad Pezeshki that
    implements the architecture used in [GSS03]_ and [Grav13]_. It aims to
    do as many computations in parallel as possible and expects the last
    dimension of the input to be four times the output dimension.

    Unlike a vanilla LSTM as described in [HS97]_, this model has peephole
    connections from the cells to the gates. The output gates receive
    information about the cells at the current time step, while the other
    gates only receive information about the cells at the previous time
    step. All 'peephole' weight matrices are diagonal.

    .. [GSS03] Gers, Felix A., Nicol N. Schraudolph, and Jürgen
        Schmidhuber, *Learning precise timing with LSTM recurrent
        networks*, Journal of Machine Learning Research 3 (2003),
        pp. 115-143.
    .. [Grav13] Graves, Alex, *Generating sequences with recurrent neural
        networks*, arXiv preprint arXiv:1308.0850 (2013).
    .. [HS97] Sepp Hochreiter, and Jürgen Schmidhuber, *Long Short-Term
        Memory*, Neural Computation 9(8) (1997), pp. 1735-1780.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`~.bricks.Brick` or None
        The activation applied to the candidate cells and the states.
        If ``None`` a :class:`.Tanh` brick is used.
    gate_activation : :class:`~.bricks.Brick` or None
        The brick to apply as activation for gates (input/output/forget).
        If ``None`` a :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

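    Examples
    --------
    A minimal usage sketch, assuming the inputs come from a
    :class:`.Linear` brick whose ``output_dim`` is four times the LSTM
    dimension. All names and numbers below are illustrative only:

    >>> from theano import tensor
    >>> from blocks.bricks import Linear
    >>> from blocks.initialization import IsotropicGaussian, Constant
    >>> x = tensor.tensor3('x')  # (time, batch, 6)
    >>> x_to_lstm = Linear(input_dim=6, output_dim=4 * 3,
    ...                    weights_init=IsotropicGaussian(0.01),
    ...                    biases_init=Constant(0))
    >>> lstm = LSTM(dim=3, weights_init=IsotropicGaussian(0.01))
    >>> x_to_lstm.initialize()
    >>> lstm.initialize()
    >>> states, cells = lstm.apply(inputs=x_to_lstm.apply(x))
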
128 | """ |
||
129 | @lazy(allocation=['dim']) |
||
130 | def __init__(self, dim, activation=None, gate_activation=None, **kwargs): |
||
131 | self.dim = dim |
||
132 | |||
133 | if not activation: |
||
134 | activation = Tanh() |
||
135 | if not gate_activation: |
||
136 | gate_activation = Logistic() |
||
137 | self.activation = activation |
||
138 | self.gate_activation = gate_activation |
||
139 | |||
140 | children = [self.activation, self.gate_activation] |
||
141 | kwargs.setdefault('children', []).extend(children) |
||
142 | super(LSTM, self).__init__(**kwargs) |
||
143 | |||
144 | def get_dim(self, name): |
||
145 | if name == 'inputs': |
||
146 | return self.dim * 4 |
||
147 | if name in ['states', 'cells']: |
||
148 | return self.dim |
||
149 | if name == 'mask': |
||
150 | return 0 |
||
151 | return super(LSTM, self).get_dim(name) |
||
152 | |||
153 | def _allocate(self): |
||
154 | self.W_state = shared_floatx_nans((self.dim, 4*self.dim), |
                                          name='W_state')
        self.W_cell_to_in = shared_floatx_nans((self.dim,),
                                                name='W_cell_to_in')
        self.W_cell_to_forget = shared_floatx_nans((self.dim,),
                                                    name='W_cell_to_forget')
        self.W_cell_to_out = shared_floatx_nans((self.dim,),
                                                 name='W_cell_to_out')
        # The underscore is required to prevent collision with
        # the `initial_state` application method
        self.initial_state_ = shared_floatx_zeros((self.dim,),
                                                   name="initial_state")
        self.initial_cells = shared_floatx_zeros((self.dim,),
                                                 name="initial_cells")
        add_role(self.W_state, WEIGHT)
        add_role(self.W_cell_to_in, WEIGHT)
        add_role(self.W_cell_to_forget, WEIGHT)
        add_role(self.W_cell_to_out, WEIGHT)
        add_role(self.initial_state_, INITIAL_STATE)
        add_role(self.initial_cells, INITIAL_STATE)

        self.parameters = [
            self.W_state, self.W_cell_to_in, self.W_cell_to_forget,
            self.W_cell_to_out, self.initial_state_, self.initial_cells]

    def _initialize(self):
        for weights in self.parameters[:4]:
            self.weights_init.initialize(weights, self.rng)

    @recurrent(sequences=['inputs', 'mask'], states=['states', 'cells'],
               contexts=[], outputs=['states', 'cells'])
    def apply(self, inputs, states, cells, mask=None):
        """Apply the Long Short-Term Memory transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, features). Required for `one_step` usage.
        cells : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current cells in the shape
            (batch_size, features). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            features * 4). The `inputs` must be four times the dimension
            of the LSTM brick to ensure that each of the four gates
            receives a different transformation of the input. See
            [Grav13]_ equations 7 to 10 for more details. The `inputs`
            are split in this order: input gates, forget gates, cells
            and output gates.
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there
            is data available and 0 if not. If not given, the mask is
            assumed to be all ones.

        .. [Grav13] Graves, Alex, *Generating sequences with recurrent
            neural networks*, arXiv preprint arXiv:1308.0850 (2013).

        Returns
        -------
        states : :class:`~tensor.TensorVariable`
            Next states of the network.
        cells : :class:`~tensor.TensorVariable`
            Next cell activations of the network.

        """
        def slice_last(x, no):
            return x[:, no*self.dim: (no+1)*self.dim]

        activation = tensor.dot(states, self.W_state) + inputs
        in_gate = self.gate_activation.apply(
            slice_last(activation, 0) + cells * self.W_cell_to_in)
        forget_gate = self.gate_activation.apply(
            slice_last(activation, 1) + cells * self.W_cell_to_forget)
        next_cells = (
            forget_gate * cells +
            in_gate * self.activation.apply(slice_last(activation, 2)))
        out_gate = self.gate_activation.apply(
            slice_last(activation, 3) + next_cells * self.W_cell_to_out)
        next_states = out_gate * self.activation.apply(next_cells)

        if mask is not None:
            # Carry over the previous states and cells where the mask
            # is 0.
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
            next_cells = (mask[:, None] * next_cells +
                          (1 - mask[:, None]) * cells)

        return next_states, next_cells

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [tensor.repeat(self.initial_state_[None, :], batch_size, 0),
                tensor.repeat(self.initial_cells[None, :], batch_size, 0)]


class GatedRecurrent(BaseRecurrent, Initializable):
    u"""Gated recurrent neural network.

    Gated recurrent neural network (GRNN) as introduced in [CvMG14]_. Every
    unit of a GRNN is equipped with update and reset gates that facilitate
    better gradient propagation.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`~.bricks.Brick` or None
        The brick to apply as activation. If ``None`` a
        :class:`.Tanh` brick is used.
    gate_activation : :class:`~.bricks.Brick` or None
        The brick to apply as activation for gates. If ``None`` a
        :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
        Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
        Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
        for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

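    Examples
    --------
    A minimal usage sketch, assuming separate :class:`.Linear` bricks
    produce the inputs and the gate inputs. All names and numbers below
    are illustrative only:

    >>> from theano import tensor
    >>> from blocks.bricks import Linear
    >>> from blocks.initialization import IsotropicGaussian, Constant
    >>> x = tensor.tensor3('x')  # (time, batch, 6)
    >>> x_to_inputs = Linear(input_dim=6, output_dim=3,
    ...                      weights_init=IsotropicGaussian(0.01),
    ...                      biases_init=Constant(0))
    >>> x_to_gates = Linear(input_dim=6, output_dim=2 * 3,
    ...                     weights_init=IsotropicGaussian(0.01),
    ...                     biases_init=Constant(0))
    >>> gru = GatedRecurrent(dim=3, weights_init=IsotropicGaussian(0.01))
    >>> for brick in (x_to_inputs, x_to_gates, gru):
    ...     brick.initialize()
    >>> states = gru.apply(inputs=x_to_inputs.apply(x),
    ...                    gate_inputs=x_to_gates.apply(x))
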
275 | """ |
||
276 | @lazy(allocation=['dim']) |
||
277 | def __init__(self, dim, activation=None, gate_activation=None, |
||
278 | **kwargs): |
||
279 | self.dim = dim |
||
280 | |||
281 | if not activation: |
||
282 | activation = Tanh() |
||
283 | if not gate_activation: |
||
284 | gate_activation = Logistic() |
||
285 | self.activation = activation |
||
286 | self.gate_activation = gate_activation |
||
287 | |||
288 | children = [activation, gate_activation] |
||
289 | kwargs.setdefault('children', []).extend(children) |
||
290 | super(GatedRecurrent, self).__init__(**kwargs) |
||
291 | |||
292 | @property |
||
293 | def state_to_state(self): |
||
294 | return self.parameters[0] |
||
295 | |||
296 | @property |
||
297 | def state_to_gates(self): |
||
298 | return self.parameters[1] |
||
299 | |||
300 | def get_dim(self, name): |
||
301 | if name == 'mask': |
||
302 | return 0 |
||
303 | if name in ['inputs', 'states']: |
||
304 | return self.dim |
||
305 | if name == 'gate_inputs': |
||
306 | return 2 * self.dim |
||
307 | return super(GatedRecurrent, self).get_dim(name) |
||
308 | |||
309 | def _allocate(self): |
||
310 | self.parameters.append(shared_floatx_nans((self.dim, self.dim), |
||
311 | name='state_to_state')) |
||
312 | self.parameters.append(shared_floatx_nans((self.dim, 2 * self.dim), |
||
313 | name='state_to_gates')) |
||
314 | self.parameters.append(shared_floatx_zeros((self.dim,), |
||
315 | name="initial_state")) |
||
316 | for i in range(2): |
||
317 | if self.parameters[i]: |
||
318 | add_role(self.parameters[i], WEIGHT) |
||
319 | add_role(self.parameters[2], INITIAL_STATE) |
||
320 | |||
321 | def _initialize(self): |
||
322 | self.weights_init.initialize(self.state_to_state, self.rng) |
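        # The update and reset gate weights are generated as two
        # separate (dim, dim) matrices and then concatenated, so each
        # gate is initialized as a standalone weight matrix would be.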
        state_to_update = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))

    @recurrent(sequences=['mask', 'inputs', 'gate_inputs'],
               states=['states'], outputs=['states'], contexts=[])
    def apply(self, inputs, gate_inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            dim).
        gate_inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs to the gates in the
            shape (batch_size, 2 * dim).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there
            is data available and 0 if not. If not given, the mask is
            assumed to be all ones.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.

        """
        gate_values = self.gate_activation.apply(
            states.dot(self.state_to_gates) + gate_inputs)
        update_values = gate_values[:, :self.dim]
        reset_values = gate_values[:, self.dim:]
        states_reset = states * reset_values
        next_states = self.activation.apply(
            states_reset.dot(self.state_to_state) + inputs)
        next_states = (next_states * update_values +
                       states * (1 - update_values))
        if mask is not None:
            # Carry over the previous states where the mask is 0.
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [tensor.repeat(self.parameters[2][None, :], batch_size, 0)]