Issues (119)

blocks/bricks/recurrent/architectures.py (4 issues)

# -*- coding: utf-8 -*-
import numpy
from theano import tensor

from ..base import application, lazy
from ..simple import Initializable, Logistic, Tanh
from ...roles import add_role, WEIGHT, INITIAL_STATE
from ...utils import shared_floatx_nans, shared_floatx_zeros
from .base import BaseRecurrent, recurrent


class SimpleRecurrent(BaseRecurrent, Initializable):
    """The traditional recurrent transition.

    The most well-known recurrent transition: a matrix multiplication,
    optionally followed by a non-linearity.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state
    activation : :class:`~.bricks.Brick`
        The brick to apply as activation.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation, **kwargs):
        self.dim = dim
        children = [activation]
        kwargs.setdefault('children', []).extend(children)
        super(SimpleRecurrent, self).__init__(**kwargs)

    @property
    def W(self):
        return self.parameters[0]

    def get_dim(self, name):
        if name == 'mask':
            return 0
        if name in (SimpleRecurrent.apply.sequences +
                    SimpleRecurrent.apply.states):
            return self.dim
        return super(SimpleRecurrent, self).get_dim(name)

    def _allocate(self):
        self.parameters.append(shared_floatx_nans((self.dim, self.dim),
                                                  name="W"))
        add_role(self.parameters[0], WEIGHT)
        self.parameters.append(shared_floatx_zeros((self.dim,),
                                                   name="initial_state"))
        add_role(self.parameters[1], INITIAL_STATE)

    def _initialize(self):
        self.weights_init.initialize(self.W, self.rng)

    @recurrent(sequences=['inputs', 'mask'], states=['states'],
               outputs=['states'], contexts=[])
    def apply(self, inputs, states, mask=None):
        """Apply the simple transition.

        Parameters
        ----------
        inputs : :class:`~tensor.TensorVariable`
            The 2D inputs, in the shape (batch, features).
        states : :class:`~tensor.TensorVariable`
            The 2D states, in the shape (batch, features).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if
            there is data available, 0 if not. Assumed to be 1-s
            only if not given.

        """
        next_states = inputs + tensor.dot(states, self.W)
        next_states = self.children[0].apply(next_states)
        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return tensor.repeat(self.parameters[1][None, :], batch_size, 0)

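Because `apply` is wrapped by `@recurrent`, calling it on a 3D sequence of shape (time, batch, dim) scans the transition over the time axis. The following is a minimal usage sketch added for illustration; it is not part of the file under review and assumes a standard Blocks install where the brick is importable as `blocks.bricks.recurrent.SimpleRecurrent` and `blocks.initialization.IsotropicGaussian` is available.

import numpy
from theano import function, tensor

from blocks.bricks import Tanh
from blocks.bricks.recurrent import SimpleRecurrent
from blocks.initialization import IsotropicGaussian

# weights_init is required because the brick is Initializable.
rnn = SimpleRecurrent(dim=3, activation=Tanh(),
                      weights_init=IsotropicGaussian(0.01))
rnn.initialize()

x = tensor.tensor3('x')          # (time, batch, 3)
h = rnn.apply(x)                 # scanned over time: (time, batch, 3)

f = function([x], h)
print(f(numpy.ones((5, 2, 3), dtype=x.dtype)).shape)  # (5, 2, 3)
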
class LSTM(BaseRecurrent, Initializable):
    u"""Long Short Term Memory.

    Every unit of an LSTM is equipped with input, forget and output gates.
    This implementation is based on code by Mohammad Pezeshki that
    implements the architecture used in [GSS03]_ and [Grav13]_. It aims to
    do as many computations in parallel as possible and expects the last
    dimension of the input to be four times the output dimension.

    Unlike a vanilla LSTM as described in [HS97]_, this model has peephole
    connections from the cells to the gates. The output gates receive
    information about the cells at the current time step, while the other
    gates only receive information about the cells at the previous time
    step. All 'peephole' weight matrices are diagonal.

    .. [GSS03] Gers, Felix A., Nicol N. Schraudolph, and Jürgen
        Schmidhuber, *Learning precise timing with LSTM recurrent
        networks*, Journal of Machine Learning Research 3 (2003),
        pp. 115-143.
    .. [Grav13] Graves, Alex, *Generating sequences with recurrent neural
        networks*, arXiv preprint arXiv:1308.0850 (2013).
    .. [HS97] Sepp Hochreiter, and Jürgen Schmidhuber, *Long Short-Term
        Memory*, Neural Computation 9(8) (1997), pp. 1735-1780.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`~.bricks.Brick`, optional
        The activation function. The default and by far the most popular
        is :class:`.Tanh`.
    gate_activation : :class:`~.bricks.Brick` or None
        The brick to apply as activation for gates (input/output/forget).
        If ``None`` a :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [self.activation, self.gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(LSTM, self).__init__(**kwargs)

    def get_dim(self, name):
        if name == 'inputs':
            return self.dim * 4
        if name in ['states', 'cells']:
            return self.dim
        if name == 'mask':
            return 0
        return super(LSTM, self).get_dim(name)

Coding Style Naming (the file's 4 flagged issues): the attribute names W_state, W_cell_to_in, W_cell_to_forget and W_cell_to_out introduced in `_allocate` below do not conform to the attribute naming conventions ((([a-z_][a-z0-9_]{0,30})|(_?[A-Z]))$).

    def _allocate(self):
        self.W_state = shared_floatx_nans((self.dim, 4*self.dim),
                                          name='W_state')
        self.W_cell_to_in = shared_floatx_nans((self.dim,),
                                               name='W_cell_to_in')
        self.W_cell_to_forget = shared_floatx_nans((self.dim,),
                                                   name='W_cell_to_forget')
        self.W_cell_to_out = shared_floatx_nans((self.dim,),
                                                name='W_cell_to_out')
        # The underscore is required to prevent collision with
        # the `initial_state` application method
        self.initial_state_ = shared_floatx_zeros((self.dim,),
                                                  name="initial_state")
        self.initial_cells = shared_floatx_zeros((self.dim,),
                                                 name="initial_cells")
        add_role(self.W_state, WEIGHT)
        add_role(self.W_cell_to_in, WEIGHT)
        add_role(self.W_cell_to_forget, WEIGHT)
        add_role(self.W_cell_to_out, WEIGHT)
        add_role(self.initial_state_, INITIAL_STATE)
        add_role(self.initial_cells, INITIAL_STATE)

        self.parameters = [
            self.W_state, self.W_cell_to_in, self.W_cell_to_forget,
            self.W_cell_to_out, self.initial_state_, self.initial_cells]

    def _initialize(self):
        for weights in self.parameters[:4]:
            self.weights_init.initialize(weights, self.rng)

    @recurrent(sequences=['inputs', 'mask'], states=['states', 'cells'],
               contexts=[], outputs=['states', 'cells'])
    def apply(self, inputs, states, cells, mask=None):
        """Apply the Long Short Term Memory transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, features). Required for `one_step` usage.
        cells : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current cells in the shape
            (batch_size, features). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            features * 4). The `inputs` needs to be four times the
            dimension of the LSTM brick to ensure that each of the four
            gates receives a different transformation of the input. See
            [Grav13]_ equations 7 to 10 for more details. The `inputs`
            are then split in this order: input gates, forget gates,
            cells and output gates.
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there is
            data available, 0 if not. Assumed to be 1-s only if not given.

        .. [Grav13] Graves, Alex, *Generating sequences with recurrent*
            *neural networks*, arXiv preprint arXiv:1308.0850 (2013).

        Returns
        -------
        states : :class:`~tensor.TensorVariable`
            Next states of the network.
        cells : :class:`~tensor.TensorVariable`
            Next cell activations of the network.

        """
        def slice_last(x, no):
            return x[:, no*self.dim: (no+1)*self.dim]

        activation = tensor.dot(states, self.W_state) + inputs
        in_gate = self.gate_activation.apply(
            slice_last(activation, 0) + cells * self.W_cell_to_in)
        forget_gate = self.gate_activation.apply(
            slice_last(activation, 1) + cells * self.W_cell_to_forget)
        next_cells = (
            forget_gate * cells +
            in_gate * self.activation.apply(slice_last(activation, 2)))
        out_gate = self.gate_activation.apply(
            slice_last(activation, 3) + next_cells * self.W_cell_to_out)
        next_states = out_gate * self.activation.apply(next_cells)

        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
            next_cells = (mask[:, None] * next_cells +
                          (1 - mask[:, None]) * cells)

        return next_states, next_cells

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [tensor.repeat(self.initial_state_[None, :], batch_size, 0),
                tensor.repeat(self.initial_cells[None, :], batch_size, 0)]

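As the `apply` docstring above notes, the LSTM expects its `inputs` to already be the concatenation of four transforms of width `dim` (input gate, forget gate, cell candidate and output gate), and it returns both states and cells. The sketch below is a reviewer illustration, not part of the file; it assumes a standard Blocks install where `Linear` lives in `blocks.bricks` and the usual initializers in `blocks.initialization`.

from theano import tensor

from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

dim = 3
# A Linear brick produces the 4 * dim inputs the LSTM transition expects.
x_to_h = Linear(input_dim=5, output_dim=4 * dim,
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
lstm = LSTM(dim=dim, activation=Tanh(),
            weights_init=IsotropicGaussian(0.01))
x_to_h.initialize()
lstm.initialize()

x = tensor.tensor3('x')                 # (time, batch, 5)
h, c = lstm.apply(x_to_h.apply(x))      # states and cells: (time, batch, dim)
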
class GatedRecurrent(BaseRecurrent, Initializable):
    u"""Gated recurrent neural network.

    Gated recurrent neural network (GRNN) as introduced in [CvMG14]_. Every
    unit of a GRNN is equipped with update and reset gates that facilitate
    better gradient propagation.

    Parameters
    ----------
    dim : int
        The dimension of the hidden state.
    activation : :class:`~.bricks.Brick` or None
        The brick to apply as activation. If ``None`` a
        :class:`.Tanh` brick is used.
    gate_activation : :class:`~.bricks.Brick` or None
        The brick to apply as activation for gates. If ``None`` a
        :class:`.Logistic` brick is used.

    Notes
    -----
    See :class:`.Initializable` for initialization parameters.

    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
       Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
       Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
       for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=['dim'])
    def __init__(self, dim, activation=None, gate_activation=None,
                 **kwargs):
        self.dim = dim

        if not activation:
            activation = Tanh()
        if not gate_activation:
            gate_activation = Logistic()
        self.activation = activation
        self.gate_activation = gate_activation

        children = [activation, gate_activation]
        kwargs.setdefault('children', []).extend(children)
        super(GatedRecurrent, self).__init__(**kwargs)

    @property
    def state_to_state(self):
        return self.parameters[0]

    @property
    def state_to_gates(self):
        return self.parameters[1]

    def get_dim(self, name):
        if name == 'mask':
            return 0
        if name in ['inputs', 'states']:
            return self.dim
        if name == 'gate_inputs':
            return 2 * self.dim
        return super(GatedRecurrent, self).get_dim(name)

    def _allocate(self):
        self.parameters.append(shared_floatx_nans((self.dim, self.dim),
                                                  name='state_to_state'))
        self.parameters.append(shared_floatx_nans((self.dim, 2 * self.dim),
                                                  name='state_to_gates'))
        self.parameters.append(shared_floatx_zeros((self.dim,),
                                                   name="initial_state"))
        for i in range(2):
            if self.parameters[i]:
                add_role(self.parameters[i], WEIGHT)
        add_role(self.parameters[2], INITIAL_STATE)

    def _initialize(self):
        self.weights_init.initialize(self.state_to_state, self.rng)
        state_to_update = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        state_to_reset = self.weights_init.generate(
            self.rng, (self.dim, self.dim))
        self.state_to_gates.set_value(
            numpy.hstack([state_to_update, state_to_reset]))

    @recurrent(sequences=['mask', 'inputs', 'gate_inputs'],
               states=['states'], outputs=['states'], contexts=[])
    def apply(self, inputs, gate_inputs, states, mask=None):
        """Apply the gated recurrent transition.

        Parameters
        ----------
        states : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of current states in the shape
            (batch_size, dim). Required for `one_step` usage.
        inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs in the shape (batch_size,
            dim)
        gate_inputs : :class:`~tensor.TensorVariable`
            The 2 dimensional matrix of inputs to the gates in the
            shape (batch_size, 2 * dim).
        mask : :class:`~tensor.TensorVariable`
            A 1D binary array in the shape (batch,) which is 1 if there is
            data available, 0 if not. Assumed to be 1-s only if not given.

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            Next states of the network.

        """
        gate_values = self.gate_activation.apply(
            states.dot(self.state_to_gates) + gate_inputs)
        update_values = gate_values[:, :self.dim]
        reset_values = gate_values[:, self.dim:]
        states_reset = states * reset_values
        next_states = self.activation.apply(
            states_reset.dot(self.state_to_state) + inputs)
        next_states = (next_states * update_values +
                       states * (1 - update_values))
        if mask:
            next_states = (mask[:, None] * next_states +
                           (1 - mask[:, None]) * states)
        return next_states

    @application(outputs=apply.states)
    def initial_states(self, batch_size, *args, **kwargs):
        return [tensor.repeat(self.parameters[2][None, :], batch_size, 0)]
373