"""Some of the simplest individual bricks."""
import logging

from theano import tensor

from blocks.bricks.base import application, Brick, lazy
from blocks.bricks.interfaces import Activation, Feedforward, Initializable
from blocks.bricks.interfaces import LinearLike, Random  # noqa

from blocks.bricks.wrappers import WithExtraDims
from blocks.roles import add_role, WEIGHT, BIAS
from blocks.utils import shared_floatx_nans

logger = logging.getLogger(__name__)


class Linear(LinearLike, Feedforward):
    r"""A linear transformation with optional bias.

    Brick which applies a linear (affine) transformation by multiplying
    the input with a weight matrix. By default, a bias term is added
    (see :class:`Initializable` for information on disabling this).

    Parameters
    ----------
    input_dim : int
        The dimension of the input. Required by :meth:`~.Brick.allocate`.
    output_dim : int
        The dimension of the output. Required by :meth:`~.Brick.allocate`.

    Notes
    -----
    See :class:`Initializable` for initialization parameters.

    A linear transformation with bias is a matrix multiplication followed
    by a vector summation.

    .. math:: f(\mathbf{x}) = \mathbf{W}\mathbf{x} + \mathbf{b}
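
    Examples
    --------
    A minimal usage sketch; the constant initialization below is just an
    illustrative choice:

    >>> from theano import tensor
    >>> from blocks.initialization import Constant
    >>> linear = Linear(input_dim=10, output_dim=5,
    ...                 weights_init=Constant(0.01), biases_init=Constant(0))
    >>> linear.initialize()
    >>> y = linear.apply(tensor.matrix('features'))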

    """
    @lazy(allocation=['input_dim', 'output_dim'])
    def __init__(self, input_dim, output_dim, **kwargs):
        super(Linear, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim

    def _allocate(self):
        W = shared_floatx_nans((self.input_dim, self.output_dim), name='W')
        add_role(W, WEIGHT)
        self.parameters.append(W)
        self.add_auxiliary_variable(W.norm(2), name='W_norm')
        if getattr(self, 'use_bias', True):
            b = shared_floatx_nans((self.output_dim,), name='b')
            add_role(b, BIAS)
            self.parameters.append(b)
            self.add_auxiliary_variable(b.norm(2), name='b_norm')

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Apply the linear transformation.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            The input on which to apply the transformation

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            The transformed input plus optional bias

        """
        output = tensor.dot(input_, self.W)
        if getattr(self, 'use_bias', True):
            output += self.b
        return output

    def get_dim(self, name):
        if name == 'input_':
            return self.input_dim
        if name == 'output':
            return self.output_dim
        super(Linear, self).get_dim(name)


class Bias(Feedforward, Initializable):
    """Add a bias (i.e. sum with a vector)."""
    @lazy(allocation=['dim'])
    def __init__(self, dim, **kwargs):
        super(Bias, self).__init__(**kwargs)
        self.dim = dim

    def _allocate(self):
        b = shared_floatx_nans((self.output_dim,), name='b')
        add_role(b, BIAS)
        self.parameters.append(b)

    def _initialize(self):
        b, = self.parameters
        self.biases_init.initialize(b, self.rng)

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Add the bias to the input.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            The input on which to apply the transformation

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            The input with the bias added

        """
        b, = self.parameters
        return input_ + b

    def get_dim(self, name):
        if name in ['input_', 'output']:
            return self.dim
        super(Bias, self).get_dim(name)

    def _get_dim(self):
        return self.dim

    def _set_dim(self, value):
        self.dim = value

    input_dim = output_dim = property(_get_dim, _set_dim)


class Maxout(Brick):
    """Maxout pooling transformation.

    A brick that does max pooling over groups of input units. If you use
    this code in a research project, please cite [GWFM13]_.

    .. [GWFM13] Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
       Courville, and Yoshua Bengio, *Maxout networks*, ICML (2013), pp.
       1319-1327.

    Parameters
    ----------
    num_pieces : int
        The size of the groups the maximum is taken over.

    Notes
    -----
    Maxout applies a set of linear transformations to a vector and selects
    for each output dimension the result with the highest value.
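
    Examples
    --------
    A small sketch of the grouping (variable names are arbitrary):

    >>> from theano import tensor
    >>> maxout = Maxout(num_pieces=2)
    >>> x = tensor.matrix('pre_activations')
    >>> # an input row [1, 5, 2, 3] is grouped as (1, 5) and (2, 3),
    >>> # so the corresponding output row is [5, 3]
    >>> y = maxout.apply(x)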

    """
    @lazy(allocation=['num_pieces'])
    def __init__(self, num_pieces, **kwargs):
        super(Maxout, self).__init__(**kwargs)
        self.num_pieces = num_pieces

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Apply the maxout transformation.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            The input on which to apply the transformation

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            The transformed input

        """
        last_dim = input_.shape[-1]
        output_dim = last_dim // self.num_pieces
        new_shape = ([input_.shape[i] for i in range(input_.ndim - 1)] +
                     [output_dim, self.num_pieces])
        output = tensor.max(input_.reshape(new_shape, ndim=input_.ndim + 1),
                            axis=input_.ndim)
        return output


class LinearMaxout(Initializable, Feedforward):
    """Maxout pooling following a linear transformation.

    This code combines the :class:`Linear` brick with a :class:`Maxout`
    brick.

    Parameters
    ----------
    input_dim : int
        The dimension of the input. Required by :meth:`~.Brick.allocate`.
    output_dim : int
        The dimension of the output. Required by :meth:`~.Brick.allocate`.
    num_pieces : int
        The number of linear functions. Required by
        :meth:`~.Brick.allocate`.

    Notes
    -----
    See :class:`Initializable` for initialization parameters.
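
    Examples
    --------
    A minimal sketch; the initialization scheme below is just an
    illustrative choice. With ``output_dim=10`` and ``num_pieces=3`` the
    child :class:`Linear` brick produces ``10 * 3 = 30`` pre-activations,
    over which :class:`Maxout` then takes maxima in groups of 3:

    >>> from theano import tensor
    >>> from blocks.initialization import IsotropicGaussian, Constant
    >>> brick = LinearMaxout(input_dim=20, output_dim=10, num_pieces=3,
    ...                      weights_init=IsotropicGaussian(0.01),
    ...                      biases_init=Constant(0))
    >>> brick.initialize()
    >>> y = brick.apply(tensor.matrix('features'))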

    """
    @lazy(allocation=['input_dim', 'output_dim', 'num_pieces'])
    def __init__(self, input_dim, output_dim, num_pieces, **kwargs):
        self.linear = Linear()
        self.maxout = Maxout()
        children = [self.linear, self.maxout]
        kwargs.setdefault('children', []).extend(children)
        super(LinearMaxout, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_pieces = num_pieces

    @property
    def input_dim(self):
        return self.linear.input_dim

    @input_dim.setter
    def input_dim(self, value):
        self.linear.input_dim = value

    def _push_allocation_config(self):
        self.linear.output_dim = self.output_dim * self.num_pieces
        self.maxout.num_pieces = self.num_pieces

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Apply the linear transformation followed by maxout.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            The input on which to apply the transformations

        Returns
        -------
        output : :class:`~tensor.TensorVariable`
            The transformed input

        """
        pre_activation = self.linear.apply(input_)
        output = self.maxout.apply(pre_activation)
        return output


class Identity(Activation):
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return input_


class Tanh(Activation):
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return tensor.tanh(input_)


class Logistic(Activation):
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return tensor.nnet.sigmoid(input_)


class Softplus(Activation):
    r"""Softplus brick.

    The softplus is defined as :math:`\zeta(x) = \log(1+e^x)`.

    .. Dugas, C., Bengio, Y., Belisle, F., Nadeau, C., and Garcia,
       R. (2001). Incorporating second-order functional knowledge
       for better option pricing. In NIPS 13. MIT Press.

    """
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return tensor.nnet.softplus(input_)


class Rectifier(Activation):
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return tensor.nnet.relu(input_)


class LeakyRectifier(Activation):
    r"""Leaky ReLU.

    Like Rectifier, but negative inputs are scaled by a small constant.

    .. math:: f(x) = \text{max}(x, ax)

    Parameters
    ----------
    leak : float, optional
        The scalar to multiply negative values by. Named 'a' above.

    .. Maas, Andrew L., Awni Y. Hannun, and Andrew Y. Ng. Rectifier
       nonlinearities improve neural network acoustic models. Proc.
       ICML. Vol. 30. 2013.
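
    Examples
    --------
    A small sketch (the leak value is arbitrary):

    >>> from theano import tensor
    >>> x = tensor.matrix('x')
    >>> # negative entries of x are scaled by 0.1, positive ones pass through
    >>> y = LeakyRectifier(leak=0.1).apply(x)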

    """
    def __init__(self, leak=0.01, **kwargs):
        super(LeakyRectifier, self).__init__(**kwargs)
        self._leak = leak

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        return tensor.nnet.relu(input_, alpha=self._leak)


class Softmax(Brick):
    """A softmax brick.

    Works with 2-dimensional inputs only. If you need more,
    see :class:`NDimensionalSoftmax`.

    """
    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_):
        """Standard softmax.

        Parameters
        ----------
        input_ : :class:`~theano.Variable`
            A matrix, each row contains unnormalized log-probabilities of a
            distribution.

        Returns
        -------
        output : :class:`~theano.Variable`
            A matrix with probabilities in each row for each distribution
            from `input_`.

        """
        return tensor.nnet.softmax(input_)

    @application(inputs=['input_'], outputs=['output'])
    def log_probabilities(self, input_):
        r"""Normalize log-probabilities.

        Converts unnormalized log-probabilities (exponents of which do not
        sum to one) into actual log-probabilities (exponents of which sum
        to one).
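
        For a single row :math:`\mathbf{x}` of `input_` the returned row is

        .. math:: \log p_i = x_i - \log \sum_j e^{x_j},

        computed in a numerically stable way by first subtracting
        :math:`\max_j x_j` from the row.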

        Parameters
        ----------
        input_ : :class:`~theano.Variable`
            A matrix, each row contains unnormalized log-probabilities of a
            distribution.

        Returns
        -------
        output : :class:`~theano.Variable`
            A matrix with normalized log-probabilities in each row for each
            distribution from `input_`.

        """
        shifted = input_ - input_.max(axis=1, keepdims=True)
        return shifted - tensor.log(
            tensor.exp(shifted).sum(axis=1, keepdims=True))

    @application(inputs=['y', 'x'], outputs=['output'])
    def categorical_cross_entropy(self, application_call, y, x):
        """Computationally stable cross-entropy for pre-softmax values.

        Parameters
        ----------
        y : :class:`~tensor.TensorVariable`
            In the case of a matrix argument, each row represents a
            probability distribution. In the vector case, each element
            represents a distribution by specifying the position of 1 in a
            1-hot vector.
        x : :class:`~tensor.TensorVariable`
            A matrix, each row contains unnormalized log-probabilities of a
            distribution.

        Returns
        -------
        cost : :class:`~tensor.TensorVariable`
            A vector of cross-entropies between respective distributions
            from y and x.
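
        Examples
        --------
        A sketch of the two supported call patterns (variable names are
        arbitrary):

        >>> from theano import tensor
        >>> softmax = Softmax()
        >>> scores = tensor.matrix('scores')      # pre-softmax values
        >>> targets = tensor.lvector('targets')   # class indices
        >>> cost = softmax.categorical_cross_entropy(targets, scores)
        >>> one_hot = tensor.matrix('one_hot_targets')
        >>> cost = softmax.categorical_cross_entropy(one_hot, scores)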

        """
        x = self.log_probabilities(x)
        application_call.add_auxiliary_variable(
            x.copy(name='log_probabilities'))
        if y.ndim == x.ndim - 1:
            indices = tensor.arange(y.shape[0]) * x.shape[1] + y
            cost = -x.flatten()[indices]
        elif y.ndim == x.ndim:
            cost = -(x * y).sum(axis=1)
        else:
            raise TypeError('rank mismatch between x and y')
        return cost


class NDimensionalSoftmax(Softmax):
    decorators = [WithExtraDims()]