Completed
Pull Request — master (#941) by David
15:42 queued 04:18 created

blocks.bricks.BatchNormalization._allocate()   C

Complexity
    Conditions: 7
Size
    Total Lines: 34
Duplication
    Lines: 0
    Ratio: 0 %

Metric  Value
cc      7
dl      0
loc     34
rs      5.5
import collections

import numpy
from picklable_itertools.extras import equizip
import theano
from theano import tensor
from theano.tensor.nnet import bn

from ..graph import add_annotation
from ..initialization import Constant
from ..roles import (WEIGHT, BIAS, BATCH_NORM_POPULATION_MEAN,
                     BATCH_NORM_POPULATION_STDEV, BATCH_NORM_OFFSET,
                     BATCH_NORM_DIVISOR, BATCH_NORM_MINIBATCH_ESTIMATE,
                     add_role)
from ..utils import (shared_floatx_zeros, shared_floatx,
                     shared_floatx_nans)
from .base import lazy, application
from .sequences import Sequence, Feedforward, MLP
from .interfaces import RNGMixin


def _add_batch_axis(var):
    """Prepend a singleton axis to a TensorVariable and name it."""
    new_var = tensor.shape_padleft(var)
    new_var.name = 'shape_padleft({})'.format(var.name)
    return new_var
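

# A minimal illustrative sketch of `_add_batch_axis` (variable names are
# hypothetical):
#
#   >>> x = tensor.vector('x')      # a single per-example variable
#   >>> y = _add_batch_axis(x)      # now (1, dim); leading axis broadcastable
#   >>> y.broadcastable
#   (True, False)
#   >>> y.name
#   'shape_padleft(x)'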


def _add_role_and_annotate(var, role, annotations=()):
    """Add a role and zero or more annotations to a variable."""
    add_role(var, role)
    for annotation in annotations:
        add_annotation(var, annotation)


class BatchNormalization(RNGMixin, Feedforward):
    r"""Normalizes activations and parameterizes a learned scale and shift.

    Parameters
    ----------
    input_dim : int or tuple
        Shape of a single input example. It is assumed that a batch axis
        will be prepended to this.
    broadcastable : tuple, optional
        Tuple of the same length as `input_dim` specifying which of the
        per-example axes should be averaged over to compute means and
        standard deviations. For example, in order to normalize over all
        spatial locations in a `(batch_index, channels, height, width)`
        image, pass `(False, True, True)`.
    conserve_memory : bool, optional
        Use an implementation that stores less intermediate state and
        therefore uses less memory, at the expense of 5-10% speed. Default
        is `True`.
    epsilon : float, optional
        The stabilizing constant for the minibatch standard deviation
        computation (when the brick is run in training mode).
        Added to the variance inside the square root, as in the
        batch normalization paper.
    weights_init : object, optional
        Initialization object to use for the learned scaling parameter
        ($\gamma$ in [BN]_). By default, uses constant initialization
        of 1.
    biases_init : object, optional
        Initialization object to use for the learned shift parameter
        ($\beta$ in [BN]_). By default, uses constant initialization of 0.

    Notes
    -----
    In order for trained models to behave sensibly immediately upon
    deserialization, this brick runs in *inference* mode by default,
    using a population mean and population standard deviation (initialized
    to zeros and ones respectively) to normalize activations. It is
    expected that the user will adapt these during training in some
    fashion, independently of the training objective, e.g. by taking a
    moving average of minibatch-wise statistics.

    In order to *train* with batch normalization, one must obtain a
    training graph by transforming the original inference graph. See
    :func:`~blocks.graph.apply_batch_normalization` for a routine to
    transform graphs, and :func:`~blocks.graph.batch_normalization`
    for a context manager that may enable shorter compile times.
    Every instance of :class:`BatchNormalization` is itself a context
    manager, entry into which causes its applications to run in minibatch
    "training" mode; however, it is usually more convenient to use
    :func:`~blocks.graph.batch_normalization` to enable this behaviour
    for all of your graph's :class:`BatchNormalization` bricks at once.

    Note that training in inference mode should be avoided, as this
    brick introduces scale and shift parameters (tagged with the
    `PARAMETER` role) that, in the absence of batch normalization,
    usually make things unstable. If you must do this, filter for and
    remove `BATCH_NORM_SHIFT` and `BATCH_NORM_SCALE` from the list of
    parameters you are training, and this brick should behave as a
    (somewhat expensive) no-op.

    This brick accepts `weights_init` and `biases_init` arguments but is
    *not* an instance of :class:`~blocks.bricks.Initializable`, and will
    therefore not receive pushed initialization config from any parent
    brick. In almost all cases, you will want to stick with the defaults
    (unit scale and zero offset), but you can explicitly pass one or both
    initializers to override this.

    This brick has the necessary properties to be inserted into a
    :class:`blocks.bricks.conv.ConvolutionalSequence` as-is, in which case
    `input_dim` should be omitted at construction, to be inferred from
    the layer below.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, broadcastable=None,
                 conserve_memory=True, epsilon=1e-4, weights_init=None,
                 biases_init=None, **kwargs):
        self.input_dim = input_dim
        self.broadcastable = broadcastable
        self.conserve_memory = conserve_memory
        self.epsilon = epsilon
        self.weights_init = (Constant(1) if weights_init is None
                             else weights_init)
        self.biases_init = (Constant(0) if biases_init is None
                            else biases_init)
        self._training_mode = []
        super(BatchNormalization, self).__init__(**kwargs)

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, application_call):
        if self._training_mode:
            mean, stdev = self._compute_training_statistics(input_)
        else:
            mean, stdev = self._prepare_population_statistics()
        # Useful for filtration of calls that were already made in
        # training mode when doing graph transformations.
        application_call.metadata['training_mode'] = self._training_mode
        # Useful for retrieving a list of updates for population
        # statistics. Ditch the broadcastable first axis, though, to
        # make them the same dimensions as the population mean and stdev
        # shared variables.
        application_call.metadata['offset'] = mean[0]
        application_call.metadata['divisor'] = stdev[0]
        # Give these quantities roles in the graph.
        _add_role_and_annotate(mean, BATCH_NORM_OFFSET,
                               [self, application_call])
        _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
                               [self, application_call])
        W = _add_batch_axis(self.W)
        b = _add_batch_axis(self.b)
        # Heavy lifting is done by the Theano utility function.
        normalized = bn.batch_normalization(input_, W, b, mean, stdev,
                                            mode=('low_mem'
                                                  if self.conserve_memory
                                                  else 'high_mem'))
        return normalized

    def __enter__(self):
        self._training_mode.append(True)

    def __exit__(self, *exc_info):
        self._training_mode.pop()
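
    # A minimal usage sketch (illustrative; the variable names are
    # hypothetical): entering the brick as a context manager puts its
    # applications in minibatch "training" mode, while applications made
    # outside the block use the population statistics.
    #
    #   brick = BatchNormalization(input_dim=100)
    #   brick.initialize()
    #   x = tensor.matrix('features')
    #   y_inference = brick.apply(x)     # uses population mean/stdev
    #   with brick:
    #       y_training = brick.apply(x)  # uses minibatch mean/stdev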

    def _compute_training_statistics(self, input_):
        axes = (0,) + tuple((i + 1) for i, b in
                            enumerate(self.population_mean.broadcastable)
                            if b)
        mean = input_.mean(axis=axes, keepdims=True)
        assert mean.broadcastable[1:] == self.population_mean.broadcastable
        stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
                            numpy.cast[theano.config.floatX](self.epsilon))
        assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
        add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
        add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
        return mean, stdev
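
    # Worked example of the axis computation above (illustrative values):
    # for a per-example shape of (channels, height, width) with
    # broadcastable=(False, True, True), population_mean.broadcastable is
    # (False, True, True) and axes == (0, 2, 3), so the minibatch mean and
    # stdev are taken over the batch axis and both spatial axes, leaving
    # one statistic per channel.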

    def _prepare_population_statistics(self):
        mean = _add_batch_axis(self.population_mean)
        stdev = _add_batch_axis(self.population_stdev)
        return mean, stdev

    def _allocate(self):
        input_dim = ((self.input_dim,)
                     if not isinstance(self.input_dim, collections.Sequence)
                     else self.input_dim)
        broadcastable = (tuple(False for _ in input_dim)
                         if self.broadcastable is None else self.broadcastable)
        if len(input_dim) != len(broadcastable):
            raise ValueError("input_dim and broadcastable must be same length")
        var_dim = tuple(1 if broadcast else dim for dim, broadcast in
                        equizip(input_dim, broadcastable))

        # "gamma", from the Ioffe & Szegedy manuscript.
        self.W = shared_floatx_nans(var_dim, name='batch_norm_scale',
                                    broadcastable=broadcastable)

        # "beta", from the Ioffe & Szegedy manuscript.
        self.b = shared_floatx_nans(var_dim, name='batch_norm_shift',
                                    broadcastable=broadcastable)
        add_role(self.W, WEIGHT)
        add_role(self.b, BIAS)
        self.parameters.append(self.W)
        self.parameters.append(self.b)

        # These aren't technically parameters, in that they should not be
        # learned using the same cost function as other model parameters.
        self.population_mean = shared_floatx_zeros(var_dim,
                                                   name='population_mean',
                                                   broadcastable=broadcastable)
        self.population_stdev = shared_floatx(numpy.ones(var_dim),
                                              name='population_stdev',
                                              broadcastable=broadcastable)
        add_role(self.population_mean, BATCH_NORM_POPULATION_MEAN)
        add_role(self.population_stdev, BATCH_NORM_POPULATION_STDEV)
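
    # Worked example for `_allocate` above (illustrative values): with
    # input_dim=(16, 32, 32) and broadcastable=(False, True, True),
    # var_dim == (16, 1, 1), so W, b, population_mean and population_stdev
    # are all allocated as (16, 1, 1)-shaped shared variables that
    # broadcast over the two spatial axes.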

    def _initialize(self):
        self.biases_init.initialize(self.b, self.rng)
        self.weights_init.initialize(self.W, self.rng)

    # Needed for the Feedforward interface.
    @property
    def output_dim(self):
        return self.input_dim

    # The following properties allow BatchNormalization bricks
    # to be used directly inside a ConvolutionalSequence.
    @property
    def image_size(self):
        return self.input_dim[-2:]

    @image_size.setter
    def image_size(self, value):
        if not isinstance(self.input_dim, collections.Sequence):
            self.input_dim = (None,) + tuple(value)
        else:
            self.input_dim = (self.input_dim[0],) + tuple(value)

    @property
    def num_channels(self):
        return self.input_dim[0]

    @num_channels.setter
    def num_channels(self, value):
        if not isinstance(self.input_dim, collections.Sequence):
            self.input_dim = (value,) + (None, None)
        else:
            self.input_dim = (value,) + self.input_dim[-2:]

    def get_dim(self, name):
        if name in ('input', 'output'):
            return self.input_dim
        else:
            raise KeyError

    @property
    def num_output_channels(self):
        return self.num_channels
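
    # Minimal sketch of the ConvolutionalSequence-facing properties above
    # (illustrative values):
    #
    #   brick = BatchNormalization()     # lazy: input_dim filled in later
    #   brick.num_channels = 16          # input_dim == (16, None, None)
    #   brick.image_size = (32, 32)      # input_dim == (16, 32, 32)
    #   brick.get_dim('input')           # == (16, 32, 32)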


class SpatialBatchNormalization(BatchNormalization):
    """Convenient subclass for batch normalization across spatial inputs.

    Parameters
    ----------
    input_dim : tuple
        The input size of a single example. Must have length at least 2.
        It's assumed that the first axis of this tuple is a "channels"
        axis, which should not be summed over, and that all remaining
        dimensions are spatial dimensions.

    Notes
    -----
    See :class:`BatchNormalization` for more details (and additional
    keyword arguments).

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, **kwargs):
        if not isinstance(input_dim,
                          collections.Sequence) or len(input_dim) < 2:
            raise ValueError('expected input_dim to be length >= 2 '
                             'e.g. (channels, height, width)')
        broadcastable = (False,) + ((True,) * (len(input_dim) - 1))
        kwargs.setdefault('broadcastable', broadcastable)
        super(SpatialBatchNormalization, self).__init__(input_dim, **kwargs)
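

# Illustrative equivalence (hypothetical dimensions): for 16-channel
# 32x32 images,
#
#   SpatialBatchNormalization(input_dim=(16, 32, 32))
#
# sets up the same brick as
#
#   BatchNormalization(input_dim=(16, 32, 32),
#                      broadcastable=(False, True, True))
#
# i.e. one mean/stdev per channel, shared across all spatial locations.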


class BatchNormalizedMLP(MLP):
    """Convenient subclass for building an MLP with batch normalization.

    Parameters
    ----------
    conserve_memory : bool, optional
        See :class:`BatchNormalization`.

    Notes
    -----
    All other parameters are the same as :class:`~blocks.bricks.MLP`. Each
    activation brick is wrapped in a :class:`~blocks.bricks.Sequence`
    containing an appropriate :class:`BatchNormalization` brick followed
    by the activation.

    By default, the contained :class:`~blocks.bricks.Linear` bricks will
    not contain any biases, as they would be redundant with the shift
    parameters of the :class:`BatchNormalization` bricks being added.
    Pass `use_bias` with a value of `True` if you really want this for
    some reason.

    """
    @lazy(allocation=['dims'])
    def __init__(self, activations, dims, *args, **kwargs):
        conserve_memory = kwargs.pop('conserve_memory', True)
        activations = [
            Sequence([
                BatchNormalization(conserve_memory=conserve_memory).apply,
                act.apply
            ], name='batch_norm_activation_{}'.format(i))
            for i, act in enumerate(activations)
        ]
        # Batch normalization bricks incorporate a bias, so there's no
        # need for our Linear bricks to have them.
        kwargs.setdefault('use_bias', False)
        super(BatchNormalizedMLP, self).__init__(activations, dims, *args,
                                                 **kwargs)
        # Go through the property setter so that the getter works before
        # any external assignment and the value reaches the children.
        self.conserve_memory = conserve_memory

    @property
    def conserve_memory(self):
        return self._conserve_memory

    @conserve_memory.setter
    def conserve_memory(self, value):
        self._conserve_memory = value
        for act in self.activations:
            assert isinstance(act.children[0], BatchNormalization)
            act.children[0].conserve_memory = value

    def _push_allocation_config(self):
        super(BatchNormalizedMLP, self)._push_allocation_config()
        # Do the extra allocation pushing for the BatchNormalization
        # bricks. They need as their input dimension the output dimension
        # of each linear transformation. Exclude the first element of
        # `dims`, which is the input dimension.
        for act, dim in equizip(self.activations, self.dims[1:]):
            assert isinstance(act.children[0], BatchNormalization)
            act.children[0].input_dim = dim
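

# A minimal usage sketch (illustrative; assumes the standard
# blocks.bricks.Rectifier activation): each hidden layer gets a
# BatchNormalization brick between the linear transformation and its
# activation, and _push_allocation_config gives those bricks the output
# dimension of each Linear, here 200 and 100 from dims[1:].
#
#   from blocks.bricks import Rectifier
#   mlp = BatchNormalizedMLP([Rectifier(), Rectifier()],
#                            [784, 200, 100])
#   mlp.initialize()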