blocks.bricks.BatchNormalization._allocate() - Code Metrics - Inspection of "Brick-based batch normalization." - mila-udem/blocks - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#941)

by David

created 2016-01-22 00:28 UTC

blocks.bricks.BatchNormalization._allocate() C

↳ Parent: blocks.bricks.BatchNormalization

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	7
dl	0
loc	39
rs	5.5

import collections

import numpy
from picklable_itertools.extras import equizip
import theano
from theano import tensor
from theano.tensor.nnet import bn

from ..graph import add_annotation
from ..initialization import Constant
from ..roles import (BATCH_NORM_POPULATION_MEAN,
                     BATCH_NORM_POPULATION_STDEV, BATCH_NORM_OFFSET,
                     BATCH_NORM_DIVISOR, BATCH_NORM_MINIBATCH_ESTIMATE,
                     BATCH_NORM_SHIFT_PARAMETER, BATCH_NORM_SCALE_PARAMETER,
                     add_role)
from ..utils import (shared_floatx_zeros, shared_floatx,
                     shared_floatx_nans)
from .base import lazy, application
from .sequences import Sequence, Feedforward, MLP
from .interfaces import RNGMixin


def _add_batch_axis(var):
    """Prepend a singleton axis to a TensorVariable and name it.

    Parameters
    ----------
    var : TensorVariable
        The variable to pad on the left with one extra axis.

    Returns
    -------
    new_var : TensorVariable
        The result of `tensor.shape_padleft(var)`, named
        ``'shape_padleft(<var.name>)'``.

    """
    new_var = tensor.shape_padleft(var)
    new_var.name = 'shape_padleft({})'.format(var.name)
    return new_var


def _add_role_and_annotate(var, role, annotations=()):
    """Tag `var` with `role`, then attach every given annotation to it.

    Parameters
    ----------
    var : variable
        The variable that receives the role and the annotations.
    role : role object
        Passed to `add_role` together with `var`.
    annotations : iterable, optional
        Each element is passed to `add_annotation` together with `var`.
        Defaults to an empty tuple, i.e. no annotations.

    """
    add_role(var, role)
    for item in annotations:
        add_annotation(var, item)


class BatchNormalization(RNGMixin, Feedforward):
    r"""Normalizes activations, parameterizes a scale and shift.

    Parameters
    ----------
    input_dim : int or tuple
        Shape of a single input example. It is assumed that a batch axis
        will be prepended to this.
    broadcastable : tuple, optional
        Tuple the same length as `input_dim` which specifies which of the
        per-example axes should be averaged over to compute means and
        standard deviations. For example, in order to normalize over all
        spatial locations in a `(batch_index, channels, height, width)`
        image, pass `(False, True, True)`.
    conserve_memory : bool, optional
        Use an implementation that stores less intermediate state and
        therefore uses less memory, at the expense of 5-10% speed. Default
        is `True`.
    epsilon : float, optional
       The stabilizing constant for the minibatch standard deviation
       computation (when the brick is run in training mode).
       Added to the variance inside the square root, as in the
       batch normalization paper.
    scale_init : object, optional
        Initialization object to use for the learned scaling parameter
        ($\gamma$ in [BN]_). By default, uses constant initialization
        of 1.
    shift_init : object, optional
        Initialization object to use for the learned shift parameter
        ($\beta$ in [BN]_). By default, uses constant initialization of 0.

    Notes
    -----
    In order for trained models to behave sensibly immediately upon
    deserialization, by default, this brick runs in *inference* mode,
    using a population mean and population standard deviation (initialized
    to zeros and ones respectively) to normalize activations. It is
    expected that the user will adapt these during training in some
    fashion, independently of the training objective, e.g. by taking a
    moving average of minibatch-wise statistics.

    In order to *train* with batch normalization, one must obtain a
    training graph by transforming the original inference graph. See
    :func:`~blocks.graph.apply_batch_normalization` for a routine to
    transform graphs, and :func:`~blocks.graph.batch_normalization`
    for a context manager that may enable shorter compile times
    (every instance of :class:`BatchNormalization` is itself a context
    manager, entry into which causes applications to be in minibatch
    "training" mode, however it is usually more convenient to use
    :func:`~blocks.graph.batch_normalization` to enable this behaviour
    for all of your graph's :class:`BatchNormalization` bricks at once).

    Note that training in inference mode should be avoided, as this
    brick introduces scale and shift parameters (tagged with the
    `PARAMETER` role) that, in the absence of batch normalization,
    usually make things unstable. If you must do this, filter for and
    remove `BATCH_NORM_SHIFT_PARAMETER` and `BATCH_NORM_SCALE_PARAMETER`
    from the list of parameters you are training, and this brick should
    behave as a (somewhat expensive) no-op.

    This Brick accepts `scale_init` and `shift_init` arguments but is
    *not* an instance of :class:`~blocks.bricks.Initializable`, and will
    therefore not receive pushed initialization config from any parent
    brick. In almost all cases, you will probably want to stick with the
    defaults (unit scale and zero offset), but you can explicitly pass one
    or both initializers to override this.

    This has the necessary properties to be inserted into a
    :class:`blocks.bricks.conv.ConvolutionalSequence` as-is, in which case
    the `input_dim` should be omitted at construction, to be inferred from
    the layer below.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, broadcastable=None,
                 conserve_memory=True, epsilon=1e-4, scale_init=None,
                 shift_init=None, **kwargs):
        self.input_dim = input_dim
        self.broadcastable = broadcastable
        self.conserve_memory = conserve_memory
        self.epsilon = epsilon
        self.scale_init = (Constant(1) if scale_init is None
                           else scale_init)
        self.shift_init = (Constant(0) if shift_init is None
                           else shift_init)
        # A stack rather than a flag, so that nested `with` blocks on the
        # same brick unwind correctly; non-empty means "training mode".
        self._training_mode = []
        super(BatchNormalization, self).__init__(**kwargs)

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, application_call):
        """Normalize `input_`, then apply the learned scale and shift.

        Minibatch statistics are used while the brick is in training
        mode (see `__enter__`); otherwise the stored population
        statistics are used.

        """
        if self._training_mode:
            mean, stdev = self._compute_training_statistics(input_)
        else:
            mean, stdev = self._prepare_population_statistics()
        # Useful for filtration of calls that were already made in
        # training mode when doing graph transformations.
        # Very important to cast to bool, as self._training_mode is
        # normally a list (to support nested context managers), which would
        # otherwise get passed by reference and be remotely mutated.
        application_call.metadata['training_mode'] = bool(self._training_mode)
        # Useful for retrieving a list of updates for population
        # statistics. Ditch the broadcastable first axis, though, to
        # make it the same dimensions as the population mean and stdev
        # shared variables.
        application_call.metadata['offset'] = mean[0]
        application_call.metadata['divisor'] = stdev[0]
        # Give these quantities roles in the graph.
        _add_role_and_annotate(mean, BATCH_NORM_OFFSET,
                               [self, application_call])
        _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
                               [self, application_call])
        scale = _add_batch_axis(self.scale)
        shift = _add_batch_axis(self.shift)
        # Heavy lifting is done by the Theano utility function.
        normalized = bn.batch_normalization(input_, scale, shift, mean, stdev,
                                            mode=('low_mem'
                                                  if self.conserve_memory
                                                  else 'high_mem'))
        return normalized

    def __enter__(self):
        """Switch this brick to training mode; return the brick itself."""
        self._training_mode.append(True)
        # Returning the brick makes `with brick as b:` bind something
        # useful instead of None.
        return self

    def __exit__(self, *exc_info):
        """Undo the most recent `__enter__`; exceptions are not suppressed."""
        self._training_mode.pop()

    def _compute_training_statistics(self, input_):
        """Return the minibatch mean and stabilized standard deviation.

        Statistics are reduced over the batch axis (axis 0) plus every
        per-example axis that is broadcastable in `population_mean`, and
        are computed with `keepdims=True`.

        """
        # Per-example axis i of the population statistics corresponds to
        # axis i + 1 of the input, because of the leading batch axis.
        axes = (0,) + tuple((i + 1) for i, b in
                            enumerate(self.population_mean.broadcastable)
                            if b)
        mean = input_.mean(axis=axes, keepdims=True)
        assert mean.broadcastable[1:] == self.population_mean.broadcastable
        stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
                            numpy.cast[theano.config.floatX](self.epsilon))
        assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
        add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
        add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
        return mean, stdev

    def _prepare_population_statistics(self):
        """Return the population mean and stdev with a batch axis added."""
        mean = _add_batch_axis(self.population_mean)
        stdev = _add_batch_axis(self.population_stdev)
        return mean, stdev

    def _allocate(self):
        """Create the scale, shift and population statistic variables.

        Raises
        ------
        ValueError
            If `input_dim` and `broadcastable` differ in length.

        """
        # NOTE(review): the `collections.Sequence` alias was removed in
        # Python 3.10; switch to `collections.abc.Sequence` once Python 2
        # support is no longer required.
        input_dim = ((self.input_dim,)
                     if not isinstance(self.input_dim, collections.Sequence)
                     else self.input_dim)
        broadcastable = (tuple(False for _ in input_dim)
                         if self.broadcastable is None else self.broadcastable)
        if len(input_dim) != len(broadcastable):
            raise ValueError("input_dim and broadcastable must be same length")
        # Axes that are averaged over hold a single shared value, so they
        # are allocated with length 1.
        var_dim = tuple(1 if broadcast else dim for dim, broadcast in
                        equizip(input_dim, broadcastable))

        # "gamma", from the Ioffe & Szegedy manuscript.
        self.scale = shared_floatx_nans(var_dim, name='batch_norm_scale',
                                        broadcastable=broadcastable)

        # "beta", from the Ioffe & Szegedy manuscript.
        self.shift = shared_floatx_nans(var_dim, name='batch_norm_shift',
                                        broadcastable=broadcastable)
        add_role(self.scale, BATCH_NORM_SCALE_PARAMETER)
        add_role(self.shift, BATCH_NORM_SHIFT_PARAMETER)
        self.parameters.append(self.scale)
        self.parameters.append(self.shift)

        # These aren't technically parameters, in that they should not be
        # learned using the same cost function as other model parameters.
        self.population_mean = shared_floatx_zeros(var_dim,
                                                   name='population_mean',
                                                   broadcastable=broadcastable)
        self.population_stdev = shared_floatx(numpy.ones(var_dim),
                                              name='population_stdev',
                                              broadcastable=broadcastable)
        add_role(self.population_mean, BATCH_NORM_POPULATION_MEAN)
        add_role(self.population_stdev, BATCH_NORM_POPULATION_STDEV)

        # Normally these would get annotated by an AnnotatingList, but they
        # aren't in self.parameters.
        add_annotation(self.population_mean, self)
        add_annotation(self.population_stdev, self)

    def _initialize(self):
        """Initialize shift and scale with their configured initializers."""
        self.shift_init.initialize(self.shift, self.rng)
        self.scale_init.initialize(self.scale, self.rng)

    # Needed for the Feedforward interface.
    @property
    def output_dim(self):
        """Same as `input_dim`."""
        return self.input_dim

    # The following properties allow for BatchNormalization bricks
    # to be used directly inside of a ConvolutionalSequence.
    @property
    def image_size(self):
        """The last two entries of `input_dim`."""
        return self.input_dim[-2:]

    @image_size.setter
    def image_size(self, value):
        # Keep the existing first entry of input_dim if there is one;
        # otherwise leave it as None to be filled in later.
        if not isinstance(self.input_dim, collections.Sequence):
            self.input_dim = (None,) + tuple(value)
        else:
            self.input_dim = (self.input_dim[0],) + tuple(value)

    @property
    def num_channels(self):
        """The first entry of `input_dim`."""
        return self.input_dim[0]

    @num_channels.setter
    def num_channels(self, value):
        # Keep the existing last two entries of input_dim if there are
        # any; otherwise leave them as None to be filled in later.
        if not isinstance(self.input_dim, collections.Sequence):
            self.input_dim = (value,) + (None, None)
        else:
            self.input_dim = (value,) + self.input_dim[-2:]

    def get_dim(self, name):
        """Return `input_dim` for the names 'input' and 'output'.

        Raises
        ------
        KeyError
            If `name` is anything else.

        """
        if name in ('input', 'output'):
            return self.input_dim
        else:
            # Include the offending name so the error is self-explanatory.
            raise KeyError(name)

    @property
    def num_output_channels(self):
        """Same as `num_channels`."""
        return self.num_channels


class SpatialBatchNormalization(BatchNormalization):
    """Batch normalization that shares statistics across spatial axes.

    Parameters
    ----------
    input_dim : int or tuple
        The input size of a single example. Must be length at least 2.
        It's assumed that the first axis of this tuple is a "channels"
        axis, which should not be summed over, and all remaining
        dimensions are spatial dimensions.

    Notes
    -----
    Unless the caller supplies `broadcastable` explicitly, it defaults
    to `False` for the first axis and `True` for every other axis.
    See :class:`BatchNormalization` for more details (and additional
    keyword arguments).

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, **kwargs):
        # NOTE(review): this check runs at construction time; confirm it
        # behaves as intended when `input_dim` is omitted under @lazy.
        is_sequence = isinstance(input_dim, collections.Sequence)
        if not (is_sequence and len(input_dim) >= 2):
            raise ValueError('expected input_dim to be length >= 2 '
                             'e.g. (channels, height, width)')
        num_spatial_axes = len(input_dim) - 1
        # An explicitly passed `broadcastable` takes precedence.
        kwargs.setdefault('broadcastable',
                          (False,) + (True,) * num_spatial_axes)
        super(SpatialBatchNormalization, self).__init__(input_dim, **kwargs)


class BatchNormalizedMLP(MLP):
    """Convenient subclass for building an MLP with batch normalization.

    Parameters
    ----------
    conserve_memory : bool, optional
        See :class:`BatchNormalization`.

    Notes
    -----
    All other parameters are the same as :class:`~blocks.bricks.MLP`. Each
    activation brick is wrapped in a :class:`~blocks.bricks.Sequence`
    containing an appropriate :class:`BatchNormalization` brick and
    the activation that follows it.

    By default, the contained :class:`~blocks.bricks.Linear` bricks will
    not contain any biases, as they could be canceled out by the biases
    in the :class:`BatchNormalization` bricks being added. Pass
    `use_bias` with a value of `True` if you really want this for some
    reason.

    """
    @lazy(allocation=['dims'])
    def __init__(self, activations, dims, *args, **kwargs):
        # Store the backing attribute directly so that the
        # `conserve_memory` property is readable right after construction.
        # The property setter is deliberately bypassed: it iterates over
        # self.activations to push the value down, and here the value is
        # handed to each BatchNormalization brick as it is created.
        self._conserve_memory = kwargs.pop('conserve_memory', True)
        activations = [
            Sequence([
                BatchNormalization(
                    conserve_memory=self._conserve_memory).apply,
                act.apply
            ], name='batch_norm_activation_{}'.format(i))
            for i, act in enumerate(activations)
        ]
        # Batch normalization bricks incorporate a bias, so there's no
        # need for our Linear bricks to have them.
        kwargs.setdefault('use_bias', False)
        super(BatchNormalizedMLP, self).__init__(activations, dims, *args,
                                                 **kwargs)

    @property
    def conserve_memory(self):
        """The `conserve_memory` setting shared by the contained bricks."""
        return self._conserve_memory

    @conserve_memory.setter
    def conserve_memory(self, value):
        self._conserve_memory = value
        # Propagate to the BatchNormalization brick that leads each
        # wrapped activation.
        for act in self.activations:
            assert isinstance(act.children[0], BatchNormalization)
            act.children[0].conserve_memory = value

    def _push_allocation_config(self):
        """Push dimensions to children, including the batch norm bricks."""
        super(BatchNormalizedMLP, self)._push_allocation_config()
        # Do the extra allocation pushing for the BatchNormalization
        # bricks. They need as their input dimension the output dimension
        # of each linear transformation.  Exclude the first dimension,
        # which is the input dimension.
        for act, dim in equizip(self.activations, self.dims[1:]):
            assert isinstance(act.children[0], BatchNormalization)
            act.children[0].input_dim = dim


1			import collections
2
3			import numpy
4			from picklable_itertools.extras import equizip
5			import theano
6			from theano import tensor
7			from theano.tensor.nnet import bn
8
9			from ..graph import add_annotation
10			from ..initialization import Constant
11			from ..roles import (BATCH_NORM_POPULATION_MEAN,
12			BATCH_NORM_POPULATION_STDEV, BATCH_NORM_OFFSET,
13			BATCH_NORM_DIVISOR, BATCH_NORM_MINIBATCH_ESTIMATE,
14			BATCH_NORM_SHIFT_PARAMETER, BATCH_NORM_SCALE_PARAMETER,
15			add_role)
16			from ..utils import (shared_floatx_zeros, shared_floatx,
17			shared_floatx_nans)
18			from .base import lazy, application
19			from .sequences import Sequence, Feedforward, MLP
20			from .interfaces import RNGMixin
21
22
23			def _add_batch_axis(var):
24			"""Prepend a singleton axis to a TensorVariable and name it."""
25			new_var = new_var = tensor.shape_padleft(var)
26			new_var.name = 'shape_padleft({})'.format(var.name)
27			return new_var
28
29
30			def _add_role_and_annotate(var, role, annotations=()):
31			"""Add a role and zero or more annotations to a variable."""
32			add_role(var, role)
33			for annotation in annotations:
34			add_annotation(var, annotation)
35
36
37			class BatchNormalization(RNGMixin, Feedforward):
38			r"""Normalizes activations, parameterizes a scale and shift.
39
40			Parameters
41			----------
42			input_dim : int or tuple
43			Shape of a single input example. It is assumed that a batch axis
44			will be prepended to this.
45			broadcastable : tuple, optional
46			Tuple the same length as `input_dim` which specifies which of the
47			per-example axes should be averaged over to compute means and
48			standard deviations. For example, in order to normalize over all
49			spatial locations in a `(batch_index, channels, height, width)`
50			image, pass `(False, True, True)`.
51			conserve_memory : bool, optional
52			Use an implementation that stores less intermediate state and
53			therefore uses less memory, at the expense of 5-10% speed. Default
54			is `True`.
55			epsilon : float, optional
56			The stabilizing constant for the minibatch standard deviation
57			computation (when the brick is run in training mode).
58			Added to the variance inside the square root, as in the
59			batch normalization paper.
60			scale_init : object, optional
61			Initialization object to use for the learned scaling parameter
62			($\\gamma$ in [BN]_). By default, uses constant initialization
63			of 1.
64			shift_init : object, optional
65			Initialization object to use for the learned shift parameter
66			($\\beta$ in [BN]_). By default, uses constant initialization of 0.
67
68			Notes
69			-----
70			In order for trained models to behave sensibly immediately upon
71			upon deserialization, by default, this brick runs in inference mode,
72			using a population mean and population standard deviation (initialized
73			to zeros and ones respectively) to normalize activations. It is
74			expected that the user will adapt these during training in some
75			fashion, independently of the training objective, e.g. by taking a
76			moving average of minibatch-wise statistics.
77
78			In order to train with batch normalization, one must obtain a
79			training graph by transforming the original inference graph. See
80			:func:`~blocks.graph.apply_batch_normalization` for a routine to
81			transform graphs, and :func:`~blocks.graph.batch_normalization`
82			for a context manager that may enable shorter compile times
83			(every instance of :class:`BatchNormalization` is itself a context
84			manager, entry into which causes applications to be in minibatch
85			"training" mode, however it is usually more convenient to use
86			:func:`~blocks.graph.batch_normalization` to enable this behaviour
87			for all of your graph's :class:`BatchNormalization` bricks at once).
88
89			Note that training in inference mode should be avoided, as this
90			brick introduces scales and shift parameters (tagged with the
91			`PARAMETER` role) that, in the absence of batch normalization,
92			usually makes things unstable. If you must do this, filter for and
93			remove `BATCH_NORM_SHIFT_PARAMETER` and `BATCH_NORM_SCALE_PARAMETER`
94			from the list of parameters you are training, and this brick should
95			behave as a (somewhat expensive) no-op.
96
97			This Brick accepts `scale_init` and `shift_init` arguments but is
98			not an instance of :class:`~blocks.bricks.Initializable`, and will
99			therefore not receive pushed initialization config from any parent
100			brick. In almost all cases, you will probably want to stick with the
101			defaults (unit scale and zero offset), but you can explicitly pass one
102			or both initializers to override this.
103
104			This has the necessary properties to be inserted into a
105			:class:`blocks.bricks.conv.ConvolutionalSequence` as-is, in which case
106			the `input_dim` should be omitted at construction, to be inferred from
107			the layer below.
108
109			"""
110			@lazy(allocation=['input_dim'])
111			def __init__(self, input_dim, broadcastable=None,
112			conserve_memory=True, epsilon=1e-4, scale_init=None,
113			shift_init=None, **kwargs):
114			self.input_dim = input_dim
115			self.broadcastable = broadcastable
116			self.conserve_memory = conserve_memory
117			self.epsilon = epsilon
118			self.scale_init = (Constant(1) if scale_init is None
119			else scale_init)
120			self.shift_init = (Constant(0) if shift_init is None
121			else shift_init)
122			self._training_mode = []
123			super(BatchNormalization, self).__init__(**kwargs)
124
125			@application(inputs=['input_'], outputs=['output'])
126			def apply(self, input_, application_call):
127			if self._training_mode:
128			mean, stdev = self._compute_training_statistics(input_)
129			else:
130			mean, stdev = self._prepare_population_statistics()
131			# Useful for filtration of calls that were already made in
132			# training mode when doing graph transformations.
133			# Very important to cast to bool, as self._training_mode is
134			# normally a list (to support nested context managers), which would
135			# otherwise get passed by reference and be remotely mutated.
136			application_call.metadata['training_mode'] = bool(self._training_mode)
137			# Useful for retrieving a list of updates for population
138			# statistics. Ditch the broadcastable first axis, though, to
139			# make it the same dimensions as the population mean and stdev
140			# shared variables.
141			application_call.metadata['offset'] = mean[0]
142			application_call.metadata['divisor'] = stdev[0]
143			# Give these quantities roles in the graph.
144			_add_role_and_annotate(mean, BATCH_NORM_OFFSET,
145			[self, application_call])
146			_add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
147			[self, application_call])
148			scale = _add_batch_axis(self.scale)
149			shift = _add_batch_axis(self.shift)
150			# Heavy lifting is done by the Theano utility function.
151			normalized = bn.batch_normalization(input_, scale, shift, mean, stdev,
152			mode=('low_mem'
153			if self.conserve_memory
154			else 'high_mem'))
155			return normalized
156
157			def __enter__(self):
158			self._training_mode.append(True)
159
160			def __exit__(self, *exc_info):
161			self._training_mode.pop()
162
163			def _compute_training_statistics(self, input_):
164			axes = (0,) + tuple((i + 1) for i, b in
165			enumerate(self.population_mean.broadcastable)
166			if b)
167			mean = input_.mean(axis=axes, keepdims=True)
168			assert mean.broadcastable[1:] == self.population_mean.broadcastable
169			stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
170			numpy.cast[theano.config.floatX](self.epsilon))
171			assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
172			add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
173			add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
174			return mean, stdev
175
176			def _prepare_population_statistics(self):
177			mean = _add_batch_axis(self.population_mean)
178			stdev = _add_batch_axis(self.population_stdev)
179			return mean, stdev
180
181			def _allocate(self):
182			input_dim = ((self.input_dim,)
183			if not isinstance(self.input_dim, collections.Sequence)
184			else self.input_dim)
185			broadcastable = (tuple(False for _ in input_dim)
186			if self.broadcastable is None else self.broadcastable)
187			if len(input_dim) != len(broadcastable):
188			raise ValueError("input_dim and broadcastable must be same length")
189			var_dim = tuple(1 if broadcast else dim for dim, broadcast in
190			equizip(input_dim, broadcastable))
191			broadcastable = broadcastable
192
193			# "gamma", from the Ioffe & Szegedy manuscript.
194			self.scale = shared_floatx_nans(var_dim, name='batch_norm_scale',
195			broadcastable=broadcastable)
196
197			# "beta", from the Ioffe & Szegedy manuscript.
198			self.shift = shared_floatx_nans(var_dim, name='batch_norm_shift',
199			broadcastable=broadcastable)
200			add_role(self.scale, BATCH_NORM_SCALE_PARAMETER)
201			add_role(self.shift, BATCH_NORM_SHIFT_PARAMETER)
202			self.parameters.append(self.scale)
203			self.parameters.append(self.shift)
204
205			# These aren't technically parameters, in that they should not be
206			# learned using the same cost function as other model parameters.
207			self.population_mean = shared_floatx_zeros(var_dim,
208			name='population_mean',
209			broadcastable=broadcastable)
210			self.population_stdev = shared_floatx(numpy.ones(var_dim),
211			name='population_stdev',
212			broadcastable=broadcastable)
213			add_role(self.population_mean, BATCH_NORM_POPULATION_MEAN)
214			add_role(self.population_stdev, BATCH_NORM_POPULATION_STDEV)
215
216			# Normally these would get annotated by an AnnotatingList, but they
217			# aren't in self.parameters.
218			add_annotation(self.population_mean, self)
219			add_annotation(self.population_stdev, self)
220
221			def _initialize(self):
222			self.shift_init.initialize(self.shift, self.rng)
223			self.scale_init.initialize(self.scale, self.rng)
224
225			# Needed for the Feedforward interface.
226			@property
227			def output_dim(self):
228			return self.input_dim
229
230			# The following properties allow for BatchNormalization bricks
231			# to be used directly inside of a ConvolutionalSequence.
232			@property
233			def image_size(self):
234			return self.input_dim[-2:]
235
236			@image_size.setter
237			def image_size(self, value):
238			if not isinstance(self.input_dim, collections.Sequence):
239			self.input_dim = (None,) + tuple(value)
240			else:
241			self.input_dim = (self.input_dim[0],) + tuple(value)
242
243			@property
244			def num_channels(self):
245			return self.input_dim[0]
246
247			@num_channels.setter
248			def num_channels(self, value):
249			if not isinstance(self.input_dim, collections.Sequence):
250			self.input_dim = (value,) + (None, None)
251			else:
252			self.input_dim = (value,) + self.input_dim[-2:]
253
254			def get_dim(self, name):
255			if name in ('input', 'output'):
256			return self.input_dim
257			else:
258			raise KeyError
259
260			@property
261			def num_output_channels(self):
262			return self.num_channels
263
264
265			class SpatialBatchNormalization(BatchNormalization):
266			"""Convenient subclass for batch normalization across spatial inputs.
267
268			Parameters
269			----------
270			input_dim : int or tuple
271			The input size of a single example. Must be length at least 2.
272			It's assumed that the first axis of this tuple is a "channels"
273			axis, which should not be summed over, and all remaining
274			dimensions are spatial dimensions.
275
276			Notes
277			-----
278			See :class:`BatchNormalization` for more details (and additional
279			keyword arguments).
280
281			"""
282			@lazy(allocation=['input_dim'])
283			def __init__(self, input_dim, **kwargs):
284			if not isinstance(input_dim,
285			collections.Sequence) or len(input_dim) < 2:
286			raise ValueError('expected input_dim to be length >= 2 '
287			'e.g. (channels, height, width)')
288			broadcastable = (False,) + ((True,) * (len(input_dim) - 1))
289			kwargs.setdefault('broadcastable', broadcastable)
290			super(SpatialBatchNormalization, self).__init__(input_dim, **kwargs)
291
292
293			class BatchNormalizedMLP(MLP):
294			"""Convenient subclass for building an MLP with batch normalization.
295
296			Parameters
297			----------
298			conserve_memory : bool, optional
299			See :class:`BatchNormalization`.
300
301			Notes
302			-----
303			All other parameters are the same as :class:`~blocks.bricks.MLP`. Each
304			activation brick is wrapped in a :class:`~blocks.bricks.Sequence`
305			containing an appropriate :class:`BatchNormalization` brick and
306			the activation that follows it.
307
308			By default, the contained :class:`~blocks.bricks.Linear` bricks will
309			not contain any biases, as they could be canceled out by the biases
310			in the :class:`BatchNormalization` bricks being added. Pass
311			`use_bias` with a value of `True` if you really want this for some
312			reason.
313
314			"""
315			@lazy(allocation=['dims'])
316			def __init__(self, activations, dims, *args, **kwargs):
317			conserve_memory = kwargs.pop('conserve_memory', True)
318			activations = [
319			Sequence([
320			BatchNormalization(conserve_memory=conserve_memory).apply,
321			act.apply
322			], name='batch_norm_activation_{}'.format(i))
323			for i, act in enumerate(activations)
324			]
325			# Batch normalization bricks incorporate a bias, so there's no
326			# need for our Linear bricks to have them.
327			kwargs.setdefault('use_bias', False)
328			super(BatchNormalizedMLP, self).__init__(activations, dims, *args,
329			**kwargs)
330
331			@property
332			def conserve_memory(self):
333			return self._conserve_memory
334
335			@conserve_memory.setter
336			def conserve_memory(self, value):
337			self._conserve_memory = value
338			for act in self.activations:
339			assert isinstance(act.children[0], BatchNormalization)
340			act.children[0].conserve_memory = value
341
342			def _push_allocation_config(self):
343			super(BatchNormalizedMLP, self)._push_allocation_config()
344			# Do the extra allocation pushing for the BatchNormalization
345			# bricks. They need as their input dimension the output dimension
346			# of each linear transformation. Exclude the first dimension,
347			# which is the input dimension.
348			for act, dim in equizip(self.activations, self.dims[1:]):
349			assert isinstance(act.children[0], BatchNormalization)
350			act.children[0].input_dim = dim
351

mila-udem / blocks

Pull Request — master (#941)

blocks.bricks.BatchNormalization._allocate() C

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like