blocks.bricks.BatchNormalization._compute_training_statistics() - Code Metrics - Inspection of "WIP: Brick-based batch normalization." - mila-udem/blocks - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#941)

by David

created 2016-01-21 23:49 UTC

_compute_training_statistics() B

↳ Parent: blocks.bricks.BatchNormalization

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	5
dl	0
loc	12
rs	8.5454

import collections

import numpy
from picklable_itertools.extras import equizip
import theano
from theano import tensor
from theano.tensor.nnet import bn

from ..graph import add_annotation
from ..initialization import Constant
from ..roles import (WEIGHT, BIAS, BATCH_NORM_POPULATION_MEAN,
                     BATCH_NORM_POPULATION_STDEV, BATCH_NORM_OFFSET,
                     BATCH_NORM_DIVISOR, BATCH_NORM_MINIBATCH_ESTIMATE,
                     add_role)
from ..utils import (shared_floatx_zeros, shared_floatx,
                     shared_floatx_nans)
from .base import lazy, application
from .sequences import Sequence, Feedforward, MLP
from .interfaces import RNGMixin


def _add_batch_axis(var):
    """Prepend a singleton axis to a TensorVariable and name it.

    Parameters
    ----------
    var : :class:`~tensor.TensorVariable`
        The variable to pad. Its `name` attribute is used to build the
        name of the result.

    Returns
    -------
    new_var : :class:`~tensor.TensorVariable`
        The result of `tensor.shape_padleft(var)`, named
        `'shape_padleft(<var.name>)'`.

    """
    new_var = tensor.shape_padleft(var)
    new_var.name = 'shape_padleft({})'.format(var.name)
    return new_var


def _add_role_and_annotate(var, role, annotations=()):
    """Tag a variable with a role, then attach each given annotation.

    Parameters
    ----------
    var : object
        The variable handed to `add_role` and `add_annotation`.
    role : object
        The role passed to `add_role`.
    annotations : iterable, optional
        Annotations attached one at a time, in iteration order. Empty
        by default, in which case only the role is added.

    """
    add_role(var, role)
    for item in annotations:
        add_annotation(var, item)


class BatchNormalization(RNGMixin, Feedforward):
    r"""Normalizes activations, parameterizes a scale and shift.

    Parameters
    ----------
    input_dim : int or tuple
        Shape of a single input example. It is assumed that a batch axis
        will be prepended to this.
    broadcastable : tuple, optional
        Tuple the same length as `input_dim` which specifies which of the
        per-example axes should be averaged over to compute means and
        standard deviations. For example, in order to normalize over all
        spatial locations in a `(batch_index, channels, height, width)`
        image, pass `(False, True, True)`.
    conserve_memory : bool, optional
        Use an implementation that stores less intermediate state and
        therefore uses less memory, at the expense of 5-10% speed. Default
        is `True`.
    epsilon : float, optional
        The stabilizing constant for the minibatch standard deviation
        computation (when the brick is run in training mode).
        Added to the variance inside the square root, as in the
        batch normalization paper.
    weights_init : object, optional
        Initialization object to use for the learned scaling parameter
        ($\gamma$ in [BN]_). By default, uses constant initialization
        of 1.
    biases_init : object, optional
        Initialization object to use for the learned shift parameter
        ($\beta$ in [BN]_). By default, uses constant initialization of 0.

    Notes
    -----
    In order for trained models to behave sensibly immediately upon
    deserialization, by default, this brick runs in *inference* mode,
    using a population mean and population standard deviation (initialized
    to zeros and ones respectively) to normalize activations. It is
    expected that the user will adapt these during training in some
    fashion, independently of the training objective, e.g. by taking a
    moving average of minibatch-wise statistics.

    In order to *train* with batch normalization, one must obtain a
    training graph by transforming the original inference graph. See
    :func:`~blocks.graph.apply_batch_normalization` for a routine to
    transform graphs, and :func:`~blocks.graph.batch_normalization`
    for a context manager that may enable shorter compile times
    (every instance of :class:`BatchNormalization` is itself a context
    manager, entry into which causes applications to be in minibatch
    "training" mode, however it is usually more convenient to use
    :func:`~blocks.graph.batch_normalization` to enable this behaviour
    for all of your graph's :class:`BatchNormalization` bricks at once).

    Note that training in inference mode should be avoided, as this
    brick introduces scale and shift parameters (appended to the brick's
    `parameters` list) that, in the absence of batch normalization,
    usually make things unstable. If you must do this, filter for and
    remove the scale and shift parameters (the shared variables named
    `batch_norm_scale` and `batch_norm_shift`, which this brick tags
    with the `WEIGHT` and `BIAS` roles respectively) from the list of
    parameters you are training, and this brick should behave as a
    (somewhat expensive) no-op.

    This Brick accepts `weights_init` and `biases_init` arguments but is
    *not* an instance of :class:`~blocks.bricks.Initializable`, and will
    therefore not receive pushed initialization config from any parent
    brick. In almost all cases, you will probably want to stick with the
    defaults (unit scale and zero offset), but you can explicitly pass one
    or both initializers to override this.

    This has the necessary properties to be inserted into a
    :class:`blocks.bricks.conv.ConvolutionalSequence` as-is, in which case
    the `input_dim` should be omitted at construction, to be inferred from
    the layer below.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, broadcastable=None,
                 conserve_memory=True, epsilon=1e-4, weights_init=None,
                 biases_init=None, **kwargs):
        self.input_dim = input_dim
        self.broadcastable = broadcastable
        self.conserve_memory = conserve_memory
        self.epsilon = epsilon
        self.weights_init = (Constant(1) if weights_init is None
                             else weights_init)
        self.biases_init = (Constant(0) if biases_init is None
                            else biases_init)
        # A stack rather than a flag, so that nested `with` blocks on the
        # same brick unwind correctly; non-empty means "training mode".
        self._training_mode = []
        super(BatchNormalization, self).__init__(**kwargs)

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, application_call):
        """Normalize `input_`, then apply the learned scale and shift.

        Minibatch statistics are used when the brick is in training mode
        (i.e. inside a `with` block on this brick), and the population
        statistics otherwise. The statistics used are recorded in
        `application_call.metadata` under `'offset'` and `'divisor'`,
        and the mode under `'training_mode'`.

        """
        if self._training_mode:
            mean, stdev = self._compute_training_statistics(input_)
        else:
            mean, stdev = self._prepare_population_statistics()
        # Useful for filtration of calls that were already made in
        # training mode when doing graph transformations.
        # Very important to cast to bool, as self._training_mode is
        # normally a list (to support nested context managers), which would
        # otherwise get passed by reference and be remotely mutated.
        application_call.metadata['training_mode'] = bool(self._training_mode)
        # Useful for retrieving a list of updates for population
        # statistics. Ditch the broadcastable first axis, though, to
        # make it the same dimensions as the population mean and stdev
        # shared variables.
        application_call.metadata['offset'] = mean[0]
        application_call.metadata['divisor'] = stdev[0]
        # Give these quantities roles in the graph.
        _add_role_and_annotate(mean, BATCH_NORM_OFFSET,
                               [self, application_call])
        _add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
                               [self, application_call])
        W = _add_batch_axis(self.W)
        b = _add_batch_axis(self.b)
        # Heavy lifting is done by the Theano utility function.
        normalized = bn.batch_normalization(input_, W, b, mean, stdev,
                                            mode=('low_mem'
                                                  if self.conserve_memory
                                                  else 'high_mem'))
        return normalized

    def __enter__(self):
        """Enter training mode by pushing onto the mode stack."""
        self._training_mode.append(True)

    def __exit__(self, *exc_info):
        """Leave the innermost training-mode context."""
        self._training_mode.pop()

    def _compute_training_statistics(self, input_):
        """Return the minibatch mean and stabilized standard deviation.

        Statistics are taken over the batch axis (axis 0) plus every
        per-example axis marked broadcastable on the population mean.
        The standard deviation is `sqrt(var + epsilon)`. Both results
        keep their reduced axes and are tagged with the
        `BATCH_NORM_MINIBATCH_ESTIMATE` role.

        """
        # Per-example axis i of the population statistics corresponds to
        # axis i + 1 of the input, because of the leading batch axis.
        axes = (0,) + tuple((i + 1) for i, b in
                            enumerate(self.population_mean.broadcastable)
                            if b)
        mean = input_.mean(axis=axes, keepdims=True)
        assert mean.broadcastable[1:] == self.population_mean.broadcastable
        stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
                            numpy.cast[theano.config.floatX](self.epsilon))
        assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
        add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
        add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
        return mean, stdev

    def _prepare_population_statistics(self):
        """Return the population mean and stdev with a batch axis added."""
        mean = _add_batch_axis(self.population_mean)
        stdev = _add_batch_axis(self.population_stdev)
        return mean, stdev

    def _allocate(self):
        """Create the scale, shift and population-statistic variables.

        Raises
        ------
        ValueError
            If `input_dim` and `broadcastable` differ in length.

        """
        input_dim = ((self.input_dim,)
                     if not isinstance(self.input_dim, collections.Sequence)
                     else self.input_dim)
        broadcastable = (tuple(False for _ in input_dim)
                         if self.broadcastable is None else self.broadcastable)
        if len(input_dim) != len(broadcastable):
            raise ValueError("input_dim and broadcastable must be same length")
        # Axes that are averaged over collapse to length 1.
        var_dim = tuple(1 if broadcast else dim for dim, broadcast in
                        equizip(input_dim, broadcastable))

        # "gamma", from the Ioffe & Szegedy manuscript.
        self.W = shared_floatx_nans(var_dim, name='batch_norm_scale',
                                    broadcastable=broadcastable)

        # "beta", from the Ioffe & Szegedy manuscript.
        self.b = shared_floatx_nans(var_dim, name='batch_norm_shift',
                                    broadcastable=broadcastable)
        add_role(self.W, WEIGHT)
        add_role(self.b, BIAS)
        self.parameters.append(self.W)
        self.parameters.append(self.b)

        # These aren't technically parameters, in that they should not be
        # learned using the same cost function as other model parameters.
        self.population_mean = shared_floatx_zeros(var_dim,
                                                   name='population_mean',
                                                   broadcastable=broadcastable)
        self.population_stdev = shared_floatx(numpy.ones(var_dim),
                                              name='population_stdev',
                                              broadcastable=broadcastable)
        add_role(self.population_mean, BATCH_NORM_POPULATION_MEAN)
        add_role(self.population_stdev, BATCH_NORM_POPULATION_STDEV)

        # Normally these would get annotated by an AnnotatingList, but they
        # aren't in self.parameters.
        add_annotation(self.population_mean, self)
        add_annotation(self.population_stdev, self)

    def _initialize(self):
        """Initialize the shift and scale with their initializers."""
        self.biases_init.initialize(self.b, self.rng)
        self.weights_init.initialize(self.W, self.rng)

    # Needed for the Feedforward interface.
    @property
    def output_dim(self):
        return self.input_dim

    # The following properties allow for BatchNormalization bricks
    # to be used directly inside of a ConvolutionalSequence.
    @property
    def image_size(self):
        return self.input_dim[-2:]

    @image_size.setter
    def image_size(self, value):
        # If no shape tuple is stored yet, leave the channel slot as None.
        if not isinstance(self.input_dim, collections.Sequence):
            self.input_dim = (None,) + tuple(value)
        else:
            self.input_dim = (self.input_dim[0],) + tuple(value)

    @property
    def num_channels(self):
        return self.input_dim[0]

    @num_channels.setter
    def num_channels(self, value):
        # If no shape tuple is stored yet, leave both spatial slots as None.
        if not isinstance(self.input_dim, collections.Sequence):
            self.input_dim = (value,) + (None, None)
        else:
            self.input_dim = (value,) + self.input_dim[-2:]

    def get_dim(self, name):
        """Return `input_dim` for the names `'input'` and `'output'`.

        Raises
        ------
        KeyError
            For any other name; the offending name is the exception
            argument.

        """
        if name in ('input', 'output'):
            return self.input_dim
        else:
            raise KeyError(name)

    @property
    def num_output_channels(self):
        return self.num_channels


class SpatialBatchNormalization(BatchNormalization):
    """Convenient subclass for batch normalization across spatial inputs.

    Parameters
    ----------
    input_dim : tuple
        The input size of a single example. Must be length at least 2.
        It's assumed that the first axis of this tuple is a "channels"
        axis, which should not be summed over, and all remaining
        dimensions are spatial dimensions.

    Notes
    -----
    See :class:`BatchNormalization` for more details (and additional
    keyword arguments).

    Unless `broadcastable` is passed explicitly, it is derived from
    `input_dim` as `(False, True, ..., True)`. A concrete `input_dim`
    is validated at construction; in every case it is validated again
    at allocation time, which is also when `broadcastable` is derived
    if `input_dim` was not available at construction.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, input_dim, **kwargs):
        # Only derive the averaging pattern if the caller did not pick one.
        self._infer_broadcastable = 'broadcastable' not in kwargs
        # Validate eagerly only when handed a concrete value. Anything
        # else is presumably the placeholder that `lazy` substitutes for
        # an omitted `input_dim` (TODO confirm against `lazy`), and is
        # checked in `_allocate` once the real shape is known.
        if isinstance(input_dim, (collections.Sequence, int)):
            self._check_input_dim(input_dim)
            kwargs.setdefault('broadcastable',
                              self._spatial_broadcastable(input_dim))
        super(SpatialBatchNormalization, self).__init__(input_dim, **kwargs)

    @staticmethod
    def _check_input_dim(input_dim):
        """Raise `ValueError` unless `input_dim` is a sequence of 2+."""
        if not isinstance(input_dim,
                          collections.Sequence) or len(input_dim) < 2:
            raise ValueError('expected input_dim to be length >= 2 '
                             'e.g. (channels, height, width)')

    @staticmethod
    def _spatial_broadcastable(input_dim):
        """Keep the first (channel) axis, average over all the others."""
        return (False,) + ((True,) * (len(input_dim) - 1))

    def _allocate(self):
        """Validate `input_dim`, derive `broadcastable`, then allocate.

        Raises
        ------
        ValueError
            If `input_dim` is not a sequence of length at least 2.

        """
        self._check_input_dim(self.input_dim)
        # getattr: instances created before this attribute existed already
        # had `broadcastable` fixed at construction time.
        if getattr(self, '_infer_broadcastable', False):
            self.broadcastable = self._spatial_broadcastable(self.input_dim)
        super(SpatialBatchNormalization, self)._allocate()


class BatchNormalizedMLP(MLP):
    """Convenient subclass for building an MLP with batch normalization.

    Parameters
    ----------
    conserve_memory : bool, optional
        See :class:`BatchNormalization`. Defaults to `True`.

    Notes
    -----
    All other parameters are the same as :class:`~blocks.bricks.MLP`. Each
    activation brick is wrapped in a :class:`~blocks.bricks.Sequence`
    containing an appropriate :class:`BatchNormalization` brick and
    the activation that follows it.

    By default, the contained :class:`~blocks.bricks.Linear` bricks will
    not contain any biases, as they could be canceled out by the biases
    in the :class:`BatchNormalization` bricks being added. Pass
    `use_bias` with a value of `True` if you really want this for some
    reason.

    """
    @lazy(allocation=['dims'])
    def __init__(self, activations, dims, *args, **kwargs):
        # Store the flag directly so that the `conserve_memory` property
        # is readable right after construction. The property setter is
        # deliberately not used here: it walks `self.activations`, and the
        # wrapped activations are only built below.
        self._conserve_memory = kwargs.pop('conserve_memory', True)
        activations = [
            Sequence([
                BatchNormalization(
                    conserve_memory=self._conserve_memory).apply,
                act.apply
            ], name='batch_norm_activation_{}'.format(i))
            for i, act in enumerate(activations)
        ]
        # Batch normalization bricks incorporate a bias, so there's no
        # need for our Linear bricks to have them.
        kwargs.setdefault('use_bias', False)
        super(BatchNormalizedMLP, self).__init__(activations, dims, *args,
                                                 **kwargs)

    @property
    def conserve_memory(self):
        """The `conserve_memory` flag shared by the wrapped bricks."""
        return self._conserve_memory

    @conserve_memory.setter
    def conserve_memory(self, value):
        # Keep every wrapped BatchNormalization brick in sync with the
        # value stored on this brick.
        self._conserve_memory = value
        for act in self.activations:
            assert isinstance(act.children[0], BatchNormalization)
            act.children[0].conserve_memory = value

    def _push_allocation_config(self):
        """Push dimensions to the linear and batch normalization bricks."""
        super(BatchNormalizedMLP, self)._push_allocation_config()
        # Do the extra allocation pushing for the BatchNormalization
        # bricks. They need as their input dimension the output dimension
        # of each linear transformation.  Exclude the first dimension,
        # which is the input dimension.
        for act, dim in equizip(self.activations, self.dims[1:]):
            assert isinstance(act.children[0], BatchNormalization)
            act.children[0].input_dim = dim


1			import collections
2
3			import numpy
4			from picklable_itertools.extras import equizip
5			import theano
6			from theano import tensor
7			from theano.tensor.nnet import bn
8
9			from ..graph import add_annotation
10			from ..initialization import Constant
11			from ..roles import (WEIGHT, BIAS, BATCH_NORM_POPULATION_MEAN,
12			BATCH_NORM_POPULATION_STDEV, BATCH_NORM_OFFSET,
13			BATCH_NORM_DIVISOR, BATCH_NORM_MINIBATCH_ESTIMATE,
14			add_role)
15			from ..utils import (shared_floatx_zeros, shared_floatx,
16			shared_floatx_nans)
17			from .base import lazy, application
18			from .sequences import Sequence, Feedforward, MLP
19			from .interfaces import RNGMixin
20
21
22			def _add_batch_axis(var):
23			"""Prepend a singleton axis to a TensorVariable and name it."""
24			new_var = new_var = tensor.shape_padleft(var)
25			new_var.name = 'shape_padleft({})'.format(var.name)
26			return new_var
27
28
29			def _add_role_and_annotate(var, role, annotations=()):
30			"""Add a role and zero or more annotations to a variable."""
31			add_role(var, role)
32			for annotation in annotations:
33			add_annotation(var, annotation)
34
35
36			class BatchNormalization(RNGMixin, Feedforward):
37			r"""Normalizes activations, parameterizes a scale and shift.
38
39			Parameters
40			----------
41			input_dim : int or tuple
42			Shape of a single input example. It is assumed that a batch axis
43			will be prepended to this.
44			broadcastable : tuple, optional
45			Tuple the same length as `input_dim` which specifies which of the
46			per-example axes should be averaged over to compute means and
47			standard deviations. For example, in order to normalize over all
48			spatial locations in a `(batch_index, channels, height, width)`
49			image, pass `(False, True, True)`.
50			conserve_memory : bool, optional
51			Use an implementation that stores less intermediate state and
52			therefore uses less memory, at the expense of 5-10% speed. Default
53			is `True`.
54			epsilon : float, optional
55			The stabilizing constant for the minibatch standard deviation
56			computation (when the brick is run in training mode).
57			Added to the variance inside the square root, as in the
58			batch normalization paper.
59			weights_init : object, optional
60			Initialization object to use for the learned scaling parameter
61			($\\gamma$ in [BN]_). By default, uses constant initialization
62			of 1.
63			biases_init : object, optional
64			Initialization object to use for the learned shift parameter
65			($\\beta$ in [BN]_). By default, uses constant initialization of 0.
66
67			Notes
68			-----
69			In order for trained models to behave sensibly immediately upon
70			upon deserialization, by default, this brick runs in inference mode,
71			using a population mean and population standard deviation (initialized
72			to zeros and ones respectively) to normalize activations. It is
73			expected that the user will adapt these during training in some
74			fashion, independently of the training objective, e.g. by taking a
75			moving average of minibatch-wise statistics.
76
77			In order to train with batch normalization, one must obtain a
78			training graph by transforming the original inference graph. See
79			:func:`~blocks.graph.apply_batch_normalization` for a routine to
80			transform graphs, and :func:`~blocks.graph.batch_normalization`
81			for a context manager that may enable shorter compile times
82			(every instance of :class:`BatchNormalization` is itself a context
83			manager, entry into which causes applications to be in minibatch
84			"training" mode, however it is usually more convenient to use
85			:func:`~blocks.graph.batch_normalization` to enable this behaviour
86			for all of your graph's :class:`BatchNormalization` bricks at once).
87
88			Note that training in inference mode should be avoided, as this
89			brick introduces scales and shift parameters (tagged with the
90			`PARAMETER` role) that, in the absence of batch normalization,
91			usually makes things unstable. If you must do this, filter for and
92			remove `BATCH_NORM_SHIFT` and `BATCH_NORM_SCALE` from the list of
93			parameters you are training, and this brick should behave as a
94			(somewhat expensive) no-op.
95
96			This Brick accepts `weights_init` and `biases_init` arguments but is
97			not an instance of :class:`~blocks.bricks.Initializable`, and will
98			therefore not receive pushed initialization config from any parent
99			brick. In almost all cases, you will probably want to stick with the
100			defaults (unit scale and zero offset), but you can explicitly pass one
101			or both initializers to override this.
102
103			This has the necessary properties to be inserted into a
104			:class:`blocks.bricks.conv.ConvolutionalSequence` as-is, in which case
105			the `input_dim` should be omitted at construction, to be inferred from
106			the layer below.
107
108			"""
109			@lazy(allocation=['input_dim'])
110			def __init__(self, input_dim, broadcastable=None,
111			conserve_memory=True, epsilon=1e-4, weights_init=None,
112			biases_init=None, **kwargs):
113			self.input_dim = input_dim
114			self.broadcastable = broadcastable
115			self.conserve_memory = conserve_memory
116			self.epsilon = epsilon
117			self.weights_init = (Constant(1) if weights_init is None
118			else weights_init)
119			self.biases_init = (Constant(0) if biases_init is None
120			else biases_init)
121			self._training_mode = []
122			super(BatchNormalization, self).__init__(**kwargs)
123
124			@application(inputs=['input_'], outputs=['output'])
125			def apply(self, input_, application_call):
126			if self._training_mode:
127			mean, stdev = self._compute_training_statistics(input_)
128			else:
129			mean, stdev = self._prepare_population_statistics()
130			# Useful for filtration of calls that were already made in
131			# training mode when doing graph transformations.
132			# Very important to cast to bool, as self._training_mode is
133			# normally a list (to support nested context managers), which would
134			# otherwise get passed by reference and be remotely mutated.
135			application_call.metadata['training_mode'] = bool(self._training_mode)
136			# Useful for retrieving a list of updates for population
137			# statistics. Ditch the broadcastable first axis, though, to
138			# make it the same dimensions as the population mean and stdev
139			# shared variables.
140			application_call.metadata['offset'] = mean[0]
141			application_call.metadata['divisor'] = stdev[0]
142			# Give these quantities roles in the graph.
143			_add_role_and_annotate(mean, BATCH_NORM_OFFSET,
144			[self, application_call])
145			_add_role_and_annotate(stdev, BATCH_NORM_DIVISOR,
146			[self, application_call])
147			W = _add_batch_axis(self.W)
148			b = _add_batch_axis(self.b)
149			# Heavy lifting is done by the Theano utility function.
150			normalized = bn.batch_normalization(input_, W, b, mean, stdev,
151			mode=('low_mem'
152			if self.conserve_memory
153			else 'high_mem'))
154			return normalized
155
156			def __enter__(self):
157			self._training_mode.append(True)
158
159			def __exit__(self, *exc_info):
160			self._training_mode.pop()
161
162			def _compute_training_statistics(self, input_):
163			axes = (0,) + tuple((i + 1) for i, b in
164			enumerate(self.population_mean.broadcastable)
165			if b)
166			mean = input_.mean(axis=axes, keepdims=True)
167			assert mean.broadcastable[1:] == self.population_mean.broadcastable
168			stdev = tensor.sqrt(tensor.var(input_, axis=axes, keepdims=True) +
169			numpy.cast[theano.config.floatX](self.epsilon))
170			assert stdev.broadcastable[1:] == self.population_stdev.broadcastable
171			add_role(mean, BATCH_NORM_MINIBATCH_ESTIMATE)
172			add_role(stdev, BATCH_NORM_MINIBATCH_ESTIMATE)
173			return mean, stdev
174
175			def _prepare_population_statistics(self):
176			mean = _add_batch_axis(self.population_mean)
177			stdev = _add_batch_axis(self.population_stdev)
178			return mean, stdev
179
180			def _allocate(self):
181			input_dim = ((self.input_dim,)
182			if not isinstance(self.input_dim, collections.Sequence)
183			else self.input_dim)
184			broadcastable = (tuple(False for _ in input_dim)
185			if self.broadcastable is None else self.broadcastable)
186			if len(input_dim) != len(broadcastable):
187			raise ValueError("input_dim and broadcastable must be same length")
188			var_dim = tuple(1 if broadcast else dim for dim, broadcast in
189			equizip(input_dim, broadcastable))
190			broadcastable = broadcastable
191
192			# "gamma", from the Ioffe & Szegedy manuscript.
193			self.W = shared_floatx_nans(var_dim, name='batch_norm_scale',
194			broadcastable=broadcastable)
195
196			# "beta", from the Ioffe & Szegedy manuscript.
197			self.b = shared_floatx_nans(var_dim, name='batch_norm_shift',
198			broadcastable=broadcastable)
199			add_role(self.W, WEIGHT)
200			add_role(self.b, BIAS)
201			self.parameters.append(self.W)
202			self.parameters.append(self.b)
203
204			# These aren't technically parameters, in that they should not be
205			# learned using the same cost function as other model parameters.
206			self.population_mean = shared_floatx_zeros(var_dim,
207			name='population_mean',
208			broadcastable=broadcastable)
209			self.population_stdev = shared_floatx(numpy.ones(var_dim),
210			name='population_stdev',
211			broadcastable=broadcastable)
212			add_role(self.population_mean, BATCH_NORM_POPULATION_MEAN)
213			add_role(self.population_stdev, BATCH_NORM_POPULATION_STDEV)
214
215			# Normally these would get annotated by an AnnotatingList, but they
216			# aren't in self.parameters.
217			add_annotation(self.population_mean, self)
218			add_annotation(self.population_stdev, self)
219
220			def _initialize(self):
221			self.biases_init.initialize(self.b, self.rng)
222			self.weights_init.initialize(self.W, self.rng)
223
224			# Needed for the Feedforward interface.
225			@property
226			def output_dim(self):
227			return self.input_dim
228
229			# The following properties allow for BatchNormalization bricks
230			# to be used directly inside of a ConvolutionalSequence.
231			@property
232			def image_size(self):
233			return self.input_dim[-2:]
234
235			@image_size.setter
236			def image_size(self, value):
237			if not isinstance(self.input_dim, collections.Sequence):
238			self.input_dim = (None,) + tuple(value)
239			else:
240			self.input_dim = (self.input_dim[0],) + tuple(value)
241
242			@property
243			def num_channels(self):
244			return self.input_dim[0]
245
246			@num_channels.setter
247			def num_channels(self, value):
248			if not isinstance(self.input_dim, collections.Sequence):
249			self.input_dim = (value,) + (None, None)
250			else:
251			self.input_dim = (value,) + self.input_dim[-2:]
252
253			def get_dim(self, name):
254			if name in ('input', 'output'):
255			return self.input_dim
256			else:
257			raise KeyError
258
259			@property
260			def num_output_channels(self):
261			return self.num_channels
262
263
264			class SpatialBatchNormalization(BatchNormalization):
265			"""Convenient subclass for batch normalization across spatial inputs.
266
267			Parameters
268			----------
269			input_dim : int or tuple
270			The input size of a single example. Must be length at least 2.
271			It's assumed that the first axis of this tuple is a "channels"
272			axis, which should not be summed over, and all remaining
273			dimensions are spatial dimensions.
274
275			Notes
276			-----
277			See :class:`BatchNormalization` for more details (and additional
278			keyword arguments).
279
280			"""
281			@lazy(allocation=['input_dim'])
282			def __init__(self, input_dim, **kwargs):
283			if not isinstance(input_dim,
284			collections.Sequence) or len(input_dim) < 2:
285			raise ValueError('expected input_dim to be length >= 2 '
286			'e.g. (channels, height, width)')
287			broadcastable = (False,) + ((True,) * (len(input_dim) - 1))
288			kwargs.setdefault('broadcastable', broadcastable)
289			super(SpatialBatchNormalization, self).__init__(input_dim, **kwargs)
290
291
292			class BatchNormalizedMLP(MLP):
293			"""Convenient subclass for building an MLP with batch normalization.
294
295			Parameters
296			----------
297			conserve_memory : bool, optional
298			See :class:`BatchNormalization`.
299
300			Notes
301			-----
302			All other parameters are the same as :class:`~blocks.bricks.MLP`. Each
303			activation brick is wrapped in a :class:`~blocks.bricks.Sequence`
304			containing an appropriate :class:`BatchNormalization` brick and
305			the activation that follows it.
306
307			By default, the contained :class:`~blocks.bricks.Linear` bricks will
308			not contain any biases, as they could be canceled out by the biases
309			in the :class:`BatchNormalization` bricks being added. Pass
310			`use_bias` with a value of `True` if you really want this for some
311			reason.
312
313			"""
314			@lazy(allocation=['dims'])
315			def __init__(self, activations, dims, *args, **kwargs):
316			conserve_memory = kwargs.pop('conserve_memory', True)
317			activations = [
318			Sequence([
319			BatchNormalization(conserve_memory=conserve_memory).apply,
320			act.apply
321			], name='batch_norm_activation_{}'.format(i))
322			for i, act in enumerate(activations)
323			]
324			# Batch normalization bricks incorporate a bias, so there's no
325			# need for our Linear bricks to have them.
326			kwargs.setdefault('use_bias', False)
327			super(BatchNormalizedMLP, self).__init__(activations, dims, *args,
328			**kwargs)
329
330			@property
331			def conserve_memory(self):
332			return self._conserve_memory
333
334			@conserve_memory.setter
335			def conserve_memory(self, value):
336			self._conserve_memory = value
337			for act in self.activations:
338			assert isinstance(act.children[0], BatchNormalization)
339			act.children[0].conserve_memory = value
340
341			def _push_allocation_config(self):
342			super(BatchNormalizedMLP, self)._push_allocation_config()
343			# Do the extra allocation pushing for the BatchNormalization
344			# bricks. They need as their input dimension the output dimension
345			# of each linear transformation. Exclude the first dimension,
346			# which is the input dimension.
347			for act, dim in equizip(self.activations, self.dims[1:]):
348			assert isinstance(act.children[0], BatchNormalization)
349			act.children[0].input_dim = dim
350

mila-udem / blocks

Pull Request — master (#941)

_compute_training_statistics() B

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like