fuel.transformers.Window._set_index() - Code Metrics - Inspection of "Add Window transformer" - mila-udem/fuel - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#310)

by Bart

created 2016-02-08 17:25 UTC

fuel.transformers.Window._set_index() A

↳ Parent: fuel.transformers.Window

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Metric	Value
cc	1
dl	0
loc	5
rs	9.4285

from fuel.transformers import Transformer


class Window(Transformer):
    """Return pairs of source and target windows from a stream.

    This data stream wrapper takes as an input a data stream outputting
    sequences of potentially varying lengths (e.g. sentences, audio tracks,
    etc.). It then returns two sliding windows (source and target) over
    these sequences.

    For example, to train an n-gram model set `source_window` to n,
    `target_window` to 1, no offset, and `overlapping` to false. This will
    give chunks [1, N] and [N + 1]. To train an RNN you often want to set
    the source and target window to the same size and use an offset of 1
    with overlap, this would give you chunks [1, N] and [2, N + 1].

    Parameters
    ----------
    offset : int
        The offset from the source window where the target window starts.
    source_window : int
        The size of the source window.
    target_window : int
        The size of the target window.
    overlapping : bool
        If true, the source and target windows overlap i.e. the offset of
        the target window is taken to be from the beginning of the source
        window. If false, the target window offset is taken to be from the
        end of the source window.
    data_stream : :class:`.DataStream` instance
        The data stream providing sequences. Each example is assumed to be
        an object that supports slicing.
    target_source : str, optional
        This data stream adds a new source for the target words. By default
        this source is 'targets'.

    """
    def __init__(self, offset, source_window, target_window,
                 overlapping, data_stream, target_source='targets', **kwargs):
        if not data_stream.produces_examples:
            raise ValueError('the wrapped data stream must produce examples, '
                             'not batches of examples.')
        if len(data_stream.sources) > 1:
            raise ValueError('{} expects only one source'
                             .format(self.__class__.__name__))

        super(Window, self).__init__(data_stream, produces_examples=True,
                                     **kwargs)
        self.sources = self.sources + (target_source,)

        self.offset = offset
        self.source_window = source_window
        self.target_window = target_window
        self.overlapping = overlapping

        self.sentence = []
        self._set_index()

    def _set_index(self):
        """Set the starting index of the source window."""
        self.index = 0
        # If offset is negative, target window might start before 0
        self.index = -min(0, self._get_target_index())

    def _get_target_index(self):
        """Return the index where the target window starts."""
        return (self.index + self.source_window * (not self.overlapping) +
                self.offset)

    def _get_end_index(self):
        """Return the end of both windows."""
        return max(self.index + self.source_window,
                   self._get_target_index() + self.target_window)

    def get_data(self, request=None):
        if request is not None:
            raise ValueError
        while not self._get_end_index() <= len(self.sentence):
            self.sentence, = next(self.child_epoch_iterator)
            self._set_index()
        source = self.sentence[self.index:self.index + self.source_window]
        target = self.sentence[self._get_target_index():
                               self._get_target_index() + self.target_window]
        self.index += 1
        return (source, target)


class NGrams(Window):
    """Return n-grams from a stream.

    This data stream wrapper takes as an input a data stream outputting
    sentences. From these sentences n-grams of a fixed order (e.g. bigrams,
    trigrams, etc.) are extracted and returned. It also creates a
    ``targets`` data source. For each example, the target is the word
    immediately following that n-gram. It is normally used for language
    modeling, where we try to predict the next word from the previous *n*
    words.

    .. note::

       Unlike the :class:`Window` stream, the target returned by
       :class:`NGrams` is a single element instead of a window.

    Parameters
    ----------
    ngram_order : int
        The order of the n-grams to output e.g. 3 for trigrams.
    data_stream : :class:`.DataStream` instance
        The data stream providing sentences. Each example is assumed to be
        a list of integers.
    target_source : str, optional
        This data stream adds a new source for the target words. By default
        this source is 'targets'.

    """
    def __init__(self, ngram_order, *args, **kwargs):
        super(NGrams, self).__init__(
            0, ngram_order, 1, False, *args, **kwargs)

    def get_data(self, *args, **kwargs):
        source, target = super(NGrams, self).get_data(*args, **kwargs)
        return (source, target[0])


1			from fuel.transformers import Transformer
2
3
4			class Window(Transformer):
5			"""Return pairs of source and target windows from a stream.
6
7			This data stream wrapper takes as an input a data stream outputting
8			sequences of potentially varying lengths (e.g. sentences, audio tracks,
9			etc.). It then returns two sliding windows (source and target) over
10			these sequences.
11
12			For example, to train an n-gram model set `source_window` to n,
13			`target_window` to 1, no offset, and `overlapping` to false. This will
14			give chunks [1, N] and [N + 1]. To train an RNN you often want to set
15			the source and target window to the same size and use an offset of 1
16			with overlap, this would give you chunks [1, N] and [2, N + 1].
17
18			Parameters
19			----------
20			offset : int
21			The offset from the source window where the target window starts.
22			source_window : int
23			The size of the source window.
24			target_window : int
25			The size of the target window.
26			overlapping : bool
27			If true, the source and target windows overlap i.e. the offset of
28			the target window is taken to be from the beginning of the source
29			window. If false, the target window offset is taken to be from the
30			end of the source window.
31			data_stream : :class:`.DataStream` instance
32			The data stream providing sequences. Each example is assumed to be
33			an object that supports slicing.
34			target_source : str, optional
35			This data stream adds a new source for the target words. By default
36			this source is 'targets'.
37
38			"""
39			def __init__(self, offset, source_window, target_window,
40			overlapping, data_stream, target_source='targets', **kwargs):
41			if not data_stream.produces_examples:
42			raise ValueError('the wrapped data stream must produce examples, '
43			'not batches of examples.')
44			if len(data_stream.sources) > 1:
45			raise ValueError('{} expects only one source'
46			.format(self.__class__.__name__))
47
48			super(Window, self).__init__(data_stream, produces_examples=True,
49			**kwargs)
50			self.sources = self.sources + (target_source,)
51
52			self.offset = offset
53			self.source_window = source_window
54			self.target_window = target_window
55			self.overlapping = overlapping
56
57			self.sentence = []
58			self._set_index()
59
60			def _set_index(self):
61			"""Set the starting index of the source window."""
62			self.index = 0
63			# If offset is negative, target window might start before 0
64			self.index = -min(0, self._get_target_index())
65
66			def _get_target_index(self):
67			"""Return the index where the target window starts."""
68			return (self.index + self.source_window * (not self.overlapping) +
69			self.offset)
70
71			def _get_end_index(self):
72			"""Return the end of both windows."""
73			return max(self.index + self.source_window,
74			self._get_target_index() + self.target_window)
75
76			def get_data(self, request=None):
77			if request is not None:
78			raise ValueError
79			while not self._get_end_index() <= len(self.sentence):
80			self.sentence, = next(self.child_epoch_iterator)
81			self._set_index()
82			source = self.sentence[self.index:self.index + self.source_window]
83			target = self.sentence[self._get_target_index():
84			self._get_target_index() + self.target_window]
85			self.index += 1
86			return (source, target)
87
88
89			class NGrams(Window):
90			"""Return n-grams from a stream.
91
92			This data stream wrapper takes as an input a data stream outputting
93			sentences. From these sentences n-grams of a fixed order (e.g. bigrams,
94			trigrams, etc.) are extracted and returned. It also creates a
95			``targets`` data source. For each example, the target is the word
96			immediately following that n-gram. It is normally used for language
97			modeling, where we try to predict the next word from the previous n
98			words.
99
100			.. note::
101
102			Unlike the :class:`Window` stream, the target returned by
103			:class:`NGrams` is a single element instead of a window.
104
105			Parameters
106			----------
107			ngram_order : int
108			The order of the n-grams to output e.g. 3 for trigrams.
109			data_stream : :class:`.DataStream` instance
110			The data stream providing sentences. Each example is assumed to be
111			a list of integers.
112			target_source : str, optional
113			This data stream adds a new source for the target words. By default
114			this source is 'targets'.
115
116			"""
117			def __init__(self, ngram_order, args, *kwargs):
118			super(NGrams, self).__init__(
119			0, ngram_order, 1, False, args, *kwargs)
120
121			def get_data(self, args, *kwargs):
122			source, target = super(NGrams, self).get_data(args, *kwargs)
123			return (source, target[0])
124

mila-udem / fuel

Pull Request — master (#310)

fuel.transformers.Window._set_index() A

Complexity

Size

Duplication

Duplication Side-by-Side

Filter issues like