|
1
|
|
|
import os |
|
2
|
|
|
|
|
3
|
|
|
from fuel.datasets import TextFile |
|
4
|
|
|
from fuel.utils import find_in_data_path |
|
5
|
|
|
|
|
6
|
|
|
|
|
7
|
|
|
class OneBillionWord(TextFile):
    """Google's One Billion Word benchmark.

    This monolingual corpus contains 829,250,940 tokens (including sentence
    boundary markers). The data is split into 100 partitions, one of which
    is the held-out set. This held-out set is further divided into 50
    partitions. More information about the dataset can be found in
    [CMSG14].

    .. [CMSG14] Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, and
       Thorsten Brants, *One Billion Word Benchmark for Measuring Progress
       in Statistical Language Modeling*, `arXiv:1312.3005 [cs.CL]
       <http://arxiv.org/abs/1312.3005>`.

    Parameters
    ----------
    which_set : 'training' or 'heldout'
        Which dataset to load.
    which_partitions : list of ints
        For the training set, valid values must lie in [1, 99]. For the
        heldout set they must be in [0, 49].
    dictionary : dict
        A dictionary mapping tokens to integers. This dictionary is
        expected to contain the tokens ``<S>``, ``</S>`` and ``<UNK>``,
        representing "start of sentence", "end of sentence", and
        "out-of-vocabulary" (OoV). The latter will be used whenever a token
        cannot be found in the vocabulary.
    preprocess : function, optional
        A function that takes a string (a sentence including new line) as
        an input and returns a modified string. A useful function to pass
        could be ``str.lower``.

    See :class:`TextFile` for remaining keyword arguments.

    """
    def __init__(self, which_set, which_partitions, dictionary, **kwargs):
        # Fail fast with informative messages (the exception type is
        # unchanged, so existing ``except ValueError`` handlers still work).
        if which_set not in ('training', 'heldout'):
            raise ValueError(
                "which_set should be 'training' or 'heldout', got {!r}"
                .format(which_set))
        if which_set == 'training':
            # Training partitions are numbered 1-99; partition 100 is the
            # held-out set and is addressed via which_set='heldout'.
            if not all(partition in range(1, 100)
                       for partition in which_partitions):
                raise ValueError(
                    "training partitions must lie in [1, 99], got {!r}"
                    .format(which_partitions))
            files = [find_in_data_path(os.path.join(
                '1-billion-word', 'training-monolingual.tokenized.shuffled',
                'news.en-{:05d}-of-00100'.format(partition)))
                for partition in which_partitions]
        else:
            # Held-out partitions are numbered 0-49.
            if not all(partition in range(50)
                       for partition in which_partitions):
                raise ValueError(
                    "heldout partitions must lie in [0, 49], got {!r}"
                    .format(which_partitions))
            files = [find_in_data_path(os.path.join(
                '1-billion-word', 'heldout-monolingual.tokenized.shuffled',
                'news.en.heldout-{:05d}-of-00050'.format(partition)))
                for partition in which_partitions]
        super(OneBillionWord, self).__init__(files, dictionary, **kwargs)