Completed
Push — master ( 3e1d4c...f31f72 )
by Bart
27s
created

OneBillionWord.__init__()   C

Complexity

Conditions 9

Size

Total Lines 20

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 9
dl 0
loc 20
rs 6.4615
1
import os
2
3
from fuel.datasets import TextFile
4
from fuel.utils import find_in_data_path
5
6
7
class OneBillionWord(TextFile):
    """Google's One Billion Word benchmark.

    This monolingual corpus contains 829,250,940 tokens (including sentence
    boundary markers). The data is split into 100 partitions, one of which
    is the held-out set. This held-out set is further divided into 50
    partitions. More information about the dataset can be found in
    [CMSG14]_.

    .. [CMSG14] Ciprian Chelba, Tomas Mikolov, Mike Schuster, Qi Ge, and
       Thorsten Brants, *One Billion Word Benchmark for Measuring Progress
       in Statistical Language Modeling*, `arXiv:1312.3005 [cs.CL]
       <http://arxiv.org/abs/1312.3005>`_.

    Parameters
    ----------
    which_set : 'training' or 'heldout'
        Which dataset to load.
    which_partitions : iterable of ints
        For the training set, valid values must lie in [1, 99]. For the
        heldout set they must be in [0, 49]. Any iterable is accepted; it
        is consumed exactly once.
    dictionary : dict
        A dictionary mapping tokens to integers. This dictionary is
        expected to contain the tokens ``<S>``, ``</S>`` and ``<UNK>``,
        representing "start of sentence", "end of sentence", and
        "out-of-vocabulary" (OoV). The latter will be used whenever a token
        cannot be found in the dictionary.
    preprocess : function, optional
        A function that takes a string (a sentence including new line) as
        an input and returns a modified string. A useful function to pass
        could be ``str.lower``. Forwarded to :class:`TextFile` through
        ``**kwargs``.

    See :class:`TextFile` for remaining keyword arguments.

    Raises
    ------
    ValueError
        If `which_set` is not one of the two supported names, or if any
        requested partition lies outside the valid range for that set.

    """
    # Per-set layout: (sub-directory, file name template, first valid
    # partition, one past the last valid partition).
    _SPLITS = {
        'training': ('training-monolingual.tokenized.shuffled',
                     'news.en-{:05d}-of-00100', 1, 100),
        'heldout': ('heldout-monolingual.tokenized.shuffled',
                    'news.en.heldout-{:05d}-of-00050', 0, 50),
    }

    def __init__(self, which_set, which_partitions, dictionary, **kwargs):
        if which_set not in ('training', 'heldout'):
            raise ValueError(
                "which_set must be 'training' or 'heldout', "
                "got {!r}".format(which_set))
        directory, template, first, stop = self._SPLITS[which_set]

        # Materialize once: the partitions are walked twice below, so a
        # generator would otherwise pass validation and then produce an
        # empty list of files.
        which_partitions = list(which_partitions)

        # Validate every partition before touching the file system.
        valid = range(first, stop)
        invalid = [partition for partition in which_partitions
                   if partition not in valid]
        if invalid:
            raise ValueError(
                "invalid partitions {!r} for the {} set; valid values lie "
                "in [{}, {}]".format(invalid, which_set, first, stop - 1))

        files = [find_in_data_path(os.path.join(
            '1-billion-word', directory, template.format(partition)))
            for partition in which_partitions]
        super(OneBillionWord, self).__init__(files, dictionary, **kwargs)
62