#!/usr/bin/env python
# -*- coding: utf-8 -*-

from . import Dataset
from .data_processor import DataProcessor
from deepy.utils import FakeGenerator, StreamPickler, global_rand

import logging as loggers
logging = loggers.getLogger(__name__)


class OnDiskDataset(Dataset):
    """
    Load a large on-disk dataset.

    The data should be dumped with deepy.utils.StreamPickler.
    You must convert the data into mini-batches before dumping it to a file.
    See the usage sketch at the bottom of this module.
    """

    def __init__(self, train_path, valid_path=None, test_path=None, train_size=None,
                 cached=False, post_processing=None, shuffle_memory=False, data_processor=None):
        """
        :type data_processor: DataProcessor
        """
        self._train_path = train_path
        self._valid_path = valid_path
        self._test_path = test_path
        self._train_size = train_size
        self._cache_on_memory = cached
        self._cached_train_data = None
        self._post_processing = post_processing if post_processing else lambda x: x
        self._shuffle_memory = shuffle_memory
        self._epoch = 0
        self._data_processor = data_processor
        if data_processor and not isinstance(data_processor, DataProcessor):
            raise TypeError("data_processor must be an instance of DataProcessor.")
        if self._cache_on_memory:
            logging.info("Cache on memory")
            # Load the whole training set eagerly and post-process it once;
            # pickled mini-batches are read in binary mode.
            self._cached_train_data = list(map(self._post_processing, StreamPickler.load(open(self._train_path, "rb"))))
            self._train_size = len(self._cached_train_data)
            if self._shuffle_memory:
                logging.info("Shuffle on-memory data")
                global_rand.shuffle(self._cached_train_data)

    def _process_data(self, split, epoch, dataset):
        if self._data_processor:
            return self._data_processor.process(split, epoch, dataset)
        else:
            return dataset

    def generate_train_data(self):
        self._epoch += 1
        if self._cache_on_memory:
            # Cached batches were already post-processed when loaded in __init__.
            for data in self._process_data('train', self._epoch, self._cached_train_data):
                yield data
        else:
            data_source = StreamPickler.load(open(self._train_path, "rb"))
            for data in self._process_data('train', self._epoch, data_source):
                yield self._post_processing(data)

    def generate_valid_data(self):
        data_source = StreamPickler.load(open(self._valid_path, "rb"))
        for data in self._process_data('valid', self._epoch, data_source):
            yield self._post_processing(data)

    def generate_test_data(self):
        data_source = StreamPickler.load(open(self._test_path, "rb"))
        for data in self._process_data('test', self._epoch, data_source):
            yield self._post_processing(data)

    def train_set(self):
        if not self._train_path:
            return None
        return FakeGenerator(self, "generate_train_data")

    def valid_set(self):
        if not self._valid_path:
            return None
        return FakeGenerator(self, "generate_valid_data")

    def test_set(self):
        if not self._test_path:
            return None
        return FakeGenerator(self, "generate_test_data")

    def train_size(self):
        return self._train_size
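
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). The file paths are
# hypothetical, and it assumes the mini-batches in them were dumped with
# deepy.utils.StreamPickler, that FakeGenerator is iterable (which is how
# train_set()/valid_set() are meant to be consumed), and that DataProcessor
# subclasses can be built with a no-argument constructor.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class ShuffleProcessor(DataProcessor):
        """Example processor: reshuffle the training mini-batches every epoch."""

        def process(self, split, epoch, dataset):
            if split != 'train':
                return dataset
            batches = list(dataset)
            global_rand.shuffle(batches)
            return batches

    dataset = OnDiskDataset("train.pkl", valid_path="valid.pkl",
                            data_processor=ShuffleProcessor())
    # Each yielded item is one pre-built mini-batch.
    for batch in dataset.train_set():
        print(batch)
        break  # peek at the first mini-batch only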