|
1
|
|
|
import h5py |
|
2
|
|
|
import logging |
|
3
|
|
|
import dateutil.parser |
|
4
|
|
|
import numpy as np |
|
5
|
|
|
from collections import OrderedDict |
|
6
|
|
|
|
|
7
|
|
|
logger = logging.getLogger(__name__) |
|
8
|
|
|
|
|
9
|
|
|
|
|
10
|
|
|
class CASASHDF5:
    """Create and retrieve CASAS smart home data stored in an HDF5 file.

    The data saved to or retrieved from the HDF5 file are pre-calculated features
    produced by the :class:`CASASData` class. The file also contains meta-data about
    the dataset, which include a description for each feature, splits by week and/or
    splits by day.

    The object can be used as a context manager; the underlying file is closed on exit.

    Attributes:
        _file (:class:`h5py.File`): :class:`h5py.File` object that represents root group.

    Args:
        filename (:obj:`str`): HDF5 file name.
        mode (:obj:`str`): 'r' to load an existing file, 'w' to create a new HDF5 data file.
        driver (:obj:`str`): Passed through unchanged to :class:`h5py.File`.

    Raises:
        ValueError: If ``mode`` is neither 'r' nor 'w'.
    """
    def __init__(self, filename, mode='r', driver=None):
        # Validate before touching the file system so that an unsupported mode
        # never opens (or creates) a file that would then be left dangling.
        if mode not in ('r', 'w'):
            raise ValueError('mode should be \'w\' or \'r\', but got %s.' % mode)
        self._file = h5py.File(filename, mode=mode, driver=driver)
        if mode == 'w':
            self._sources = []
            self._weeks = OrderedDict()
            self._days = OrderedDict()
            self._feature_description = []
            self._target_description = []
            self._target_colors = []
            self._sensors = []
            self._comment = ''
            self._bg_target = ''
        else:
            try:
                self._load_dataset_info()
            except Exception:
                # Do not leak the open handle when the meta-data cannot be parsed.
                self._file.close()
                raise

    def __enter__(self):
        """Enter a ``with`` block and return this dataset object."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when leaving a ``with`` block."""
        self.close()

    def fetch_data(self, start_split=None, stop_split=None, pre_load=0):
        """Fetch data between start and stop splits

        Args:
            start_split (:obj:`str`): Begin of data
            stop_split (:obj:`str`): End of data
            pre_load (:obj:`int`): Load extra number of data before start split.

        Returns:
            :obj:`tuple`: ``(time, features, targets)`` sliced by the requested range.
                ``time`` is a list of parsed datetimes, the other two are whatever slicing
                the corresponding HDF5 dataset returns. An entry is ``None`` when that
                source is not present in the dataset.
        """
        start, stop = self._get_split_range(start_split, stop_split, pre_load)
        # Time stamps are stored as ISO formatted strings; parse them back to datetime.
        time_list = None
        if 'time' in self._sources:
            time_list = [dateutil.parser.parse(self._to_text(stamp))
                         for stamp in self._file['time'][start:stop]]
        # Feature array
        features = None
        if 'features' in self._sources:
            features = self._file['features'][start:stop]
        # Label array
        targets = None
        if 'targets' in self._sources:
            targets = self._file['targets'][start:stop]
        return time_list, features, targets

    # region Metadata Auxiliary Functions
    def num_sensors(self):
        """Return the number of sensors in the sensor list
        """
        return len(self._sensors)

    def get_sensor_by_index(self, i):
        """Get sensor name by index

        Args:
            i (:obj:`int`): Index to sensor

        Returns:
            :obj:`str`: Name of the sensor stored at index ``i``.
        """
        return self._sensors[i]

    def num_features(self):
        """Get number of features in the dataset
        """
        return len(self._feature_description)

    def get_feature_description_by_index(self, i):
        """Get the description of feature column :math:`i`.

        Args:
            i (:obj:`int`): Column index.

        Returns:
            :obj:`str`: Corresponding column description.
        """
        return self._feature_description[i]

    def num_targets(self):
        """Total number of target classes.

        Returns:
            :obj:`int`: Total number of target classes.
        """
        return len(self._target_description)

    def get_target_descriptions(self):
        """Get list of target descriptions

        Returns:
            :obj:`list` of :obj:`str`: List of target class description strings.
        """
        return self._target_description

    def get_target_description_by_index(self, i):
        """Get target description by class index :math:`i`.

        Args:
            i (:obj:`int`): Class index.

        Returns:
            :obj:`str`: Corresponding target class description.
        """
        return self._target_description[i]

    def get_target_colors(self):
        """Get list of target colors

        Returns:
            :obj:`list` of :obj:`str`: List of color strings, one per target class.
        """
        return self._target_colors

    def get_target_color_by_index(self, i):
        """Get the color string of target class :math:`i`.

        Args:
            i (:obj:`int`): Class index.

        Returns:
            :obj:`str`: Corresponding target class color string.
        """
        return self._target_colors[i]

    def is_bg_target(self, i=None, label=None):
        """Check if the target class given by ``i`` or ``label`` is considered background

        When both are given, ``i`` takes precedence.

        Args:
            i (:obj:`int`): Class index.
            label (:obj:`str`): Class name.

        Returns:
            :obj:`bool`: True if it is considered background. False when neither
                ``i`` nor ``label`` is provided.

        Raises:
            ValueError: If queried by index while the background target is not one of
                the target descriptions.
        """
        if i is not None:
            return i == self._target_description.index(self._bg_target)
        if label is not None:
            return label == self._bg_target
        return False

    def get_bg_target(self):
        """Get the description of the target class considered background in the dataset.

        Returns:
            :obj:`str`: Name of the class which is considered background in the dataset. Usually it is 'Other_Activity'.
        """
        return self._bg_target

    def get_bg_target_id(self):
        """Get the id of the target class considered background.

        Returns:
            :obj:`int`: The index of the target class which is considered background in the dataset.

        Raises:
            ValueError: If the background target is not one of the target descriptions.
        """
        return self._target_description.index(self._bg_target)

    def num_between_splits(self, start_split=None, stop_split=None):
        """Get the number of item between splits

        Args:
            start_split (:obj:`str`): Begin of data
            stop_split (:obj:`str`): End of data

        Returns:
            :obj:`int`: The number of items between two splits.
        """
        start, stop = self._get_split_range(start_split, stop_split)
        return stop - start

    def get_weeks_info(self):
        """Get splits by week.

        Returns:
            :obj:`List` of :obj:`tuple`: List of (key, value) tuple, where key is the name of the split and value is
                number of items in that split.
        """
        return [(name, bounds[1] - bounds[0]) for name, bounds in self._weeks.items()]

    def get_days_info(self):
        """Get splits by day.

        Returns:
            :obj:`List` of :obj:`tuple`: List of (key, value) tuple, where key is the name of the split and value is
                number of items in that split.
        """
        return [(name, bounds[1] - bounds[0]) for name, bounds in self._days.items()]
    # endregion

    # region CASASH5PY Dataset Creation
    def create_features(self, feature_array, feature_description):
        """ Create Feature Dataset

        Logs an error and does nothing if a feature dataset has already been created.

        Args:
            feature_array (:obj:`numpy.ndarray`): Numpy array holding calculated feature vectors
            feature_description (:obj:`list` of :obj:`str`): List of strings that describe each column of
                feature vectors.
        """
        if 'features' in self._sources:
            logger.error('Feature array already exists in the dataset.')
            return
        self._sources.append('features')
        self._feature_description = feature_description
        # Create feature array
        dset = self._file.create_dataset('features', data=feature_array,
                                         chunks=True, compression="gzip", compression_opts=9)
        dset.dims[0].label = 'batch'
        dset.dims[1].label = 'feature'
        # Add Feature Description as attributes
        self._file.attrs['features'] = [description.encode('utf-8')
                                        for description in feature_description]

    def create_targets(self, target_array, target_description, target_colors):
        """ Create Target Dataset

        Logs an error and does nothing if a target dataset has already been created.

        Args:
            target_array (:obj:`numpy.ndarray`): Numpy array holding target labels
            target_description (:obj:`list` of :obj:`str`): List of strings that describe each target class.
            target_colors (:obj:`list` of :obj:`str`): List of color values corresponding to each target class.
        """
        if 'targets' in self._sources:
            logger.error('Target array already exists in the dataset.')
            return
        self._sources.append('targets')
        self._target_description = target_description
        self._target_colors = target_colors
        # Create target array, stored as a single column with one row per item
        dset = self._file.create_dataset('targets', data=target_array.reshape((target_array.size, 1)))
        dset.dims[0].label = 'batch'
        dset.dims[1].label = 'target'
        # Add Target Description as attributes
        self._file.attrs['targets'] = [description.encode('utf-8')
                                       for description in target_description]
        # Add Target Color as attributes
        self._file.attrs['target_colors'] = [color_string.encode('utf-8')
                                             for color_string in target_colors]

    def create_time_list(self, time_array):
        """ Create Time List

        Logs an error and does nothing if a time list has already been created.

        Args:
            time_array (:obj:`list` of :obj:`datetime`): datetime corresponding to each feature vector in feature
                dataset.
        """
        if 'time' in self._sources:
            logger.error('Time list already exists in the dataset.')
            return
        self._sources.append('time')
        # Time stamps are stored as variable-length ISO formatted byte strings
        dt = h5py.special_dtype(vlen=bytes)
        dset = self._file.create_dataset('time', (len(time_array),), dtype=dt)
        for i, stamp in enumerate(time_array):
            dset[i] = stamp.isoformat().encode('utf-8')

    def create_splits(self, days, weeks):
        """ Create splits by days and weeks

        Split ``i`` covers the index range ``[boundaries[i], boundaries[i+1])``, so a list
        of ``n`` boundaries yields ``n - 1`` splits named ``day_<i>`` / ``week_<i>``.
        Logs an error and does nothing if splits have already been created.

        Args:
            days (:obj:`list` of :obj:`int`): Start index for each day
            weeks (:obj:`list` of :obj:`int`): Start index for week
        """
        if self._days or self._weeks:
            logger.error('Splits already exist.')
            return
        # 'week_<n>' is the longest name that can appear in either table.
        max_name_len = len('week_%d' % len(days))
        days_array, self._days = self._build_split_table('day', days, max_name_len)
        weeks_array, self._weeks = self._build_split_table('week', weeks, max_name_len)
        # Set attributes
        self._file.attrs['days'] = days_array
        self._file.attrs['weeks'] = weeks_array

    @staticmethod
    def _build_split_table(prefix, boundaries, name_len):
        """Build the stored and the in-memory representation of one family of splits.

        Args:
            prefix (:obj:`str`): Split name prefix, e.g. 'day' or 'week'.
            boundaries (:obj:`list` of :obj:`int`): Start index of each split followed by the final stop index.
            name_len (:obj:`int`): Width in bytes reserved for the split name.

        Returns:
            :obj:`tuple`: A structured :obj:`numpy.ndarray` with fields ``name``, ``start`` and ``stop``,
                and an :obj:`OrderedDict` mapping split name to ``[start, stop]``.
        """
        # An empty boundary list simply yields no splits.
        count = max(len(boundaries) - 1, 0)
        table = np.empty(
            count,
            dtype=np.dtype([
                ('name', 'S%d' % name_len),
                ('start', np.int64),
                ('stop', np.int64)]
            ))
        ranges = OrderedDict()
        for i in range(count):
            name = '%s_%d' % (prefix, i)
            table[i] = (name.encode('utf-8'), boundaries[i], boundaries[i + 1])
            ranges[name] = [boundaries[i], boundaries[i + 1]]
        return table, ranges

    def create_comments(self, comment):
        """ Add comments to dataset

        Args:
            comment (:obj:`str`): Comments to the dataset
        """
        self._comment = comment
        self._file.attrs['comment'] = comment.encode('utf-8')

    def create_sensors(self, sensors):
        """ Add sensors list to attributes

        If the sensor IDs in the dataset is not binary coded, there is a need to provide the sensor list to go along
        with the feature vectors.

        Args:
            sensors (:obj:`list` of :obj:`str`): List of sensor name corresponds to the id in the feature array.
        """
        # Keep the in-memory copy in step with the file so num_sensors() and
        # get_sensor_by_index() work right after creation.
        self._sensors = list(sensors)
        self._file.attrs['sensors'] = [sensor.encode('utf-8') for sensor in sensors]

    def set_background_target(self, target_name):
        """ Set 'target_name' as background target

        Logs an error and does nothing if a background target has already been set.

        Args:
            target_name (:obj:`str`): Name of background target
        """
        if self._bg_target != '':
            logger.error('background target label has been set to %s.', self._bg_target)
            return
        self._bg_target = target_name
        self._file.attrs['bg_target'] = target_name.encode('utf-8')

    def flush(self):
        """ Write To File

        Stores the list of created sources as an attribute, then flushes the file.
        """
        self._file.attrs['sources'] = [source.encode('utf-8') for source in self._sources]
        self._file.flush()
    # endregion

    def close(self):
        """ Close Dataset
        """
        self._file.close()

    # region InternalSupportRoutines
    @staticmethod
    def _to_text(value):
        """Return ``value`` as :obj:`str`, decoding UTF-8 when it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return str(value)

    def _get_split_range(self, start_split=None, stop_split=None, pre_load=0):
        """Get the requested splits range

        Week names are looked up before day names. Without ``start_split`` the range
        starts at 0 and, unless ``stop_split`` is given, ends at the length of the first
        source. Without ``stop_split`` the range ends where ``start_split`` ends.

        Args:
            start_split (:obj:`str`): Begin of data
            stop_split (:obj:`str`): End of data
            pre_load (:obj:`int`): Load extra number of data before start split.

        Returns:
            :obj:`tuple` of :obj:`int`: Returns a tuple of the start and stop index.

        Raises:
            ValueError: If ``start_split`` or ``stop_split`` is not a known week or day split.
        """
        # Determine the start index
        if start_split is None:
            start = 0
            stop = self._file[self._sources[0]].shape[0]
        elif start_split in self._weeks:
            start, stop = self._weeks[start_split][0], self._weeks[start_split][1]
        elif start_split in self._days:
            start, stop = self._days[start_split][0], self._days[start_split][1]
        else:
            raise ValueError('start_split error: Cannot find %s in splitting array.' % start_split)
        # Determine the stop index
        if stop_split is not None:
            if stop_split in self._weeks:
                stop = self._weeks[stop_split][1]
            elif stop_split in self._days:
                # A day name has to be resolved through the day table.
                stop = self._days[stop_split][1]
            else:
                raise ValueError('stop_split error: Cannot find %s in splitting array.' % stop_split)
        # Compensate pre-load, never reaching before the first item
        start = max(start - pre_load, 0)
        return start, stop

    def _read_text_list(self, name):
        """Return file attribute ``name`` as a list of :obj:`str`, or ``[]`` when it is absent."""
        if name not in self._file.attrs:
            return []
        return [self._to_text(item) for item in self._file.attrs[name]]

    def _load_dataset_info(self):
        """Populate attributes of current class based on meta-data from h5py file

        Every piece of meta-data is optional; missing entries fall back to an empty
        list, an empty mapping or an empty string.
        """
        attrs = self._file.attrs
        # Check sources set
        self._sources = self._read_text_list('sources')
        # Parse splits (only when both tables are present)
        self._weeks = OrderedDict()
        self._days = OrderedDict()
        if 'weeks' in attrs and 'days' in attrs:
            for row in attrs['weeks']:
                self._weeks[self._to_text(row['name'])] = [int(row['start']), int(row['stop'])]
            for row in attrs['days']:
                self._days[self._to_text(row['name'])] = [int(row['start']), int(row['stop'])]
        # Meta-data about dataset
        self._feature_description = self._read_text_list('features')
        self._target_description = self._read_text_list('targets')
        self._target_colors = self._read_text_list('target_colors')
        self._sensors = self._read_text_list('sensors')
        # Load Comments and Background task
        self._bg_target = self._to_text(attrs['bg_target']) if 'bg_target' in attrs else ''
        self._comment = self._to_text(attrs['comment']) if 'comment' in attrs else ''
    # endregion
|
447
|
|
|
|