|
1
|
|
|
#!/usr/bin/env python |
|
2
|
|
|
# -*- coding: utf-8 -*- |
|
3
|
|
|
|
|
4
|
|
|
import numpy as np |
|
5
|
|
|
from deepy.utils import FLOATX |
|
6
|
|
|
|
|
7
|
|
|
def pad_dataset(subset, side, length):
    """
    Pad data set to specified length.

    The first element of the first input sequence decides how the data is
    handled: a scalar number dispatches to ``_pad_2d``, anything else
    dispatches to ``_pad_3d``.

    Parameters:
        subset - sequence of (x, y) pairs
        side - side on which the padding is inserted, "left" or "right"
        length - max length, adjusted to the max length of x in the subset
                 if length is -1

    Returns:
        The list of padded (x, y) pairs built by the selected helper, or an
        empty list when subset is empty.
    """
    assert length == -1 or length > 0
    # Nothing to inspect or pad; also avoids an IndexError on subset[0] below.
    if len(subset) == 0:
        return []
    # isinstance (instead of an exact type() comparison) also recognises
    # numpy scalars and int/float subclasses as plain numbers, so such data
    # is no longer misrouted to the 3D branch.
    if isinstance(subset[0][0][0], (float, int, np.number)):
        return _pad_2d(subset, side, length)
    else:
        return _pad_3d(subset, side, length)
|
18
|
|
|
|
|
19
|
|
|
def _pad_flat_list(seq, side, max_len):
    """Clip or zero-pad the list ``seq`` so that it has ``max_len`` items."""
    missing = max_len - len(seq)
    if missing < 0:
        return seq[:max_len]
    if missing == 0:
        return seq
    if side == "left":
        return [0] * missing + seq
    if side == "right":
        return seq + [0] * missing
    # Padding is required but the side is unknown: fail loudly instead of
    # silently returning a sequence of the wrong length.
    raise ValueError("Side of padding must be 'left' or 'right'")


def _pad_2d(subset, side, length):
    """
    Pad (x, y) pairs whose x is a flat list of numbers.

    Parameters:
        subset - sequence of (x, y) pairs; x is a list, y is either a
                 sequence (clipped/padded like x) or a scalar target (kept
                 as it is)
        side - "left" or "right", the side on which zeros are inserted
        length - target length, or -1 to use the longest x in subset

    Returns:
        A new list of (x, y) pairs clipped or zero-padded to the target
        length.

    Raises:
        ValueError - if a sequence needs padding and side is neither "left"
                     nor "right"
    """
    # max() of an empty list raises, so handle the empty subset up front.
    if len(subset) == 0:
        return []
    max_len = max([len(x) for x, _ in subset]) if length == -1 else length
    new_set = []
    for x, y in subset:
        # Scalar targets (e.g. a class label) have no length and are passed
        # through untouched, matching what _pad_3d does for non-list targets.
        if hasattr(y, "__len__"):
            y = _pad_flat_list(y, side, max_len)
        x = _pad_flat_list(x, side, max_len)
        new_set.append((x, y))
    return new_set
|
39
|
|
|
|
|
40
|
|
|
def _pad_target_list(target, side, max_len):
    """Clip or zero-pad the list ``target`` so that it has ``max_len`` items."""
    missing = max_len - len(target)
    if missing < 0:
        return target[:max_len]
    if missing == 0:
        return target
    if side == "left":
        return [0] * missing + target
    if side == "right":
        return target + [0] * missing
    raise ValueError("Side of padding must be 'left' or 'right'")


def _pad_3d(subset, side, length):
    """
    Pad (x, y) pairs whose x is a sequence of row vectors.

    Parameters:
        subset - sequence of (x, y) pairs; the rows of x must expose
                 ``.shape`` (the first row of the first x gives the row
                 size); y is clipped/padded only when it is a list
        side - "left" or "right", the side on which zero rows are inserted
        length - target length, or -1 to use the longest x in subset

    Returns:
        A new list of (x, y) pairs; every x is clipped to the target length
        or stacked with zero rows of dtype FLOATX to reach it.

    Raises:
        ValueError - if padding is needed and side is neither "left" nor
                     "right"
    """
    # Nothing to pad; also avoids an IndexError on subset[0] below.
    if len(subset) == 0:
        return []
    row_size = subset[0][0][0].shape[0]
    max_len = max([len(x) for x, _ in subset]) if length == -1 else length
    new_set = []
    for x, y in subset:
        if isinstance(y, list):
            # Clip or pad the target vector
            y = _pad_target_list(y, side, max_len)
        if len(x) > max_len:
            x = x[:max_len]
        elif len(x) < max_len:
            # Validate before allocating anything. The exception must be
            # raised: returning it would hand the caller an Exception object
            # in place of the padded data set.
            if side not in ("left", "right"):
                raise ValueError("Side of padding must be 'left' or 'right'")
            pad_length = max_len - len(x)
            pad_matrix = np.zeros((pad_length, row_size), dtype=FLOATX)
            if side == "left":
                x = np.vstack([pad_matrix, x])
            else:
                x = np.vstack([x, pad_matrix])
        new_set.append((x, y))
    return new_set
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.