sensor2vec()   F

Complexity:  Conditions 9
Size:        Total Lines 84
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 1, Bugs 0, Features 0

Metric  Value
cc      9        (conditions)
c       1        (changes)
b       0        (bugs)
f       0        (features)
dl      0        (duplicated lines)
loc     84       (total lines)
rs      3.6178

How to fix: Long Method

Small methods make your code easier to understand, especially when combined with a good name. Moreover, if a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that you should extract the commented part into a new method and use the comment as a starting point for its name.
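A minimal Extract Method sketch (the function names below are hypothetical and only illustrate turning a commented block into a well-named helper):

# Before: a long method with an explanatory comment
def count_active_sensors(events):
    # keep only events whose sensor status is ON
    active = []
    for e in events:
        if e['sensor_status'].upper() == 'ON':
            active.append(e)
    return len(active)

# After: the commented block becomes a helper named after the comment
def filter_on_events(events):
    """Keep only events whose sensor status is ON."""
    return [e for e in events if e['sensor_status'].upper() == 'ON']

def count_active_sensors(events):
    return len(filter_on_events(events))

The helper now documents itself, and the original method shrinks to a single readable line.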

Commonly applied refactorings include Extract Method, Replace Temp with Query, Introduce Parameter Object, and Replace Method with Method Object.

import math
import numpy as np
import tensorflow as tf
from ..learning.nn.injectors import SkipGramInjector


def sensor2vec(num_sensors, sensor_event_list, embedding_size=20,
               batch_size=128, num_skips=8, skip_window=5,
               num_neg_samples=64, learning_rate=1.0):
    """Sensor to Vector

    Train a skip-gram embedding on a list of sensor indices and return the
    normalized embeddings together with a pairwise distance matrix.

    Args:
        num_sensors (:obj:`int`): Total number of sensors.
        sensor_event_list (:obj:`list` of :obj:`int`): Sensor index for each event.

    See :func:`sensor2vec_data` for a description of the training hyper-parameters.
    """
    # Negative samples cannot exceed the number of sensors
    if num_neg_samples > num_sensors:
        num_neg_samples = num_sensors
    # Initialize a SkipGram injector that yields (input, label) training batches
    injector = SkipGramInjector(sensor_event_list, batch_size, num_skips, skip_window)
    # Build Training Model
    graph = tf.Graph()
    with graph.as_default():
        # Input placeholders
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        # As we normally do not have too many sensors - it is OK to use all of them
        valid_dataset = tf.constant([i for i in range(num_sensors)], dtype=tf.int32)
        # Only CPU supports NCE loss
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            embeddings = tf.Variable(
                tf.random_uniform([num_sensors, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            nce_weights = tf.Variable(
                tf.truncated_normal([num_sensors, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([num_sensors]))
        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))

        # Construct the Optimizer
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Begin training.
        num_steps = 100001

        with tf.Session(graph=graph) as session:
            # We must initialize all variables before we use them.
            init.run()
            print("Initialized")

            average_loss = 0
            for step in range(num_steps):
                batch_inputs, batch_labels = injector.next_batch()
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

                # We perform one update step by evaluating the optimizer op (including it
                # in the list of returned values for session.run())
                _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                average_loss += loss_val

                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    print("Average loss at step ", step, ": ", average_loss)
                    average_loss = 0

            final_embeddings = normalized_embeddings.eval()
            final_similarity = 1 - similarity.eval()
            distance_matrix = final_similarity / np.max(final_similarity, axis=1)[:, None]
    return final_embeddings, distance_matrix


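# Illustrative usage sketch (hypothetical indices and sizes; shows only the
# expected call shape of sensor2vec):
#
#     event_indices = [0, 2, 1, 3, 0, 2, 1, 0, 3, 2]  # sensor IDs already mapped to ints
#     embeddings, distances = sensor2vec(num_sensors=4,
#                                        sensor_event_list=event_indices,
#                                        embedding_size=10, batch_size=8,
#                                        num_skips=2, skip_window=1)
#     # embeddings has shape (4, 10); distances has shape (4, 4)
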
def sensor2vec_data(sensor_list, event_list, embedding_size=20,
                    batch_size=128, num_skips=8, skip_window=5,
                    num_neg_samples=64, learning_rate=1.0, ignore_off=True):
    """Transform sensors into a high-dimensional vector space

    Similar to the word embeddings used in natural language processing, we want
    to represent sensors in a synthesized vector space as well, instead of using
    an arbitrary label for each sensor that carries no useful information.

    The methods used to find word embeddings fall into two categories:
    count-based methods (e.g. Latent Semantic Analysis) and predictive models.
    This implementation maps sensors into the vector space with a skip-gram
    negative-sampling model.

    Args:
        sensor_list (:obj:`list` of :obj:`dict`): List of dictionaries containing
            sensor information.
        event_list (:obj:`list` of :obj:`dict`): List of events.
        embedding_size (:obj:`int`): The size of the embedding vector.
        batch_size (:obj:`int`): The batch size used in training.
        num_skips (:obj:`int`): How many times to re-use an input to generate a label
            in the skip-gram model.
        skip_window (:obj:`int`): How many items to consider to the left or right in
            the skip-gram model.
        num_neg_samples (:obj:`int`): Number of negative samples to draw from the vocabulary.
        learning_rate (:obj:`float`): Learning rate of the gradient descent optimizer.
        ignore_off (:obj:`bool`): Ignore motion-sensor events with ``Off`` state in the event list.

    Please refer to :func:`sensor_distance` for an example of ``sensor_list``.
    Please refer to :func:`sensor_mi_distance` for an example of ``event_list``.

    Returns:
        The normalized sensor embeddings and the pairwise distance matrix derived
        from cosine similarity.
    """
    # Put sensors in a hash table for fast index lookup
    num_sensors = len(sensor_list)
    # Negative samples cannot exceed the number of sensors
    if num_neg_samples > num_sensors:
        num_neg_samples = num_sensors
    # Store sensor IDs in a hash table for faster access
    sensor_dict = {}
    for i in range(num_sensors):
        sensor_dict[sensor_list[i]['name']] = i
    # Generate the event sensor list (one sensor index per event)
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    # Initialize a SkipGram Injector
    injector = SkipGramInjector(event_sensor_list, batch_size, num_skips, skip_window)
    # Build Training Model
    graph = tf.Graph()
    with graph.as_default():
        # Input placeholders
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        # As we normally do not have too many sensors - it is OK to use all of them
        valid_dataset = tf.constant([i for i in range(num_sensors)], dtype=tf.int32)
        # Only CPU supports NCE loss
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            embeddings = tf.Variable(
                tf.random_uniform([num_sensors, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            nce_weights = tf.Variable(
                tf.truncated_normal([num_sensors, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([num_sensors]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))

        # Construct the Optimizer
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Begin training.
        num_steps = 100001

        with tf.Session(graph=graph) as session:
            # We must initialize all variables before we use them.
            init.run()
            print("Initialized")

            average_loss = 0
            for step in range(num_steps):
                batch_inputs, batch_labels = injector.next_batch()
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

                # We perform one update step by evaluating the optimizer op (including it
                # in the list of returned values for session.run())
                _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                average_loss += loss_val

                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    # The average loss is an estimate of the loss over the last 2000 batches.
                    print("Average loss at step ", step, ": ", average_loss)
                    average_loss = 0

                # Note that this is expensive (~20% slowdown if computed every 500 steps)
                if step % 10000 == 0:
                    sim = similarity.eval()
                    for i in range(num_sensors):
                        valid_sensor = sensor_list[i]['name']
                        top_k = min(8, num_sensors - 1)  # number of nearest neighbors
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        log_str = "Nearest to %s:" % valid_sensor
                        for k in range(top_k):
                            close_sensor = sensor_list[nearest[k]]['name']
                            log_str = "%s %s," % (log_str, close_sensor)
                        print(log_str)
            final_embeddings = normalized_embeddings.eval()
            final_similarity = 1 - similarity.eval()
            distance_matrix = final_similarity / np.max(final_similarity, axis=1)[:, None]

    # try:
    #     from sklearn.manifold import TSNE
    #     import matplotlib.pyplot as plt
    #
    #     tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    #     low_dim_embs = tsne.fit_transform(final_embeddings)
    #     labels = [sensor_list[i]['name'] for i in range(num_sensors)]
    #
    #     assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    #     plt.figure(figsize=(18, 18))  # in inches
    #     for i, label in enumerate(labels):
    #         x, y = low_dim_embs[i, :]
    #         plt.scatter(x, y)
    #         plt.annotate(label,
    #                      xy=(x, y),
    #                      xytext=(5, 2),
    #                      textcoords='offset points',
    #                      ha='right',
    #                      va='bottom')
    #     plt.show()
    # except ImportError:
    #     print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

    return final_embeddings, distance_matrix
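For reference, a hedged usage sketch of sensor2vec_data: the sensor names and event records below are made up for illustration (a real event list would be far longer), but the dictionary keys ('name', 'sensor_id', 'sensor_status') are the ones the function actually reads.

sensors = [{'name': 'M001'}, {'name': 'M002'}, {'name': 'M003'}]
events = [
    {'sensor_id': 'M001', 'sensor_status': 'ON'},
    {'sensor_id': 'M002', 'sensor_status': 'ON'},
    {'sensor_id': 'M002', 'sensor_status': 'OFF'},  # skipped when ignore_off=True
    {'sensor_id': 'M003', 'sensor_status': 'ON'},
    {'sensor_id': 'M001', 'sensor_status': 'ON'},
]

embeddings, distances = sensor2vec_data(sensors, events,
                                        embedding_size=10,
                                        batch_size=8, num_skips=2,
                                        skip_window=1)
# embeddings: (3, 10) array of normalized sensor vectors
# distances:  (3, 3) matrix derived from 1 - cosine similarity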