| Metric | Value |
| --- | --- |
| Conditions | 16 |
| Total Lines | 158 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 1 |
| Bugs | 0 |
| Features | 0 |
Small methods make your code easier to understand, especially when combined with a good name. Besides, when a method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for its name.
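To make this concrete, here is a minimal sketch of how the commented block that builds the event sensor list inside `sensor2vec_data()` (shown further down) could be extracted. The helper name `_build_event_sensor_list` is hypothetical, but the logic and dictionary keys are taken directly from the listing below.

```python
def _build_event_sensor_list(sensor_list, event_list, ignore_off=True):
    """Map each event to its sensor index, optionally skipping ``Off`` events.

    Hypothetical extraction of the block labeled "Generate event sensor list"
    in sensor2vec_data(); the dict keys match the ones used there.
    """
    sensor_dict = {sensor['name']: i for i, sensor in enumerate(sensor_list)}
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    return event_sensor_list
```

The inline comment then survives as the method name instead of as a note buried inside a long body.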
Commonly applied refactorings include Extract Method. If many parameters or temporary variables are present, refactorings that bundle them, such as Introduce Parameter Object, are also worth considering; a sketch of that idea follows below.
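As an illustration of the parameter-heavy case: `sensor2vec_data()` (reviewed below) takes nine arguments, seven of which only configure skip-gram training. A hypothetical parameter object could bundle them. The class name, the use of a Python 3.7+ dataclass, and the `config=` keyword in the commented call are all illustrative; the field names and defaults mirror the current signature.

```python
from dataclasses import dataclass


@dataclass
class SkipGramConfig:
    """Hypothetical parameter object bundling the training knobs of sensor2vec_data()."""
    embedding_size: int = 20
    batch_size: int = 128
    num_skips: int = 8
    skip_window: int = 5
    num_neg_samples: int = 64
    learning_rate: float = 1.0
    ignore_off: bool = True


# The call site would then shrink to the data plus one config object, e.g.:
# sensor2vec_data(sensor_list, event_list, config=SkipGramConfig(embedding_size=32))
```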
Complex classes like sensor2vec_data() often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
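In the listing below there are no class fields to regroup, but the prefix heuristic still applies: `nce_weights` and `nce_biases` share the `nce_` prefix and only ever appear together in the loss computation. If this code were reorganized around a class, they would be natural candidates for an extracted component. A minimal, hypothetical sketch (TF 1.x API, as in the listing):

```python
import math

import tensorflow as tf  # TF 1.x API, matching the listing below


class NceLossHead:
    """Hypothetical extracted component grouping the nce_-prefixed variables."""

    def __init__(self, num_sensors, embedding_size):
        # Same initializers as in sensor2vec_data()
        self.weights = tf.Variable(
            tf.truncated_normal([num_sensors, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        self.biases = tf.Variable(tf.zeros([num_sensors]))

    def mean_loss(self, embed, train_labels, num_neg_samples, num_sensors):
        # Average NCE loss over the batch, as in the original code
        return tf.reduce_mean(
            tf.nn.nce_loss(weights=self.weights,
                           biases=self.biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))
```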
```python
import math

# ... (source lines 2-93 are omitted in this excerpt; `np`, `tf`, and
# `SkipGramInjector` are assumed to be imported/defined there)


def sensor2vec_data(sensor_list, event_list, embedding_size=20,
                    batch_size=128, num_skips=8, skip_window=5,
                    num_neg_samples=64, learning_rate=1.0, ignore_off=True):
    """Transform sensors into a high-dimensional vector space.

    Similar to the word embeddings used in natural language processing systems,
    we want to represent sensors in a synthesized vector space as well, instead
    of using arbitrary labels for each sensor that carry no useful information.

    The methods used to find word embeddings can be classified into two
    categories: count-based methods (e.g. Latent Semantic Analysis) and
    predictive models. This implementation maps sensors into a high-dimensional
    vector space using a skip-gram model with negative sampling.

    Args:
        sensor_list (:obj:`list` of :obj:`dict`): List of dictionaries containing
            sensor information.
        event_list (:obj:`list` of :obj:`dict`): List of events.
        embedding_size (:obj:`int`): The size of the embedding vector.
        batch_size (:obj:`int`): The number of samples per training batch.
        num_skips (:obj:`int`): How many times to re-use an input to generate a
            label in the skip-gram model.
        skip_window (:obj:`int`): How many items to consider to the left and
            right in the skip-gram model.
        num_neg_samples (:obj:`int`): Number of negative samples to draw from
            the vocabulary.
        learning_rate (:obj:`float`): Learning rate of the gradient descent
            optimizer.
        ignore_off (:obj:`bool`): Ignore motion sensors with ``Off`` state in
            the event list.

    Please refer to :func:`sensor_distance` for an example of ``sensor_list``.
    Please refer to :func:`sensor_mi_distance` for an example of ``event_list``.
    """
    num_sensors = len(sensor_list)
    # Negative samples cannot exceed the number of sensors
    if num_neg_samples > num_sensors:
        num_neg_samples = num_sensors
    # Store sensor IDs in a hash table for fast index lookup
    sensor_dict = {}
    for i in range(num_sensors):
        sensor_dict[sensor_list[i]['name']] = i
    # Generate event sensor list
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    # Initialize a SkipGram injector
    injector = SkipGramInjector(event_sensor_list, batch_size, num_skips, skip_window)
    # Build training model
    graph = tf.Graph()
    with graph.as_default():
        # Input placeholders
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        # As we normally do not have too many sensors, it is OK to use all of them
        valid_dataset = tf.constant([i for i in range(num_sensors)], dtype=tf.int32)
        # Only CPU supports NCE loss
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            embeddings = tf.Variable(
                tf.random_uniform([num_sensors, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            nce_weights = tf.Variable(
                tf.truncated_normal([num_sensors, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([num_sensors]))

        # Compute the average NCE loss for the batch.
        # tf.nn.nce_loss automatically draws a new sample of the negative labels
        # each time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))

        # Construct the optimizer
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)

        # Add variable initializer.
        init = tf.initialize_all_variables()

    # Begin training.
    num_steps = 100001

    with tf.Session(graph=graph) as session:
        # We must initialize all variables before we use them.
        init.run()
        print("Initialized")

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = injector.next_batch()
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # We perform one update step by evaluating the optimizer op (including
            # it in the list of returned values for session.run())
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(num_sensors):
                    valid_sensor = sensor_list[i]['name']
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = "Nearest to %s:" % valid_sensor
                    for k in range(top_k):
                        close_sensor = sensor_list[nearest[k]]['name']
                        log_str = "%s %s," % (log_str, close_sensor)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()
        final_similarity = 1 - similarity.eval()
        distance_matrix = final_similarity / np.max(final_similarity, axis=1)[:, None]

    # try:
    #     from sklearn.manifold import TSNE
    #     import matplotlib.pyplot as plt
    #
    #     tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    #     low_dim_embs = tsne.fit_transform(final_embeddings)
    #     labels = [sensor_list[i]['name'] for i in range(num_sensors)]
    #
    #     assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    #     plt.figure(figsize=(18, 18))  # in inches
    #     for i, label in enumerate(labels):
    #         x, y = low_dim_embs[i, :]
    #         plt.scatter(x, y)
    #         plt.annotate(label,
    #                      xy=(x, y),
    #                      xytext=(5, 2),
    #                      textcoords='offset points',
    #                      ha='right',
    #                      va='bottom')
    #     plt.show()
    # except ImportError:
    #     print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

    return final_embeddings, distance_matrix
```
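For completeness, a hypothetical call of the function above. Only the dictionary keys (`name`, `sensor_id`, `sensor_status`) are dictated by the code; the sensor names and events are made up, and a real event list would need to be far longer for the skip-gram batches to be meaningful.

```python
# Toy inputs; keys follow what sensor2vec_data() reads, values are invented.
sensor_list = [{'name': 'M001'}, {'name': 'M002'}, {'name': 'M003'}]
event_list = [
    {'sensor_id': 'M001', 'sensor_status': 'ON'},
    {'sensor_id': 'M002', 'sensor_status': 'ON'},
    {'sensor_id': 'M001', 'sensor_status': 'OFF'},  # skipped when ignore_off=True
    {'sensor_id': 'M003', 'sensor_status': 'ON'},
]

embeddings, distances = sensor2vec_data(sensor_list, event_list,
                                        embedding_size=8, num_neg_samples=2)
# `embeddings`: (num_sensors, embedding_size) array of unit-norm sensor vectors.
# `distances`: per-row normalized cosine-distance matrix between sensors.
```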