| Metric | Value |
| --- | --- |
| Conditions | 16 |
| Total Lines | 158 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 1 |
| Bugs | 0 |
| Features | 0 |
Small methods make your code easier to understand, especially when combined with a good name. Moreover, if a method is small, finding a good name for it is usually much easier.
For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment as a starting point for its name.
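Applied to `sensor2vec_data()`, for instance, the comment "Generate event sensor list" already names a candidate helper. A minimal sketch of that extraction follows; the helper name and its placement are hypothetical, but the body is the logic from the listing below:

```python
def _event_sensor_indices(sensor_list, event_list, ignore_off=True):
    """Map each event to the index of its sensor, optionally skipping OFF events.

    Hypothetical helper extracted from sensor2vec_data(); the name is illustrative.
    """
    sensor_dict = {sensor_list[i]['name']: i for i in range(len(sensor_list))}
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    return event_sensor_list
```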
Commonly applied refactorings include Extract Method. If many parameters or temporary variables make extraction awkward, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object; a parameter-object sketch for this function is shown below.
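For `sensor2vec_data()` specifically, the nine parameters are a natural candidate for Introduce Parameter Object. A minimal sketch, assuming Python 3.7+ dataclasses; the `EmbeddingConfig` class and `config` argument are hypothetical and simply mirror the existing keyword arguments:

```python
from dataclasses import dataclass


@dataclass
class EmbeddingConfig:
    """Hypothetical parameter object bundling sensor2vec_data() hyper-parameters."""
    embedding_size: int = 20
    batch_size: int = 128
    num_skips: int = 8
    skip_window: int = 5
    num_neg_samples: int = 64
    learning_rate: float = 1.0
    ignore_off: bool = True


# The call site would then shrink to something like:
#   sensor2vec_data(sensor_list, event_list, config=EmbeddingConfig(embedding_size=32))
```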
Complex methods like sensor2vec_data() often do a lot of different things. To break such a method down, we need to identify cohesive components within it. A common approach is to look for statements and temporary variables that share the same prefixes or suffixes, or that revolve around the same data.
Once you have determined which parts belong together, you can apply Extract Method to each of them. If an extracted component also carries its own state, as the TensorFlow graph does here, Extract Class is a candidate as well; a sketch of such a split precedes the full listing below.
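In this function the statements cluster around graph construction on the one hand and the training loop on the other, so one possible split is a small class that owns the TensorFlow graph and exposes the tensors the training loop needs. A rough sketch only; the class name and attribute layout are hypothetical, not part of the original module:

```python
import tensorflow as tf


class SkipGramGraph:
    """Hypothetical extraction of the graph-building half of sensor2vec_data()."""

    def __init__(self, num_sensors, embedding_size, batch_size,
                 num_neg_samples, learning_rate):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Same placeholders as in the original body
            self.train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            self.train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            # The embeddings, NCE loss, optimizer, similarity and initializer
            # from the original body would be built here and stored as
            # attributes (self.loss, self.optimizer, self.similarity, self.init).
```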
```python
import math
# ... (lines omitted; this excerpt assumes ``numpy as np``, ``tensorflow as tf``,
#      and ``SkipGramInjector`` are imported or defined in the omitted portion)


def sensor2vec_data(sensor_list, event_list, embedding_size=20,
                    batch_size=128, num_skips=8, skip_window=5,
                    num_neg_samples=64, learning_rate=1.0, ignore_off=True):
    """Transform sensors into a high-dimensional vector space.

    Similar to the word embeddings used in natural language processing, we want
    to represent sensors in a learned vector space instead of using arbitrary
    labels that carry no useful information.

    The methods used to find word embeddings fall into two categories:
    count-based methods (e.g. Latent Semantic Analysis) and predictive models.
    This implementation maps sensors into the vector space with a skip-gram
    model trained with negative sampling.

    Args:
        sensor_list (:obj:`list` of :obj:`dict`): List of dictionaries containing
            sensor information.
        event_list (:obj:`list` of :obj:`dict`): List of events.
        embedding_size (:obj:`int`): The size of the embedding vector.
        batch_size (:obj:`int`): The number of samples per training batch.
        num_skips (:obj:`int`): How many times to re-use an input to generate a
            label in the skip-gram model.
        skip_window (:obj:`int`): How many items to consider to the left and right
            in the skip-gram model.
        num_neg_samples (:obj:`int`): Number of negative samples to draw from the
            vocabulary.
        learning_rate (:obj:`float`): Learning rate of the gradient descent
            optimizer.
        ignore_off (:obj:`bool`): Ignore motion-sensor events with ``Off`` state
            in the event list.

    Returns:
        :obj:`tuple`: ``(final_embeddings, distance_matrix)``, where
            ``final_embeddings`` holds the normalized embedding vector of each
            sensor and ``distance_matrix`` the row-normalized cosine distances
            between sensors.

    Please refer to :func:`sensor_distance` for an example of ``sensor_list``.
    Please refer to :func:`sensor_mi_distance` for an example of ``event_list``.
    """
    num_sensors = len(sensor_list)
    # Negative samples cannot exceed the number of sensors
    if num_neg_samples > num_sensors:
        num_neg_samples = num_sensors
    # Store sensor IDs in a hash table for fast index lookup
    sensor_dict = {}
    for i in range(num_sensors):
        sensor_dict[sensor_list[i]['name']] = i
    # Generate the event sensor list (sensor index per event)
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    # Initialize a SkipGram injector that yields (input, label) batches
    injector = SkipGramInjector(event_sensor_list, batch_size, num_skips, skip_window)
    # Build the training model
    graph = tf.Graph()
    with graph.as_default():
        # Input placeholders
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        # As we normally do not have many sensors, it is OK to validate on all of them
        valid_dataset = tf.constant(list(range(num_sensors)), dtype=tf.int32)
        # Only the CPU supports NCE loss
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs
            embeddings = tf.Variable(
                tf.random_uniform([num_sensors, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            nce_weights = tf.Variable(
                tf.truncated_normal([num_sensors, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([num_sensors]))

        # Compute the average NCE loss for the batch.
        # tf.nn.nce_loss automatically draws a new sample of the negative labels
        # each time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))

        # Construct the optimizer
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)

        # Add the variable initializer
        init = tf.global_variables_initializer()

    # Begin training
    num_steps = 100001

    with tf.Session(graph=graph) as session:
        # We must initialize all variables before we use them
        init.run()
        print("Initialized")

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = injector.next_batch()
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # We perform one update step by evaluating the optimizer op (including
            # it in the list of returned values for session.run())
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(num_sensors):
                    valid_sensor = sensor_list[i]['name']
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = "Nearest to %s:" % valid_sensor
                    for k in range(top_k):
                        close_sensor = sensor_list[nearest[k]]['name']
                        log_str = "%s %s," % (log_str, close_sensor)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()
        final_similarity = 1 - similarity.eval()
        distance_matrix = final_similarity / np.max(final_similarity, axis=1)[:, None]

    # Optional t-SNE visualization of the learned embeddings (left commented out)
    # try:
    #     from sklearn.manifold import TSNE
    #     import matplotlib.pyplot as plt
    #
    #     tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    #     low_dim_embs = tsne.fit_transform(final_embeddings)
    #     labels = [sensor_list[i]['name'] for i in range(num_sensors)]
    #
    #     assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    #     plt.figure(figsize=(18, 18))  # in inches
    #     for i, label in enumerate(labels):
    #         x, y = low_dim_embs[i, :]
    #         plt.scatter(x, y)
    #         plt.annotate(label,
    #                      xy=(x, y),
    #                      xytext=(5, 2),
    #                      textcoords='offset points',
    #                      ha='right',
    #                      va='bottom')
    #     plt.show()
    # except ImportError:
    #     print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

    return final_embeddings, distance_matrix
```
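For reference, a minimal call of the listed function might look like the following. The sensor names and event entries are made up, but the dictionary keys (`'name'`, `'sensor_id'`, `'sensor_status'`) match the ones the function body actually reads; a real run needs a much longer event stream than this toy example.

```python
# Toy inputs that only illustrate the expected structure of the arguments
sensor_list = [{'name': 'M001'}, {'name': 'M002'}, {'name': 'M003'}]
event_list = [
    {'sensor_id': 'M001', 'sensor_status': 'ON'},
    {'sensor_id': 'M002', 'sensor_status': 'ON'},
    {'sensor_id': 'M002', 'sensor_status': 'OFF'},  # skipped when ignore_off=True
    {'sensor_id': 'M003', 'sensor_status': 'ON'},
]

embeddings, distances = sensor2vec_data(sensor_list, event_list,
                                        embedding_size=8, batch_size=4,
                                        num_skips=2, skip_window=1)
```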