| Metric | Value |
| --- | --- |
| Conditions | 16 |
| Total Lines | 158 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 1 |
| Bugs | 0 |
| Features | 0 |
Small methods make your code easier to understand, especially when combined with a good name. Besides, when a method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for its name.
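To make this concrete, here is a minimal sketch of how the commented block that builds the event sensor list inside `sensor2vec_data()` (shown further down) could be extracted. The helper name `_build_event_sensor_list` is hypothetical, but the logic and dictionary keys are taken directly from the listing below.

```python
def _build_event_sensor_list(sensor_list, event_list, ignore_off=True):
    """Map each event to its sensor index, optionally skipping ``Off`` events.

    Hypothetical extraction of the block labeled "Generate event sensor list"
    in sensor2vec_data(); the dict keys match the ones used there.
    """
    sensor_dict = {sensor['name']: i for i, sensor in enumerate(sensor_list)}
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    return event_sensor_list
```

The inline comment then survives as the method name instead of as a note buried inside a long body.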
Commonly applied refactorings include Extract Method. If many parameters or temporary variables are present, refactorings that bundle them, such as Introduce Parameter Object, are also worth considering; a sketch of that idea follows below.
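As an illustration of the parameter-heavy case: `sensor2vec_data()` (reviewed below) takes nine arguments, seven of which only configure skip-gram training. A hypothetical parameter object could bundle them. The class name, the use of a Python 3.7+ dataclass, and the `config=` keyword in the commented call are all illustrative; the field names and defaults mirror the current signature.

```python
from dataclasses import dataclass


@dataclass
class SkipGramConfig:
    """Hypothetical parameter object bundling the training knobs of sensor2vec_data()."""
    embedding_size: int = 20
    batch_size: int = 128
    num_skips: int = 8
    skip_window: int = 5
    num_neg_samples: int = 64
    learning_rate: float = 1.0
    ignore_off: bool = True


# The call site would then shrink to the data plus one config object, e.g.:
# sensor2vec_data(sensor_list, event_list, config=SkipGramConfig(embedding_size=32))
```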
Complex classes like sensor2vec_data() often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
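In the listing below there are no class fields to regroup, but the prefix heuristic still applies: `nce_weights` and `nce_biases` share the `nce_` prefix and only ever appear together in the loss computation. If this code were reorganized around a class, they would be natural candidates for an extracted component. A minimal, hypothetical sketch (TF 1.x API, as in the listing):

```python
import math

import tensorflow as tf  # TF 1.x API, matching the listing below


class NceLossHead:
    """Hypothetical extracted component grouping the nce_-prefixed variables."""

    def __init__(self, num_sensors, embedding_size):
        # Same initializers as in sensor2vec_data()
        self.weights = tf.Variable(
            tf.truncated_normal([num_sensors, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        self.biases = tf.Variable(tf.zeros([num_sensors]))

    def mean_loss(self, embed, train_labels, num_neg_samples, num_sensors):
        # Average NCE loss over the batch, as in the original code
        return tf.reduce_mean(
            tf.nn.nce_loss(weights=self.weights,
                           biases=self.biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))
```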
```python
import math

# ... (source lines 2-93 are omitted in this excerpt; `np`, `tf`, and
# `SkipGramInjector` are assumed to be imported/defined there)


def sensor2vec_data(sensor_list, event_list, embedding_size=20,
                    batch_size=128, num_skips=8, skip_window=5,
                    num_neg_samples=64, learning_rate=1.0, ignore_off=True):
    """Transform sensors into a high-dimensional vector space.

    Similar to the word embeddings used in natural language processing systems,
    we want to represent sensors in a synthesized vector space as well, instead
    of using arbitrary labels for each sensor that carry no useful information.

    The methods used to find word embeddings can be classified into two
    categories: count-based methods (e.g. Latent Semantic Analysis) and
    predictive models. This implementation maps sensors into a high-dimensional
    vector space using a skip-gram model with negative sampling.

    Args:
        sensor_list (:obj:`list` of :obj:`dict`): List of dictionaries containing
            sensor information.
        event_list (:obj:`list` of :obj:`dict`): List of events.
        embedding_size (:obj:`int`): The size of the embedding vector.
        batch_size (:obj:`int`): The number of samples per training batch.
        num_skips (:obj:`int`): How many times to re-use an input to generate a
            label in the skip-gram model.
        skip_window (:obj:`int`): How many items to consider to the left and
            right in the skip-gram model.
        num_neg_samples (:obj:`int`): Number of negative samples to draw from
            the vocabulary.
        learning_rate (:obj:`float`): Learning rate of the gradient descent
            optimizer.
        ignore_off (:obj:`bool`): Ignore motion sensors with ``Off`` state in
            the event list.

    Please refer to :func:`sensor_distance` for an example of ``sensor_list``.
    Please refer to :func:`sensor_mi_distance` for an example of ``event_list``.
    """
    num_sensors = len(sensor_list)
    # Negative samples cannot exceed the number of sensors
    if num_neg_samples > num_sensors:
        num_neg_samples = num_sensors
    # Store sensor IDs in a hash table for fast index lookup
    sensor_dict = {}
    for i in range(num_sensors):
        sensor_dict[sensor_list[i]['name']] = i
    # Generate event sensor list
    event_sensor_list = []
    for event_entry in event_list:
        if ignore_off and event_entry['sensor_status'].upper() == "OFF":
            continue
        event_sensor_list.append(sensor_dict[event_entry['sensor_id']])
    # Initialize a SkipGram injector
    injector = SkipGramInjector(event_sensor_list, batch_size, num_skips, skip_window)
    # Build training model
    graph = tf.Graph()
    with graph.as_default():
        # Input placeholders
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        # As we normally do not have too many sensors, it is OK to use all of them
        valid_dataset = tf.constant([i for i in range(num_sensors)], dtype=tf.int32)
        # Only CPU supports NCE loss
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            embeddings = tf.Variable(
                tf.random_uniform([num_sensors, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            nce_weights = tf.Variable(
                tf.truncated_normal([num_sensors, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([num_sensors]))

        # Compute the average NCE loss for the batch.
        # tf.nn.nce_loss automatically draws a new sample of the negative labels
        # each time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_neg_samples,
                           num_classes=num_sensors))

        # Construct the optimizer
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(
            normalized_embeddings, valid_dataset)
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)

        # Add variable initializer.
        init = tf.initialize_all_variables()

    # Begin training.
    num_steps = 100001

    with tf.Session(graph=graph) as session:
        # We must initialize all variables before we use them.
        init.run()
        print("Initialized")

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = injector.next_batch()
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # We perform one update step by evaluating the optimizer op (including
            # it in the list of returned values for session.run())
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print("Average loss at step ", step, ": ", average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(num_sensors):
                    valid_sensor = sensor_list[i]['name']
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = "Nearest to %s:" % valid_sensor
                    for k in range(top_k):
                        close_sensor = sensor_list[nearest[k]]['name']
                        log_str = "%s %s," % (log_str, close_sensor)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()
        final_similarity = 1 - similarity.eval()
        distance_matrix = final_similarity / np.max(final_similarity, axis=1)[:, None]

    # try:
    #     from sklearn.manifold import TSNE
    #     import matplotlib.pyplot as plt
    #
    #     tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    #     low_dim_embs = tsne.fit_transform(final_embeddings)
    #     labels = [sensor_list[i]['name'] for i in range(num_sensors)]
    #
    #     assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    #     plt.figure(figsize=(18, 18))  # in inches
    #     for i, label in enumerate(labels):
    #         x, y = low_dim_embs[i, :]
    #         plt.scatter(x, y)
    #         plt.annotate(label,
    #                      xy=(x, y),
    #                      xytext=(5, 2),
    #                      textcoords='offset points',
    #                      ha='right',
    #                      va='bottom')
    #     plt.show()
    # except ImportError:
    #     print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")

    return final_embeddings, distance_matrix
```
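For completeness, a hypothetical call of the function above. Only the dictionary keys (`name`, `sensor_id`, `sensor_status`) are dictated by the code; the sensor names and events are made up, and a real event list would need to be far longer for the skip-gram batches to be meaningful.

```python
# Toy inputs; keys follow what sensor2vec_data() reads, values are invented.
sensor_list = [{'name': 'M001'}, {'name': 'M002'}, {'name': 'M003'}]
event_list = [
    {'sensor_id': 'M001', 'sensor_status': 'ON'},
    {'sensor_id': 'M002', 'sensor_status': 'ON'},
    {'sensor_id': 'M001', 'sensor_status': 'OFF'},  # skipped when ignore_off=True
    {'sensor_id': 'M003', 'sensor_status': 'ON'},
]

embeddings, distances = sensor2vec_data(sensor_list, event_list,
                                        embedding_size=8, num_neg_samples=2)
# `embeddings`: (num_sensors, embedding_size) array of unit-norm sensor vectors.
# `distances`: per-row normalized cosine-distance matrix between sensors.
```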