1
|
|
|
from sklearn.manifold import TSNE |
|
|
|
|
2
|
|
|
import matplotlib.pyplot as plt |
3
|
|
|
import numpy as np |
4
|
|
|
import random |
|
|
|
|
5
|
|
|
|
6
|
|
|
|
7
|
|
View Code Duplication |
def reduce_dimensions(w2v_model):
    """Project a Word2Vec model's word vectors onto two dimensions with t-SNE.

    Args:
        w2v_model: Model object exposing ``wv.vectors`` (the embedding
            matrix) and ``wv.index_to_key`` (the vocabulary). Rows of the
            matrix are assumed to be in the same order as the vocabulary.

    Returns:
        A tuple ``(x_evals, y_evals, labels)``: two lists holding the first
        and second t-SNE coordinate of every vector, and a NumPy array built
        from ``wv.index_to_key``.
    """
    num_dimensions = 2
    default_perplexity = 30.0  # value scikit-learn uses when none is given

    vectors = np.asarray(w2v_model.wv.vectors)
    labels = np.asarray(w2v_model.wv.index_to_key)

    # scikit-learn requires perplexity < n_samples, so the default of 30
    # fails on small vocabularies. Clamp it; larger vocabularies keep the
    # default and therefore produce the same embedding as before.
    perplexity = float(min(default_perplexity, max(len(vectors) - 1, 1)))

    tsne = TSNE(
        n_components=num_dimensions,
        perplexity=perplexity,
        random_state=42,  # fixed seed so the layout is reproducible
    )
    embedded = tsne.fit_transform(vectors)

    x_evals = list(embedded[:, 0])
    y_evals = list(embedded[:, 1])
    return x_evals, y_evals, labels
18
|
|
|
|
19
|
|
|
|
20
|
|
View Code Duplication |
def plot_with_matplotlib(x_evals, y_evals, labels):
    """Scatter-plot the points, annotate a random sample, and save the figure.

    Draws every ``(x_evals[i], y_evals[i])`` point, annotates up to 25
    randomly chosen points with ``labels[i]``, writes the figure to
    ``w2v_plot.png`` in the current working directory and then shows it.

    Args:
        x_evals: Sequence of x coordinates, one per label.
        y_evals: Sequence of y coordinates, one per label.
        labels: Sequence of annotation texts, indexable by position.

    Returns:
        None.
    """
    # Fixed seed so the same points are annotated on every run.
    # Note that this reseeds the module-level ``random`` generator.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_evals, y_evals)

    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample at the number of labels.
    label_count = len(labels)
    sample_size = min(25, label_count)
    selected_indices = random.sample(range(label_count), sample_size)

    for i in selected_indices:
        plt.annotate(labels[i], (x_evals[i], y_evals[i]))

    plt.savefig('w2v_plot.png')
    plt.show()
31
|
|
|
|
32
|
|
|
|
33
|
|
View Code Duplication |
if __name__ == '__main__':
    # Demo entry point: train a Word2Vec model on a spreadsheet corpus and
    # plot its t-SNE projection.
    from .w2v_corpus import W2VCorpus
    import pandas as pd
    import gensim.models

    # Load the raw corpus and print a preview of it.
    frame = pd.read_excel('test_corpus.xlsx')
    print(frame.head())

    # Wrap the values of the column labelled 0 for Word2Vec training.
    sentences = W2VCorpus(list(frame[0]))
    model = gensim.models.Word2Vec(sentences=sentences)

    # reduce_dimensions returns (x_evals, y_evals, labels), which is exactly
    # the positional signature of plot_with_matplotlib.
    plot_with_matplotlib(*reduce_dimensions(model))
43
|
|
|
|