import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score

WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.keyedvectors.BaseKeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long

def round_to_extreme(value, digits=2):
    """Round value away from zero ("to the extreme") at `digits` decimals."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value
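
# Illustrative usage of round_to_extreme (values chosen arbitrarily):
# >>> round_to_extreme(0.1234)
# 0.13
# >>> round_to_extreme(-0.1234)
# -0.13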


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
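
# Illustrative usage of normalize:
# >>> normalize(np.array([3.0, 4.0]))
# array([0.6, 0.8])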


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity
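
# Illustrative usage of cosine_similarity:
# >>> cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))
# 0.7071067811865475  (i.e. cos(45 degrees) = 1/sqrt(2))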


def project_vector(v, u):
    """Project the vector v onto the direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u
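
# Illustrative usage of project_vector:
# >>> project_vector(np.array([1.0, 2.0]), np.array([1.0, 0.0]))
# array([1., 0.])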


def reject_vector(v, u):
    """Compute the rejection of the vector v from the direction u."""
    return v - project_vector(v, u)


def project_reject_vector(v, u):
    """Return both the projection and the rejection of v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector
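
# Illustrative usage of project_reject_vector; the two parts are orthogonal
# and sum back to the original vector:
# >>> proj, rej = project_reject_vector(np.array([1.0, 2.0]),
# ...                                   np.array([1.0, 0.0]))
# >>> proj, rej
# (array([1., 0.]), array([0., 2.]))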


def project_params(u, v):
    """Compute the scalar projection of v onto direction u,
    together with the projected and rejected vectors."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words."""

    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)
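
# Illustrative usage (assumes `model` is a loaded gensim KeyedVectors and the
# words are in its vocabulary):
# >>> cosine_similarities_by_words(model, 'she', ['he', 'nurse', 'doctor'])
# array([...], dtype=float32)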


def update_word_vector(model, word, new_vector):
    # Overwrite the word's vector in place; keep the L2-normalized copy
    # (if it was already computed) consistent with the new vector.
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)
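
# Illustrative usage (mutates the model in place; assumes a gensim
# KeyedVectors with `vocab`/`vectors` attributes, as in gensim 3.x):
# >>> update_word_vector(model, 'dog', np.zeros(model.vector_size))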


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])
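
# Illustrative usage of generate_words_forms:
# >>> generate_words_forms(['john'])
# ['john', 'JOHN', 'John']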


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))
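
# Illustrative usage (assumes `df` is already sorted by the relevant score):
# >>> df = pd.DataFrame({'word': list('abcde'),
# ...                    'score': [0.9, 0.5, 0.1, -0.4, -0.8]})
# >>> take_two_sides_extreme_sorted(df, n_extreme=2, part_column='side',
# ...                               head_value='positive',
# ...                               tail_value='negative')
#   word  score      side
# 0    a    0.9  positive
# 1    b    0.5  positive
# 2    d   -0.4  negative
# 3    e   -0.8  negative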


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        # str() each type before joining; ''.join() fails on non-strings
        raise TypeError('model should be of type {}, not {}'
                        .format(', '.join(str(type_) for type_
                                          in WORD_EMBEDDING_MODEL_TYPES),
                                type(model)))


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.models.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to allow the returned words
                              to come from the positive or negative
                              word lists.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
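
# Illustrative usage (assumes `model` is a gensim KeyedVectors with an
# analogy-capable vocabulary, e.g. the word2vec Google News vectors):
# >>> most_similar(model, positive=['king', 'woman'], negative=['man'],
# ...              topn=1, unrestricted=False)
# [('queen', 0.71...)]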


def get_seed_vector(seed, bias_word_embedding):
    # Resolve a seed specification into a unit direction vector:
    # 'direction' uses the already-identified bias direction,
    # 'ends' uses the embedding's predefined end words,
    # and any other value is taken as a (positive_end, negative_end) pair.
    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end
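
# Illustrative usage (a sketch; assumes `bwe` exposes a gensim word-embedding
# model plus `positive_end`, `negative_end` and `direction` attributes):
# >>> seed_vector, pos, neg = get_seed_vector(('she', 'he'), bwe)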


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    # Cluster the vectors into two groups, visualize them in 2-D with t-SNE,
    # and score the clustering against the true labels.
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    # cluster labels are arbitrary, so report the better of the two
    # possible label assignments
    return max(acc, 1 - acc)
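
# Illustrative usage (a sketch with random data; real inputs would be word
# vectors and binary labels):
# >>> rng = np.random.RandomState(0)
# >>> acc = plot_clustering_as_classification(rng.randn(50, 10),
# ...                                         rng.randint(0, 2, 50))
# >>> 0.5 <= acc <= 1.0
# True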