Completed: Push to master (13dc98...362bd5) by Shlomi, ran in 25s (queued for 12s)

ethically.we.utils.cosine_similarities_by_words()   Grade: A

Complexity
    Conditions: 1

Size
    Total Lines: 11
    Code Lines: 8

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric    Value
eloc      8
dl        0
loc       11
rs        10
c         0
b         0
f         0
cc        1
nop       3
import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score


WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.keyedvectors.BaseKeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long


def round_to_extreme(value, digits=2):
    """Round a value away from zero to the given number of digits."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value
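
# Example: rounding away from zero to two digits.
#   >>> round_to_extreme(0.1234)
#   0.13
#   >>> round_to_extreme(-0.1234)
#   -0.13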


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
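
# Example: a 3-4-5 triangle normalizes to a unit vector.
#   >>> normalize(np.array([3.0, 4.0]))
#   array([0.6, 0.8])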


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity
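
# Example: vectors at 45 degrees have cosine similarity of
# about 0.7071, i.e. 1/sqrt(2).
#   >>> cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))  # ~0.7071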


def project_vector(v, u):
    """Projecting the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u


def reject_vector(v, u):
    """Rejecting the vector v from direction u."""
    return v - project_vector(v, u)


def project_reject_vector(v, u):
    """Projecting and rejecting the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector


def project_params(u, v):
    """Projecting and rejecting the vector v onto direction u,
    returning also the scalar projection.
    """
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector
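
# Example: projecting v onto the x-axis splits it into parallel and
# orthogonal parts that sum back to v.
#   >>> u = np.array([1.0, 0.0])
#   >>> v = np.array([2.0, 3.0])
#   >>> project_vector(v, u)
#   array([2., 0.])
#   >>> reject_vector(v, u)
#   array([0., 3.])
#   >>> project_params(u, v)[0]  # the scalar projection
#   2.0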


def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words."""

    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)
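
# Illustrative usage, assuming `model` is a loaded gensim KeyedVectors
# (e.g. via gensim.downloader); the returned values depend on the model.
#   >>> cosine_similarities_by_words(model, 'king', ['queen', 'man'])
#   array([...], dtype=float32)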


def update_word_vector(model, word, new_vector):
    """Update the vector of a word in the model, in place."""
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)


def generate_one_word_forms(word):
    """Return the lower, upper and title case forms of a word."""
    return [word.lower(), word.upper(), word.title()]


def generate_words_forms(words):
    """Return the case forms of all the given words as a flat list."""
    return sum([generate_one_word_forms(word) for word in words], [])
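
# Example: each word expands to its lower, upper and title forms.
#   >>> generate_words_forms(['she', 'he'])
#   ['she', 'SHE', 'She', 'he', 'HE', 'He']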


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    """Take the n_extreme first and last rows of a sorted DataFrame."""
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))
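
# Example: take one row from each end of a score-sorted frame and
# tag which side it came from.
#   >>> df = pd.DataFrame({'word': ['good', 'fine', 'bad'],
#   ...                    'score': [2.0, 0.1, -2.0]})
#   >>> take_two_sides_extreme_sorted(df, n_extreme=1,
#   ...                               part_column='side',
#   ...                               head_value='positive',
#   ...                               tail_value='negative')
#      word  score      side
#   0  good    2.0  positive
#   1   bad   -2.0  negative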


def assert_gensim_keyed_vectors(model):
    """Raise a TypeError if model is not a supported word embedding model."""
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        raise TypeError('model should be of type {}, not {}'
                        .format(', '.join(str(type_) for type_
                                          in WORD_EMBEDDING_MODEL_TYPES),
                                type(model)))
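
# Example: passing anything that is not a gensim word embedding
# model raises a TypeError.
#   >>> assert_gensim_keyed_vectors('not a model')
#   Traceback (most recent call last):
#       ...
#   TypeError: model should be of type ...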


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.models.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to allow words from the positive
                              or negative lists to appear among the
                              results.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
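
# Illustrative usage, assuming `model` is a loaded gensim KeyedVectors;
# with unrestricted=False the input words themselves are filtered out
# of the results, as in the classic king - man + woman analogy.
#   >>> most_similar(model, positive=['king', 'woman'], negative=['man'],
#   ...              topn=3, unrestricted=False)
#   [('queen', ...), ...]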


def get_seed_vector(seed, bias_word_embedding):
    """Return a seed direction vector with its positive and negative ends."""
    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end
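
# Illustrative usage, assuming `bias_we` is an ethically bias word
# embedding object (e.g. GenderBiasWE) whose direction was identified:
#   >>> seed_vector, pos_end, neg_end = get_seed_vector('direction', bias_we)
# A custom pair of end words can be passed as well:
#   >>> seed_vector, pos_end, neg_end = get_seed_vector(('she', 'he'), bias_we)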


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    """Plot a 2-means clustering of X in 2-D (via t-SNE) and return its
    accuracy against y_true, invariant to the cluster labeling.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
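
# Illustrative usage: cluster vectors into two groups and score the
# clustering against true labels; taking the maximum with (1 - acc)
# makes the score invariant to which cluster got which label. Assumes
# X has more rows than TSNE's default perplexity (30).
#   >>> acc = plot_clustering_as_classification(X, y_true)
#   >>> 0.5 <= acc <= 1.0
#   True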