responsibly.we.utils.most_similar()   F

Complexity

Conditions 20

Size

Total Lines 104
Code Lines 49

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 20
eloc 49
nop 7
dl 0
loc 104
rs 0
c 0
b 0
f 0

How to fix

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method; a small sketch follows below.
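As an illustration only (the function and helper names below are made up, not taken from responsibly), extracting a commented block into a well-named helper might look like this:

import math

# Before: one long function whose steps are separated by comments.
def summarize_scores(scores, digits=2):
    # round each score away from zero to `digits` decimal places
    place = 10 ** digits
    rounded = [math.copysign(math.ceil(abs(s) * place) / place, s)
               for s in scores]
    # report the extremes
    return min(rounded), max(rounded)


# After: the commented step becomes a small, named method.
def round_away_from_zero(value, digits=2):
    place = 10 ** digits
    return math.copysign(math.ceil(abs(value) * place) / place, value)


def summarize_scores(scores, digits=2):
    rounded = [round_away_from_zero(s, digits) for s in scores]
    return min(rounded), max(rounded)

The comment that used to label the block becomes the starting point for the helper's name.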

Complexity

Complex classes like responsibly.we.utils.most_similar() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
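Since most_similar() is a module-level function rather than a class, the closest analogue of Extract Class here is Extract Method: pull each cohesive step (weighting the input words, building the mean vector, filtering the results) into a named helper so that the body of most_similar() shrinks to a short sequence of calls. The sketch below is hypothetical; the helper names are not part of the library, and it assumes the same gensim 3.x model interface the module already uses:

import gensim
import numpy as np
from six import string_types


def _with_default_weights(words, default_weight):
    # Attach a default weight to every bare word or vector,
    # mirroring the weighting step currently inlined in most_similar().
    return [(word, default_weight)
            if isinstance(word, string_types + (np.ndarray,))
            else word
            for word in words]


def _weighted_mean_vector(model, weighted_words):
    # Average the weighted vectors and collect the vocabulary indices
    # of the input words, exactly as most_similar() does inline today.
    all_words, mean = set(), []
    for word, weight in weighted_words:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)
    if not mean:
        raise ValueError('Cannot compute similarity with no input.')
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(float)
    return mean, all_words

With helpers like these, most_similar() keeps only the argument validation, the optional indexer shortcut, and the final ranking, which directly lowers the condition count (currently 20) reported above.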

import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score


WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.keyedvectors.BaseKeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long


def round_to_extreme(value, digits=2):
    """Round the magnitude of value up to `digits` decimal places, keeping its sign."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity


def project_vector(v, u):
    """Projecting the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u


def reject_vector(v, u):
    """Rejecting the vector v from direction u."""
    return v - project_vector(v, u)


def project_reject_vector(v, u):
    """Projecting and rejecting the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector


def project_params(u, v):
    """Projecting and rejecting the vector v onto direction u with scalar."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words."""

    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)


def update_word_vector(model, word, new_vector):
    # Overwrite the stored vector for `word` and keep the
    # normalized copy (if already computed) in sync.
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    # Take the n_extreme head and tail rows of a sorted frame,
    # optionally labeling which side each row came from.
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.models.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether words from the positive and
                              negative lists may appear in the results;
                              if False, they are filtered out.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]


def get_seed_vector(seed, bias_word_embedding):

    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    # Cluster X into two groups, visualize them with a 2-D t-SNE embedding,
    # and return how well the clustering recovers y_true
    # (taking the better of the two possible label assignments).

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
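For reference, here is a small usage sketch of most_similar(). The pretrained model name is only an example, and the call assumes a gensim 3.x KeyedVectors-style model (with init_sims, vocab and index2word), which is what this module targets:

import gensim.downloader as api

from responsibly.we.utils import most_similar

# A small pretrained embedding fetched via the gensim downloader;
# any KeyedVectors-compatible model would work.
model = api.load('glove-wiki-gigaword-50')

# Classic analogy query: king - man + woman ~ queen.
results = most_similar(model,
                       positive=['king', 'woman'],
                       negative=['man'],
                       topn=5,
                       unrestricted=False)  # drop the input words from the results

for word, similarity in results:
    print('{:<12} {:.3f}'.format(word, similarity))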