responsibly.we.utils.most_similar()   F

Complexity

Conditions 20

Size

Total Lines 104
Code Lines 49

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 20
eloc 49
nop 7
dl 0
loc 104
rs 0
c 0
b 0
f 0

How to fix

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method; a small sketch follows below.
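As an illustration only (the function and helper names below are made up, not taken from responsibly), extracting a commented block into a well-named helper might look like this:

import math

# Before: one long function whose steps are separated by comments.
def summarize_scores(scores, digits=2):
    # round each score away from zero to `digits` decimal places
    place = 10 ** digits
    rounded = [math.copysign(math.ceil(abs(s) * place) / place, s)
               for s in scores]
    # report the extremes
    return min(rounded), max(rounded)


# After: the commented step becomes a small, named method.
def round_away_from_zero(value, digits=2):
    place = 10 ** digits
    return math.copysign(math.ceil(abs(value) * place) / place, value)


def summarize_scores(scores, digits=2):
    rounded = [round_away_from_zero(s, digits) for s in scores]
    return min(rounded), max(rounded)

The comment that used to label the block becomes the starting point for the helper's name.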

Complexity

Complex classes like responsibly.we.utils.most_similar() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
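Since most_similar() is a module-level function rather than a class, the closest analogue of Extract Class here is Extract Method: pull each cohesive step (weighting the input words, building the mean vector, filtering the results) into a named helper so that the body of most_similar() shrinks to a short sequence of calls. The sketch below is hypothetical; the helper names are not part of the library, and it assumes the same gensim 3.x model interface the module already uses:

import gensim
import numpy as np
from six import string_types


def _with_default_weights(words, default_weight):
    # Attach a default weight to every bare word or vector,
    # mirroring the weighting step currently inlined in most_similar().
    return [(word, default_weight)
            if isinstance(word, string_types + (np.ndarray,))
            else word
            for word in words]


def _weighted_mean_vector(model, weighted_words):
    # Average the weighted vectors and collect the vocabulary indices
    # of the input words, exactly as most_similar() does inline today.
    all_words, mean = set(), []
    for word, weight in weighted_words:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)
    if not mean:
        raise ValueError('Cannot compute similarity with no input.')
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(float)
    return mean, all_words

With helpers like these, most_similar() keeps only the argument validation, the optional indexer shortcut, and the final ranking, which directly lowers the condition count (currently 20) reported above.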

import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score


WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.keyedvectors.BaseKeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long


def round_to_extreme(value, digits=2):
    """Round the magnitude of value up to `digits` decimal places, keeping its sign."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity


def project_vector(v, u):
    """Projecting the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u


def reject_vector(v, u):
    """Rejecting the vector v from direction u."""
    return v - project_vector(v, u)


def project_reject_vector(v, u):
    """Projecting and rejecting the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector


def project_params(u, v):
    """Projecting and rejecting the vector v onto direction u with scalar."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words."""

    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)


def update_word_vector(model, word, new_vector):
    # Overwrite the stored vector for `word` and keep the
    # normalized copy (if already computed) in sync.
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    # Take the n_extreme head and tail rows of a sorted frame,
    # optionally labeling which side each row came from.
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.models.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether words from the positive and
                              negative lists may appear in the results;
                              if False, they are filtered out.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]


def get_seed_vector(seed, bias_word_embedding):

    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    # Cluster X into two groups, visualize them with a 2-D t-SNE embedding,
    # and return how well the clustering recovers y_true
    # (taking the better of the two possible label assignments).

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)
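For reference, here is a small usage sketch of most_similar(). The pretrained model name is only an example, and the call assumes a gensim 3.x KeyedVectors-style model (with init_sims, vocab and index2word), which is what this module targets:

import gensim.downloader as api

from responsibly.we.utils import most_similar

# A small pretrained embedding fetched via the gensim downloader;
# any KeyedVectors-compatible model would work.
model = api.load('glove-wiki-gigaword-50')

# Classic analogy query: king - man + woman ~ queen.
results = most_similar(model,
                       positive=['king', 'woman'],
                       negative=['man'],
                       topn=5,
                       unrestricted=False)  # drop the input words from the results

for word, similarity in results:
    print('{:<12} {:.3f}'.format(word, similarity))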