| Metric | Value |
| --- | --- |
| Conditions | 20 |
| Total Lines | 104 |
| Code Lines | 49 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 |
Small methods make your code easier to understand, particularly when combined with a good name. Moreover, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign to extract the commented part into a new method and to use the comment as a starting point for naming it.
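A minimal, hypothetical sketch of that move (the grade-related names are invented for illustration and have nothing to do with ethically): the comment in the first version becomes the name of the extracted method in the second.

```python
PASSING_GRADE = 60


# Before: a comment marks a block that wants to be its own method.
def average_passing_grade_before(grades):
    # keep only the passing grades
    passing = [g for g in grades if g >= PASSING_GRADE]
    return sum(passing) / len(passing) if passing else 0.0


# After Extract Method: the comment has become the method name.
def passing_grades(grades):
    return [g for g in grades if g >= PASSING_GRADE]


def average_passing_grade(grades):
    passing = passing_grades(grades)
    return sum(passing) / len(passing) if passing else 0.0


print(average_passing_grade([40, 70, 90]))  # 80.0
```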
Commonly applied refactorings include:

- Extract Method

If many parameters or temporary variables get in the way of extraction (see the parameter-object sketch after this list):

- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
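For instance, `most_similar()` (analyzed below) takes six keyword arguments besides the model. A parameter object could bundle the query-related ones. This is only a sketch under that assumption; `SimilarityQuery` is an invented name, not part of ethically or gensim.

```python
from dataclasses import dataclass, field
from typing import List, Optional


# Hypothetical Introduce Parameter Object: the keyword arguments of
# most_similar() gathered into a single value object.
@dataclass
class SimilarityQuery:
    positive: List[str] = field(default_factory=list)
    negative: List[str] = field(default_factory=list)
    topn: Optional[int] = 10
    restrict_vocab: Optional[int] = None
    unrestricted: bool = True


query = SimilarityQuery(positive=['king'], negative=['man'], topn=5)
print(query.topn, query.unrestricted)
```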
Complex units like ethically.we.utils.most_similar() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods (or, within a long function, parameters and local variables) that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often quicker to apply.
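A rough sketch of what such an extraction could look like here, keeping only the "restrict the vocabulary and rank by similarity" concern together. The names are invented and this is not ethically's implementation; it merely illustrates the shape of an Extract Class result.

```python
import numpy as np


class SimilarityRanker:
    """Hypothetical extracted class: vocabulary restriction + ranking."""

    def __init__(self, normed_vectors, restrict_vocab=None):
        self.vectors = (normed_vectors if restrict_vocab is None
                        else normed_vectors[:restrict_vocab])

    def distances(self, mean):
        # cosine similarities, assuming unit-length rows and query vector
        return self.vectors @ mean

    def best_indices(self, mean, topn):
        return np.argsort(self.distances(mean))[::-1][:topn]


ranker = SimilarityRanker(np.random.rand(100, 8), restrict_vocab=50)
print(ranker.best_indices(np.random.rand(8), topn=3))
```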
The flagged unit (originally lines 111–215 of the module):

```python
# imports needed by this excerpt (added here so it is self-contained)
import gensim
import numpy as np
from six import string_types


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors which are searched
                               for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param indexer: Optional index (with a ``most_similar`` method) used
                    to answer the query instead of the exhaustive search.
    :param bool unrestricted: Whether the result may include words from
                              the positive or negative lists (if
                              ``False``, input words are filtered out).
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
```
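One way the Extract Method advice could play out on this function is sketched below: each comment-marked step becomes a small helper. The helper names are hypothetical and not part of ethically's API, and the sketch assumes the same gensim 3.x ``KeyedVectors`` interface as the excerpt above.

```python
import gensim
import numpy as np


def _with_default_weights(words, weight):
    # attach a default weight to plain words / vectors
    return [(word, weight) if isinstance(word, (str, np.ndarray)) else word
            for word in words]


def _weighted_mean(model, weighted_words):
    # average the (word, weight) pairs into a single unit vector,
    # remembering which vocabulary indices appeared in the input
    all_words, mean = set(), []
    for word, weight in weighted_words:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)
    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(float)
    return mean, all_words


def _rank_by_similarity(model, mean, all_words, topn,
                        restrict_vocab, unrestricted):
    # score every (possibly restricted) vocabulary vector against the mean
    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean
    if topn is None:
        return dists
    best = gensim.matutils.argsort(dists, topn=topn + len(all_words),
                                   reverse=True)
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]
    return result[:topn]
```

With helpers along these lines, ``most_similar()`` itself would shrink to argument validation plus three calls, reducing both the line count and the number of conditions reported for the unit.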