import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score

WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.keyedvectors.BaseKeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long

def round_to_extreme(value, digits=2):
    """Round value away from zero ("to the extreme") at `digits` decimals."""
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value
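
# Illustrative usage of round_to_extreme (values chosen arbitrarily):
# >>> round_to_extreme(0.1234)
# 0.13
# >>> round_to_extreme(-0.1234)
# -0.13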


def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm
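
# Illustrative usage of normalize:
# >>> normalize(np.array([3.0, 4.0]))
# array([0.6, 0.8])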


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity
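
# Illustrative usage of cosine_similarity:
# >>> cosine_similarity(np.array([1.0, 0.0]), np.array([1.0, 1.0]))
# 0.7071067811865475  (i.e. cos(45 degrees) = 1/sqrt(2))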


def project_vector(v, u):
    """Project the vector v onto the direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u
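
# Illustrative usage of project_vector:
# >>> project_vector(np.array([1.0, 2.0]), np.array([1.0, 0.0]))
# array([1., 0.])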


def reject_vector(v, u):
    """Compute the rejection of the vector v from the direction u."""
    return v - project_vector(v, u)


def project_reject_vector(v, u):
    """Return both the projection and the rejection of v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector
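
# Illustrative usage of project_reject_vector; the two parts are orthogonal
# and sum back to the original vector:
# >>> proj, rej = project_reject_vector(np.array([1.0, 2.0]),
# ...                                   np.array([1.0, 0.0]))
# >>> proj, rej
# (array([1., 0.]), array([0., 2.]))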


def project_params(u, v):
    """Compute the scalar projection of v onto direction u,
    together with the projected and rejected vectors."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector


def cosine_similarities_by_words(model, word, words):
    """Compute cosine similarities between a word and a set of other words."""

    assert isinstance(word, string_types), \
        'The argument `word` should be a string.'
    assert not isinstance(words, string_types), \
        'The argument `words` should not be a string.'

    vec = model[word]
    vecs = [model[w] for w in words]
    return model.cosine_similarities(vec, vecs)
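
# Illustrative usage (assumes `model` is a loaded gensim KeyedVectors and the
# words are in its vocabulary):
# >>> cosine_similarities_by_words(model, 'she', ['he', 'nurse', 'doctor'])
# array([...], dtype=float32)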


def update_word_vector(model, word, new_vector):
    # Overwrite the word's vector in place; keep the L2-normalized copy
    # (if it was already computed) consistent with the new vector.
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)
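
# Illustrative usage (mutates the model in place; assumes a gensim
# KeyedVectors with `vocab`/`vectors` attributes, as in gensim 3.x):
# >>> update_word_vector(model, 'dog', np.zeros(model.vector_size))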


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])
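
# Illustrative usage of generate_words_forms:
# >>> generate_words_forms(['john'])
# ['john', 'JOHN', 'John']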


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))
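
# Illustrative usage (assumes `df` is already sorted by the relevant score):
# >>> df = pd.DataFrame({'word': list('abcde'),
# ...                    'score': [0.9, 0.5, 0.1, -0.4, -0.8]})
# >>> take_two_sides_extreme_sorted(df, n_extreme=2, part_column='side',
# ...                               head_value='positive',
# ...                               tail_value='negative')
#   word  score      side
# 0    a    0.9  positive
# 1    b    0.5  positive
# 2    d   -0.4  negative
# 3    e   -0.8  negative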


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        # str() each type before joining; ''.join() fails on non-strings
        raise TypeError('model should be of type {}, not {}'
                        .format(', '.join(str(type_) for type_
                                          in WORD_EMBEDDING_MODEL_TYPES),
                                type(model)))


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.models.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether to allow the returned words
                              to come from the positive or negative
                              word lists.
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
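
# Illustrative usage (assumes `model` is a gensim KeyedVectors with an
# analogy-capable vocabulary, e.g. the word2vec Google News vectors):
# >>> most_similar(model, positive=['king', 'woman'], negative=['man'],
# ...              topn=1, unrestricted=False)
# [('queen', 0.71...)]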


def get_seed_vector(seed, bias_word_embedding):
    # Resolve a seed specification into a unit direction vector:
    # 'direction' uses the already-identified bias direction,
    # 'ends' uses the embedding's predefined end words,
    # and any other value is taken as a (positive_end, negative_end) pair.
    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end
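
# Illustrative usage (a sketch; assumes `bwe` exposes a gensim word-embedding
# model plus `positive_end`, `negative_end` and `direction` attributes):
# >>> seed_vector, pos, neg = get_seed_vector(('she', 'he'), bwe)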


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
    # Cluster the vectors into two groups, visualize them in 2-D with t-SNE,
    # and score the clustering against the true labels.
    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    # cluster labels are arbitrary, so report the better of the two
    # possible label assignments
    return max(acc, 1 - acc)
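
# Illustrative usage (a sketch with random data; real inputs would be word
# vectors and binary labels):
# >>> rng = np.random.RandomState(0)
# >>> acc = plot_clustering_as_classification(rng.randn(50, 10),
# ...                                         rng.randint(0, 2, 50))
# >>> 0.5 <= acc <= 1.0
# True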