| 1 |  |  | import math | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import gensim | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import matplotlib.pylab as plt | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import numpy as np | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | import pandas as pd | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | from six import string_types | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | from sklearn.cluster import KMeans | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from sklearn.manifold import TSNE | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | from sklearn.metrics import accuracy_score | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                # Gensim classes that this module's isinstance checks accept as a
                # word-embedding model.
                WORD_EMBEDDING_MODEL_TYPES = (
                    gensim.models.keyedvectors.KeyedVectors,
                    gensim.models.keyedvectors.BaseKeyedVectors,
                    gensim.models.fasttext.FastText,
                    gensim.models.word2vec.Word2Vec,
                    gensim.models.base_any2vec.BaseWordEmbeddingsModel,
                )
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def round_to_extreme(value, digits=2):
                    """Round `value` away from zero, keeping `digits` decimal places.

                    The magnitude is rounded up with `math.ceil` and the sign of a
                    negative `value` is then restored.
                    """
                    scale = 10 ** digits
                    magnitude = math.ceil(abs(value) * scale) / scale
                    return -magnitude if value < 0 else magnitude
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def normalize(v):
                    """Scale a 1-D vector to unit length.

                    A vector whose norm is zero is returned unchanged (the same
                    object), since it cannot be scaled.

                    Raises:
                        ValueError: if `v` is not 1-D.
                    """
                    if v.ndim != 1:
                        raise ValueError('v should be 1-D, {}-D was given'.format(v.ndim))

                    length = np.linalg.norm(v)
                    return v / length if length != 0 else v
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def cosine_similarity(v, u):
                    """Return the cosine similarity of `v` and `u`.

                    This is the dot product of the two vectors divided by the
                    product of their norms (as computed by `np.linalg.norm`).
                    """
                    denominator = np.linalg.norm(v) * np.linalg.norm(u)
                    return (v @ u) / denominator
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def project_vector(v, u):
                    """Return the projection of vector `v` onto the direction of `u`."""
                    unit_u = normalize(u)
                    # Length of v along the unit direction, used to rescale that
                    # direction.
                    scalar = v @ unit_u
                    return scalar * unit_u
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def reject_vector(v, u):
                    """Return what is left of `v` after removing its projection onto `u`."""
                    along_u = project_vector(v, u)
                    return v - along_u
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def project_reject_vector(v, u):
                    """Split `v` into its projection onto `u` and the remainder.

                    Returns:
                        A `(projected_vector, rejected_vector)` tuple, where the
                        second item is `v` minus the first.
                    """
                    along_u = project_vector(v, u)
                    return along_u, v - along_u
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def project_params(u, v):
                    """Project `v` onto the direction `u`, also returning the scalar.

                    Note the argument order: the direction `u` comes first.

                    Returns:
                        A `(projection, projected_vector, rejected_vector)` tuple:
                        the scalar `v @ normalize(u)`, that scalar times the
                        normalized `u`, and `v` minus the projected vector.
                    """
                    unit_u = normalize(u)
                    scalar = v @ unit_u
                    along_u = scalar * unit_u
                    return scalar, along_u, v - along_u
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                | 73 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                def cosine_similarities_by_words(model, word, words):
                    """Compute cosine similarities between a word and a set of other words.

                    Args:
                        model: object supporting `model[key]` vector lookup and
                            providing `cosine_similarities(vector, vectors)`.
                        word: the single reference word (a string).
                        words: an iterable of words to compare against; must not
                            itself be a string.

                    Returns:
                        Whatever `model.cosine_similarities` returns for the vector
                        of `word` and the list of vectors of `words`.

                    Raises:
                        AssertionError: if `word` is not a string or `words` is one.
                    """
                    # The module already relies on the Python 3 only `@` operator, so
                    # plain `str` is equivalent to `six.string_types` here.
                    assert isinstance(word, str), \
                        'The argument `word` should be a string.'
                    assert not isinstance(words, str), \
                        'The argument `words` should not be a string.'

                    vec = model[word]
                    vecs = [model[w] for w in words]
                    return model.cosine_similarities(vec, vecs)
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def update_word_vector(model, word, new_vector):
                    """Overwrite the stored vector of `word` in `model` with `new_vector`.

                    The row is located through `model.vocab[word].index`. When the
                    model also holds `vectors_norm`, the matching row there is replaced
                    with the normalized `new_vector`.
                    """
                    row = model.vocab[word].index
                    model.vectors[row] = new_vector

                    if model.vectors_norm is None:
                        return
                    model.vectors_norm[row] = normalize(new_vector)
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def generate_one_word_forms(word):
                    """Return the lower-case, upper-case and title-case forms of `word`."""
                    lowered = word.lower()
                    uppered = word.upper()
                    titled = word.title()
                    return [lowered, uppered, titled]
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def generate_words_forms(words):
                    """Return the case variants of every word in `words` as one flat list.

                    The variants come from `generate_one_word_forms` and follow the
                    order of `words`.
                    """
                    # A nested comprehension flattens in linear time, whereas
                    # `sum(lists, [])` re-copies the accumulated list on every addition.
                    return [form
                            for word in words
                            for form in generate_one_word_forms(word)]
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def take_two_sides_extreme_sorted(df, n_extreme,
                                                  part_column=None,
                                                  head_value='',
                                                  tail_value=''):
                    """Keep only the first and last `n_extreme` rows of `df`.

                    `df` is used in its current row order; presumably callers sort it
                    beforehand (suggested by the name, not enforced here).

                    Args:
                        df: the source DataFrame; it is not modified.
                        n_extreme: number of rows to take from each end.
                        part_column: optional column name; when given, it is set to
                            `head_value` on the head rows and `tail_value` on the tail
                            rows.
                        head_value: label written to `part_column` for head rows.
                        tail_value: label written to `part_column` for tail rows.

                    Returns:
                        A new DataFrame with the head rows followed by the tail rows,
                        exact duplicate rows dropped and a fresh 0-based index.
                    """
                    # Take explicit copies: labelling a bare slice of `df` can trigger
                    # pandas' SettingWithCopyWarning.
                    head_df = df.head(n_extreme).copy()
                    tail_df = df.tail(n_extreme).copy()

                    if part_column is not None:
                        head_df[part_column] = head_value
                        tail_df[part_column] = tail_value

                    return (pd.concat([head_df, tail_df])
                            .drop_duplicates()
                            .reset_index(drop=True))
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  | def assert_gensim_keyed_vectors(model): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |     if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |         raise TypeError('model should be of type {}, not {}' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |                         .format(''.join(WORD_EMBEDDING_MODEL_TYPES), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |                                 type(model))) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  | def most_similar(model, positive=None, negative=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |                  topn=10, restrict_vocab=None, indexer=None, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |                  unrestricted=True): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |     Find the top-N most similar words. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |     Positive words contribute positively towards the similarity, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |     negative words negatively. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |     This function computes cosine similarity between a simple mean | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |     of the projection weight vectors of the given words and | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |     the vectors for each word in the model. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |     The function corresponds to the `word-analogy` and `distance` | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |     scripts in the original word2vec implementation. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |     Based on Gensim implementation. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |     :param model: Word embedding model of ``gensim.model.KeyedVectors``. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |     :param list positive: List of words that contribute positively. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |     :param list negative: List of words that contribute negatively. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |     :param int topn: Number of top-N similar words to return. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |     :param int restrict_vocab: Optional integer which limits the | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |                                range of vectors | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |                                which are searched for most-similar values. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |                                For example, restrict_vocab=10000 would | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |                                only check the first 10000 word vectors | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |                                in the vocabulary order. (This may be | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |                                meaningful if you've sorted the vocabulary | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |                                by descending frequency.) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |     :param bool unrestricted: Whether to restricted the most | 
            
                                                                                                            
                            
            
                                    
            
            
                | 154 |  |  |                               similar words to be not from | 
            
                                                                                                            
                            
            
                                    
            
            
                | 155 |  |  |                               the positive or negative word list. | 
            
                                                                                                            
                            
            
                                    
            
            
                | 156 |  |  |     :return: Sequence of (word, similarity). | 
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |     """ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  |     if topn is not None and topn < 1: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |         return [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |     if positive is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |         positive = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 163 |  |  |     if negative is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 164 |  |  |         negative = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 165 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |     model.init_sims() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 167 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 168 |  |  |     if (isinstance(positive, string_types) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 169 |  |  |             and not negative): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 170 |  |  |         # allow calls like most_similar('dog'), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 171 |  |  |         # as a shorthand for most_similar(['dog']) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 172 |  |  |         positive = [positive] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 173 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 174 |  |  |     if ((isinstance(positive, string_types) and negative) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 175 |  |  |             or (isinstance(negative, string_types) and positive)): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 176 |  |  |         raise ValueError('If positives and negatives are given, ' | 
            
                                                                                                            
                            
            
                                    
            
            
                | 177 |  |  |                          'both should be lists!') | 
            
                                                                                                            
                            
            
                                    
            
            
                | 178 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 179 |  |  |     # add weights for each word, if not already present; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 180 |  |  |     # default to 1.0 for positive and -1.0 for negative words | 
            
                                                                                                            
                            
            
                                    
            
            
                | 181 |  |  |     positive = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 182 |  |  |         (word, 1.0) if isinstance(word, string_types + (np.ndarray,)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 183 |  |  |         else word | 
            
                                                                                                            
                            
            
                                    
            
            
                | 184 |  |  |         for word in positive | 
            
                                                                                                            
                            
            
                                    
            
            
                | 185 |  |  |     ] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 186 |  |  |     negative = [ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 187 |  |  |         (word, -1.0) if isinstance(word, string_types + (np.ndarray,)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 188 |  |  |         else word | 
            
                                                                                                            
                            
            
                                    
            
            
                | 189 |  |  |         for word in negative | 
            
                                                                                                            
                            
            
                                    
            
            
                | 190 |  |  |     ] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 191 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 192 |  |  |     # compute the weighted average of all words | 
            
                                                                                                            
                            
            
                                    
            
            
                | 193 |  |  |     all_words, mean = set(), [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 194 |  |  |     for word, weight in positive + negative: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 195 |  |  |         if isinstance(word, np.ndarray): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 196 |  |  |             mean.append(weight * word) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 197 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 198 |  |  |             mean.append(weight * model.word_vec(word, use_norm=True)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 199 |  |  |             if word in model.vocab: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 200 |  |  |                 all_words.add(model.vocab[word].index) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 201 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 202 |  |  |     if not mean: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 203 |  |  |         raise ValueError("Cannot compute similarity with no input.") | 
            
                                                                                                            
                            
            
                                    
            
            
                | 204 |  |  |     mean = gensim.matutils.unitvec(np.array(mean) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 205 |  |  |                                    .mean(axis=0)).astype(float) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 206 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 207 |  |  |     if indexer is not None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 208 |  |  |         return indexer.most_similar(mean, topn) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 209 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 210 |  |  |     limited = (model.vectors_norm if restrict_vocab is None | 
            
                                                                                                            
                            
            
                                    
            
            
                | 211 |  |  |                else model.vectors_norm[:restrict_vocab]) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 212 |  |  |     dists = limited @ mean | 
            
                                                                                                            
                            
            
                                    
            
            
                | 213 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 214 |  |  |     if topn is None: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 215 |  |  |         return dists | 
            
                                                                                                            
                            
            
                                    
            
            
                | 216 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 217 |  |  |     best = gensim.matutils.argsort(dists, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 218 |  |  |                                    topn=topn + len(all_words), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 219 |  |  |                                    reverse=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 220 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 221 |  |  |     # if not unrestricted, then ignore (don't return) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 222 |  |  |     # words from the input | 
            
                                                                                                            
                            
            
                                    
            
            
                | 223 |  |  |     result = [(model.index2word[sim], float(dists[sim])) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 224 |  |  |               for sim in best | 
            
                                                                                                            
                            
            
                                    
            
            
                | 225 |  |  |               if unrestricted or sim not in all_words] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 226 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 227 |  |  |     return result[:topn] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 228 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 229 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def get_seed_vector(seed, bias_word_embedding):
                    """Resolve `seed` into a seed vector and its two end words.

                    `seed` is interpreted in one of three ways:

                    - ``'direction'``: the vector is ``bias_word_embedding.direction``
                      and the end words are the embedding's own ``positive_end`` /
                      ``negative_end``.
                    - ``'ends'``: the end words are again the embedding's own
                      ``positive_end`` / ``negative_end``, and the vector is the
                      normalized difference of their vectors in
                      ``bias_word_embedding.model``.
                    - anything else: `seed` is unpacked as a
                      ``(positive_end, negative_end)`` pair and the vector is the
                      normalized difference of those two words' vectors.

                    :param seed: ``'direction'``, ``'ends'`` or a pair of words.
                    :param bias_word_embedding: object exposing ``positive_end``,
                        ``negative_end``, ``direction``, ``model`` and
                        ``_is_direction_identified()``.
                    :return: tuple ``(seed_vector, positive_end, negative_end)``.
                    """
                    bwe = bias_word_embedding

                    if seed == 'direction':
                        pos_word = bwe.positive_end
                        neg_word = bwe.negative_end
                        # NOTE(review): presumably raises when no direction has been
                        # identified yet -- confirm against the method's definition.
                        bwe._is_direction_identified()  # pylint: disable=protected-access
                        return bwe.direction, pos_word, neg_word

                    if seed == 'ends':
                        pos_word, neg_word = bwe.positive_end, bwe.negative_end
                    else:
                        # An explicit pair of words supplied by the caller.
                        pos_word, neg_word = seed

                    difference = bwe.model[pos_word] - bwe.model[neg_word]
                    return normalize(difference), pos_word, neg_word
            
                                                                                                            
                            
            
                                    
            
            
                | 249 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 250 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
                    """Cluster `X` into two groups, plot them, and score against `y_true`.

                    `X` is clustered with 2-means and, separately, projected to two
                    dimensions with t-SNE. The projected points are drawn as one
                    scatter series per cluster, labelled 'Positive' / 'Negative'.

                    K-means numbers its clusters arbitrarily, so cluster 1 does not
                    necessarily correspond to the positive class of `y_true`. When the
                    raw accuracy is below 0.5, the cluster ids are flipped for plotting
                    so that the legend labels agree with `y_true`.

                    :param X: samples to cluster, as accepted by
                        ``KMeans.fit_predict`` and ``TSNE.fit_transform``.
                    :param y_true: ground-truth labels, one per sample
                        (assumed to be binary 0/1 -- TODO confirm against callers).
                    :param random_state: seed passed to both ``KMeans`` and ``TSNE``.
                    :param ax: matplotlib axes to draw on; a new 10x5 figure is
                        created when ``None``.
                    :return: clustering accuracy under the better of the two possible
                        cluster-to-class assignments, i.e. ``max(acc, 1 - acc)``.
                    """
                    if ax is None:
                        _, ax = plt.subplots(figsize=(10, 5))

                    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                                 .fit_predict(X))

                    acc = accuracy_score(y_true, y_cluster)

                    # Align the arbitrary cluster ids with y_true before labelling,
                    # otherwise the 'Positive'/'Negative' legend can be inverted.
                    plot_cluster = 1 - y_cluster if acc < 0.5 else y_cluster

                    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                                        .fit_transform(X))

                    for y_value in np.unique(plot_cluster):
                        mask = (plot_cluster == y_value)
                        label = 'Positive' if y_value else 'Negative'
                        ax.scatter(embedded_vectors[mask, 0],
                                   embedded_vectors[mask, 1],
                                   label=label)

                    ax.legend()

                    # Same value as before: accuracy under the best label assignment.
                    return max(acc, 1 - acc)
            
                                                        
            
                                    
            
            
                | 274 |  |  |  |