| Metric | Value |
| --- | --- |
| Conditions | 20 |
| Total Lines | 104 |
| Code Lines | 49 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 0 |
Small methods make your code easier to understand, particularly when combined with a good name. Moreover, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign to extract the commented part into a new method and to use the comment as a starting point for naming it.
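A minimal, hypothetical sketch of that move (the grade-related names are invented for illustration and have nothing to do with ethically): the comment in the first version becomes the name of the extracted method in the second.

```python
PASSING_GRADE = 60


# Before: a comment marks a block that wants to be its own method.
def average_passing_grade_before(grades):
    # keep only the passing grades
    passing = [g for g in grades if g >= PASSING_GRADE]
    return sum(passing) / len(passing) if passing else 0.0


# After Extract Method: the comment has become the method name.
def passing_grades(grades):
    return [g for g in grades if g >= PASSING_GRADE]


def average_passing_grade(grades):
    passing = passing_grades(grades)
    return sum(passing) / len(passing) if passing else 0.0


print(average_passing_grade([40, 70, 90]))  # 80.0
```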
Commonly applied refactorings include:

- Extract Method

If many parameters or temporary variables get in the way of extraction (see the parameter-object sketch after this list):

- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
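For instance, `most_similar()` (analyzed below) takes six keyword arguments besides the model. A parameter object could bundle the query-related ones. This is only a sketch under that assumption; `SimilarityQuery` is an invented name, not part of ethically or gensim.

```python
from dataclasses import dataclass, field
from typing import List, Optional


# Hypothetical Introduce Parameter Object: the keyword arguments of
# most_similar() gathered into a single value object.
@dataclass
class SimilarityQuery:
    positive: List[str] = field(default_factory=list)
    negative: List[str] = field(default_factory=list)
    topn: Optional[int] = 10
    restrict_vocab: Optional[int] = None
    unrestricted: bool = True


query = SimilarityQuery(positive=['king'], negative=['man'], topn=5)
print(query.topn, query.unrestricted)
```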
Complex units like ethically.we.utils.most_similar() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods (or, within a long function, parameters and local variables) that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often quicker to apply.
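A rough sketch of what such an extraction could look like here, keeping only the "restrict the vocabulary and rank by similarity" concern together. The names are invented and this is not ethically's implementation; it merely illustrates the shape of an Extract Class result.

```python
import numpy as np


class SimilarityRanker:
    """Hypothetical extracted class: vocabulary restriction + ranking."""

    def __init__(self, normed_vectors, restrict_vocab=None):
        self.vectors = (normed_vectors if restrict_vocab is None
                        else normed_vectors[:restrict_vocab])

    def distances(self, mean):
        # cosine similarities, assuming unit-length rows and query vector
        return self.vectors @ mean

    def best_indices(self, mean, topn):
        return np.argsort(self.distances(mean))[::-1][:topn]


ranker = SimilarityRanker(np.random.rand(100, 8), restrict_vocab=50)
print(ranker.best_indices(np.random.rand(8), topn=3))
```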
The flagged unit (originally lines 111–215 of the module):

```python
# imports needed by this excerpt (added here so it is self-contained)
import gensim
import numpy as np
from six import string_types


def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors which are searched
                               for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param indexer: Optional index (with a ``most_similar`` method) used
                    to answer the query instead of the exhaustive search.
    :param bool unrestricted: Whether the result may include words from
                              the positive or negative lists (if
                              ``False``, input words are filtered out).
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]
```
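One way the Extract Method advice could play out on this function is sketched below: each comment-marked step becomes a small helper. The helper names are hypothetical and not part of ethically's API, and the sketch assumes the same gensim 3.x ``KeyedVectors`` interface as the excerpt above.

```python
import gensim
import numpy as np


def _with_default_weights(words, weight):
    # attach a default weight to plain words / vectors
    return [(word, weight) if isinstance(word, (str, np.ndarray)) else word
            for word in words]


def _weighted_mean(model, weighted_words):
    # average the (word, weight) pairs into a single unit vector,
    # remembering which vocabulary indices appeared in the input
    all_words, mean = set(), []
    for word, weight in weighted_words:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)
    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(float)
    return mean, all_words


def _rank_by_similarity(model, mean, all_words, topn,
                        restrict_vocab, unrestricted):
    # score every (possibly restricted) vocabulary vector against the mean
    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean
    if topn is None:
        return dists
    best = gensim.matutils.argsort(dists, topn=topn + len(all_words),
                                   reverse=True)
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]
    return result[:topn]
```

With helpers along these lines, ``most_similar()`` itself would shrink to argument validation plus three calls, reducing both the line count and the number of conditions reported for the unit.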