Passed
Push — master ( fe038f...170db5 )
by Shlomi
03:29 queued 01:43

BiasWordsEmbedding.learn_full_specific_words()   C

Complexity
  Conditions: 9

Size
  Total Lines: 56
  Code Lines: 32

Duplication
  Lines: 0
  Ratio: 0 %

Importance
  Changes: 0
Metric    Value
cc        9
eloc      32
nop       4
dl        0
loc       56
rs        6.6666
c         0
b         0
f         0

How to fix: Long Method

Small methods make your code easier to understand, especially when combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part into a new method, and to use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.
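For illustration only (this is not part of the reviewed file): a minimal Extract Method sketch of how the flagged learn_full_specific_words() could be split. The helper names _build_specific_words_training_set, _fit_specific_words_classifier and _predict_specific_words are hypothetical.

    def learn_full_specific_words(self, seed_specific_words,
                                  max_non_specific_examples=None, debug=None):
        """Learn specific words given a list of seed specific words."""
        if debug is None:
            debug = False
        if max_non_specific_examples is None:
            max_non_specific_examples = MAX_NON_SPECIFIC_EXAMPLES

        # each cohesive block of the original body becomes a named helper,
        # so the top-level method reads as a short sequence of steps
        X, y = self._build_specific_words_training_set(seed_specific_words,
                                                       max_non_specific_examples)
        clf = self._fit_specific_words_classifier(X, y)
        full_specific_words = self._predict_specific_words(clf)

        if not debug:
            return full_specific_words, clf
        return full_specific_words, clf, X, y

Each helper would then hold one of the loops or the classifier-fitting step from the original body, bringing the per-method condition count well below the reported 9.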

import copy

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models.keyedvectors import KeyedVectors
from scipy.stats import spearmanr
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.svm import LinearSVC
from tqdm import tqdm

from tabulate import tabulate

from ..consts import RANDOM_STATE
from .benchmark import evaluate_words_embedding
from .utils import (
    cosine_similarity, normalize, project_reject_vector, project_vector,
    reject_vector, round_to_extreme, take_two_sides_extreme_sorted,
    update_word_vector,
)


DIRECTION_METHODS = ['single', 'sum', 'pca']
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
FIRST_PC_THRESHOLD = 0.5
MAX_NON_SPECIFIC_EXAMPLES = 1000


class BiasWordsEmbedding:
    """Audit and Adjust a Bias in English Words Embedding.

    :param model: Words embedding model of ``gensim.model.KeyedVectors``
    :param bool only_lower: Whether the words embedding contains
                            only lower case words
    :param bool verbose: Set verbosity
    """

    def __init__(self, model, only_lower=False, verbose=False,
                 identify_direction=False):
        if not isinstance(model, KeyedVectors):
            raise TypeError('model should be of type KeyedVectors, not {}'
                            .format(type(model)))

        # TODO: this is bad Python, ask someone about it
        # probably should be a better design
        # identify_direction doesn't have any meaning
        # for the class BiasWordsEmbedding
        if self.__class__ == __class__ and identify_direction is not False:

[Issue: Comprehensibility Best Practice. The variable __class__ does not seem to be defined.]

            raise ValueError('identify_direction must be False'
                             ' for an instance of {}'
                             .format(__class__))

        self.model = model

        # TODO: write unittest for when it is False
        self.only_lower = only_lower

        self._verbose = verbose

        self.direction = None
        self.positive_end = None
        self.negative_end = None

    def __copy__(self):
        bias_words_embedding = self.__class__(self.model,
                                              self.only_lower,
                                              self._verbose,
                                              identify_direction=False)
        bias_words_embedding.direction = copy.deepcopy(self.direction)
        bias_words_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_words_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_words_embedding

    def __deepcopy__(self, memo):
        bias_words_embedding = copy.copy(self)
        bias_words_embedding.model = copy.deepcopy(bias_words_embedding.model)
        return bias_words_embedding

    def __getitem__(self, key):
        return self.model[key]

    def __contains__(self, item):
        return item in self.model

    def _filter_words_by_model(self, words):
        return [word for word in words if word in self]

    def _is_direction_identified(self):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))

    # There is a mistake in the article
    # it is written (section 5.1):
    # "To identify the gender subspace, we took the ten gender pair difference
    # vectors and computed its principal components (PCs)"
    # however in the source code:
    # https://github.com/tolga-b/debiaswe/blob/10277b23e187ee4bd2b6872b507163ef4198686b/debiaswe/we.py#L235-L245
    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
        matrix = []

        for word1, word2 in definitional_pairs:
            vector1 = normalize(self[word1])
            vector2 = normalize(self[word2])

            center = (vector1 + vector2) / 2

            matrix.append(vector1 - center)
            matrix.append(vector2 - center)

        pca = PCA(n_components=n_components)
        pca.fit(matrix)

        if self._verbose:
            table = enumerate(pca.explained_variance_ratio_, start=1)
            headers = ['Principal Component',
                       'Explained Variance Ratio']
            print(tabulate(table, headers=headers))

        return pca

    # TODO: add the SVD method from section 6 step 1
    # It seems there is a mistake there, I think it is the same as PCA
    # just with replacing it with SVD
    def _identify_direction(self, positive_end, negative_end,
                            definitional, method='pca'):
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))

        if positive_end == negative_end:
            raise ValueError('positive_end and negative_end'
                             ' should be different, and not the same "{}"'
                             .format(positive_end))
        if self._verbose:
            print('Identify direction using {} method...'.format(method))

        direction = None

        if method == 'single':
            direction = normalize(normalize(self[definitional[0]])
                                  - normalize(self[definitional[1]]))

        elif method == 'sum':
            group1_sum_vector = np.sum([self[word]
                                        for word in definitional[0]], axis=0)
            group2_sum_vector = np.sum([self[word]
                                        for word in definitional[1]], axis=0)

            diff_vector = (normalize(group1_sum_vector)
                           - normalize(group2_sum_vector))

            direction = normalize(diff_vector)

        elif method == 'pca':
            pca = self._identify_subspace_by_pca(definitional, 10)
            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
                raise RuntimeError('The explained variance'
                                   ' of the first principal component should be'
                                   ' at least {}, but it is {}'
                                   .format(FIRST_PC_THRESHOLD,
                                           pca.explained_variance_ratio_[0]))
            direction = pca.components_[0]

            # if direction is opposite (e.g. we cannot control
            # what the PCA will return)
            ends_diff_projection = cosine_similarity((self[positive_end]
                                                      - self[negative_end]),
                                                     direction)
            if ends_diff_projection < 0:
                direction = -direction  # pylint: disable=invalid-unary-operand-type

        self.direction = direction
        self.positive_end = positive_end
        self.negative_end = negative_end

    def project_on_direction(self, word):
        """Project the normalized vector of the word on the direction.

        :param str word: The word to project
        :return float: The projection scalar
        """

        self._is_direction_identified()

        vector = self[word]
        projection_score = self.model.cosine_similarities(self.direction,
                                                          [vector])[0]
        return projection_score

    def _calc_projection_scores(self, words):
        self._is_direction_identified()

        df = pd.DataFrame({'word': words})

        # TODO: maybe using cosine_similarities on all the vectors?
        # it might be faster
        df['projection'] = df['word'].apply(self.project_on_direction)
        df = df.sort_values('projection', ascending=False)

        return df

    def plot_projection_scores(self, words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalar of words on the direction.

        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection'] = projections_df['projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))

        most_extreme_projection = (projections_df['projection']
                                   .abs()
                                   .max()
                                   .round(1))

        sns.barplot(x='projection', y='word', data=projections_df,
                    palette=projections_df['color'])

        plt.xticks(np.arange(-most_extreme_projection,
                             most_extreme_projection + axis_projection_step,
                             axis_projection_step))
        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))

        plt.xlabel('Direction Projection')
        plt.ylabel('Words')

        return ax

    def plot_dist_projections_on_direction(self, word_groups, ax=None):
        """Plot the distribution of projection scalars on the direction.

        :param dict word_groups: The word groups to project
        :return: The ax object of the plot
        """

        if ax is None:
            _, ax = plt.subplots(1)

        names = sorted(word_groups.keys())

        for name in names:
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
            projections = self.model.cosine_similarities(self.direction,
                                                         vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)

        plt.axvline(0, color='k', linestyle='--')

        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))
        plt.xlabel('Direction Projection')
        plt.ylabel('Density')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        return ax

    @classmethod
    def _calc_bias_across_words_embeddings(cls,
                                           words_embedding_bias_dict,
                                           words):
        """
        Calculate the projections and Spearman rho of words for two words embeddings.

        :param dict words_embedding_bias_dict: ``WordsEmbeddingBias`` objects
                                               as values,
                                               and their names as keys.
        :param list words: Words to be projected.
        :return tuple: Projections and Spearman rho.
        """
        # pylint: disable=W0212
        assert len(words_embedding_bias_dict) == 2, 'Supports only two'\
                                                    ' words embeddings'

        intersection_words = [word for word in words
                              if all(word in web
                                     for web in (words_embedding_bias_dict
                                                 .values()))]

        projections = {name: web._calc_projection_scores(intersection_words)['projection']  # pylint: disable=C0301
                       for name, web in words_embedding_bias_dict.items()}

        df = pd.DataFrame(projections)
        df.index = intersection_words

        rho, _ = spearmanr(*df.transpose().values)
        return df, rho

    @classmethod
    def plot_bias_across_words_embeddings(cls, words_embedding_bias_dict,
                                          words, ax=None, scatter_kwargs=None):
        """
        Plot the projections of the same words from two words embeddings.

        :param dict words_embedding_bias_dict: ``WordsEmbeddingBias`` objects
                                               as values,
                                               and their names as keys.
        :param list words: Words to be projected.
        :param scatter_kwargs: Kwargs for matplotlib.pylab.scatter.
        :type scatter_kwargs: dict or None
        :return: The ax object of the plot
        """
        # pylint: disable=W0212

        df, rho = cls._calc_bias_across_words_embeddings(words_embedding_bias_dict,  # pylint: disable=C0301
                                                         words)

        if ax is None:
            _, ax = plt.subplots(1)

        if scatter_kwargs is None:
            scatter_kwargs = {}

        name1, name2 = words_embedding_bias_dict.keys()

        ax.scatter(x=name1, y=name2, data=df, **scatter_kwargs)

        plt.title('Bias Across Words Embeddings'
                  ' (Spearman Rho = {:0.2f})'.format(rho))

        negative_end = words_embedding_bias_dict[name1].negative_end
        positive_end = words_embedding_bias_dict[name1].positive_end
        plt.xlabel('← {}     {}     {} →'.format(negative_end,
                                                 name1,
                                                 positive_end))
        plt.ylabel('← {}     {}     {} →'.format(negative_end,
                                                 name2,
                                                 positive_end))

        ax_min = round_to_extreme(df.values.min())
        ax_max = round_to_extreme(df.values.max())
        plt.xlim(ax_min, ax_max)
        plt.ylim(ax_min, ax_max)

        return ax

    # TODO: refactor for speed and clarity
    def generate_analogies(self, n_analogies=100, multiple=False,
                           delta=1., restrict_vocab=30000):
        """
        Generate analogies based on the bias direction.

        x - y ~ direction.
        or a:x::b:y when a-b ~ direction.

        ``delta`` is used for semantic coherence. The default value of 1
        corresponds to an angle <= pi/3.

        :param int n_analogies: Number of analogies to generate.
        :param bool multiple: Whether to allow multiple appearances of a word
                              in the analogies.
        :param float delta: Threshold for semantic similarity.
                            The maximal distance between x and y.
        :param int restrict_vocab: The vocabulary size to use.
        :return: Data Frame of analogies (x, y), their distances,
                 and their cosine similarity scores
        """

        # pylint: disable=C0301,R0914

        self._is_direction_identified()

        restrict_vocab_vectors = self.model.vectors[:restrict_vocab]

        normalized_vectors = (restrict_vocab_vectors
                              / np.linalg.norm(restrict_vocab_vectors, axis=1)[:, None])

        pairs_distances = euclidean_distances(normalized_vectors, normalized_vectors)
        pairs_indices = np.array(np.nonzero(
            ((pairs_distances < delta)
             & (pairs_distances != 0)))).T
        x_vectors = np.take(normalized_vectors, pairs_indices[:, 0], axis=0)
        y_vectors = np.take(normalized_vectors, pairs_indices[:, 1], axis=0)

        x_minus_y_vectors = x_vectors - y_vectors
        normalized_x_minus_y_vectors = (x_minus_y_vectors
                                        / np.linalg.norm(x_minus_y_vectors, axis=1)[:, None])

        cos_distances = normalized_x_minus_y_vectors @ self.direction

        sorted_cos_distances_indices = np.argsort(cos_distances)[::-1]

        sorted_cos_distances_indices_iter = iter(sorted_cos_distances_indices)

        analogies = []
        generated_words = set()

        while len(analogies) < n_analogies:
            cos_distance_index = next(sorted_cos_distances_indices_iter)
            pairs_index = pairs_indices[cos_distance_index]
            word_x, word_y = [self.model.index2word[index]
                              for index in pairs_index]

            if multiple or (not multiple
                            and (word_x not in generated_words
                                 and word_y not in generated_words)):
                analogies.append({'x': word_x,
                                  'y': word_y,
                                  'score': cos_distances[cos_distance_index],
                                  'distance': pairs_distances[tuple(pairs_index)]})
            generated_words.add(word_x)
            generated_words.add(word_y)

        df = pd.DataFrame(analogies)
        df = df[['x', 'y', 'distance', 'score']]
        return df

    def calc_direct_bias(self, neutral_words, c=None):
        """Calculate the direct bias.

        Based on the projection of neutral words on the direction.

        :param list neutral_words: List of neutral words
        :param c: Strictness of bias measuring
        :type c: float or None
        :return: The direct bias
        """

        if c is None:
            c = 1

        projections = self._calc_projection_scores(neutral_words)['projection']
        direct_bias_terms = np.abs(projections) ** c
        direct_bias = direct_bias_terms.sum() / len(neutral_words)

        return direct_bias

    def calc_indirect_bias(self, word1, word2):
        """Calculate the indirect bias between two words.

        Based on the amount of shared projection of the words on the direction.

        Also called PairBias.
        :param str word1: First word
        :param str word2: Second word
        :return: The indirect bias between the two words
        """

        self._is_direction_identified()

        vector1 = normalize(self[word1])
        vector2 = normalize(self[word2])

        perpendicular_vector1 = reject_vector(vector1, self.direction)
        perpendicular_vector2 = reject_vector(vector2, self.direction)

        inner_product = vector1 @ vector2
        perpendicular_similarity = cosine_similarity(perpendicular_vector1,
                                                     perpendicular_vector2)

        indirect_bias = ((inner_product - perpendicular_similarity)
                         / inner_product)
        return indirect_bias

    def generate_closest_words_indirect_bias(self,
                                             neutral_positive_end,
                                             neutral_negative_end,
                                             words=None, n_extreme=5):
        """
        Generate the closest words to a neutral direction and their indirect bias.

        :param str neutral_positive_end: A word that defines the positive side
                                         of the neutral direction.
        :param str neutral_negative_end: A word that defines the negative side
                                         of the neutral direction.
        :param list words: List of words to project on the neutral direction.
        :param int n_extreme: The number of most extreme words
                              (positive and negative) to show.
        :return: Data Frame of the most extreme words
                 with their projection scores and indirect biases.
        """

        neutral_direction = normalize(self[neutral_positive_end]
                                      - self[neutral_negative_end])

        vectors = [normalize(self[word]) for word in words]
        df = (pd.DataFrame([{'word': word,
                             'projection': vector @ neutral_direction}
                            for word, vector in zip(words, vectors)])
              .sort_values('projection', ascending=False))

        df = take_two_sides_extreme_sorted(df, n_extreme,
                                           'end',
                                           neutral_positive_end,
                                           neutral_negative_end)

        df['indirect_bias'] = df.apply(lambda r:
                                       self.calc_indirect_bias(r['word'],
                                                               r['end']),
                                       axis=1)

        df = df.set_index(['end', 'word'])
        df = df[['projection', 'indirect_bias']]

        return df

    def _extract_neutral_words(self, specific_words):
        extended_specific_words = set()

        # because our specific_full data was trained on a partial words embedding
        for word in specific_words:
            extended_specific_words.add(word)
            extended_specific_words.add(word.lower())
            extended_specific_words.add(word.upper())
            extended_specific_words.add(word.title())

        neutral_words = [word for word in self.model.vocab
                         if word not in extended_specific_words]

        return neutral_words

    def _neutralize(self, neutral_words):
        self._is_direction_identified()

        if self._verbose:
            neutral_words_iter = tqdm(neutral_words)
        else:
            neutral_words_iter = iter(neutral_words)

        for word in neutral_words_iter:
            neutralized_vector = reject_vector(self[word],
                                               self.direction)
            update_word_vector(self.model, word, neutralized_vector)

        self.model.init_sims(replace=True)

    def _equalize(self, equality_sets):
        # pylint: disable=R0914

        self._is_direction_identified()

        if self._verbose:
            words_data = []

        for equality_set_index, equality_set_words in enumerate(equality_sets):
            equality_set_vectors = [normalize(self[word])
                                    for word in equality_set_words]
            center = np.mean(equality_set_vectors, axis=0)
            (projected_center,
             rejected_center) = project_reject_vector(center,
                                                      self.direction)
            scaling = np.sqrt(1 - np.linalg.norm(rejected_center)**2)

            for word, vector in zip(equality_set_words, equality_set_vectors):
                projected_vector = project_vector(vector, self.direction)

                projected_part = normalize(projected_vector - projected_center)

                # In the code it is different from Bolukbasi
                # It behaves the same only for equality_sets
                # with size of 2 (pairs) - not sure!
                # However, my code is the same as the article
                # equalized_vector = rejected_center + scaling * self.direction
                # https://github.com/tolga-b/debiaswe/blob/10277b23e187ee4bd2b6872b507163ef4198686b/debiaswe/debias.py#L36-L37
                # For pairs, projected_part_vector1 == -projected_part_vector2,
                # and this is the same as
                # projected_part_vector1 == self.direction
                equalized_vector = rejected_center + scaling * projected_part

                update_word_vector(self.model, word, equalized_vector)

                if self._verbose:

[Issue: The variable words_data does not seem to be defined when self._verbose (checked at the top of _equalize) is False. Are you sure this can never be the case?]

                    words_data.append({
                        'equality_set_index': equality_set_index,
                        'word': word,
                        'scaling': scaling,
                        'projected_scalar': vector @ self.direction,
                        'equalized_projected_scalar': (equalized_vector
                                                       @ self.direction),
                    })

        if self._verbose:
            print('Equalize Words Data '
                  '(all equal for 1-dim bias space (direction)):')
            words_data_df = (pd.DataFrame(words_data)
                             .set_index(['equality_set_index', 'word']))
            print(tabulate(words_data_df, headers='keys'))

        self.model.init_sims(replace=True)

    def debias(self, method='hard', neutral_words=None, equality_sets=None,
               inplace=True):
        """Debias the words embedding.

        :param str method: The method of debiasing.
        :param list neutral_words: List of neutral words
                                   for the neutralize step
        :param list equality_sets: List of equality sets,
                                   for the equalize step.
                                   The sets represent the direction.
        :param bool inplace: Whether to debias the object inplace
                             or return a new one

        .. warning::

          After calling `debias`,
          all the vectors of the words embedding
          will be normalized to unit length.

        """

        # pylint: disable=W0212
        if inplace:
            bias_words_embedding = self
        else:
            bias_words_embedding = copy.deepcopy(self)

        if method not in DEBIAS_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DEBIAS_METHODS, method))

        if method in ['hard', 'neutralize']:
            if self._verbose:
                print('Neutralize...')
            bias_words_embedding._neutralize(neutral_words)

        if method == 'hard':
            if self._verbose:
                print('Equalize...')
            bias_words_embedding._equalize(equality_sets)

        if inplace:
            return None
        else:
            return bias_words_embedding

    def evaluate_words_embedding(self,
                                 kwargs_word_pairs=None,
                                 kwargs_word_analogies=None):
        """
        Evaluate word pairs tasks and word analogies tasks.

        :param model: Words embedding.
        :param kwargs_word_pairs: Kwargs for
                                  evaluate_word_pairs
                                  method.
        :type kwargs_word_pairs: dict or None
        :param kwargs_word_analogies: Kwargs for
                                      evaluate_word_analogies
                                      method.
        :type kwargs_word_analogies: dict or None
        :return: Tuple of DataFrames with the evaluation results.
        """

        return evaluate_words_embedding(self.model,
                                        kwargs_word_pairs,
                                        kwargs_word_analogies)

    def learn_full_specific_words(self, seed_specific_words,
                                  max_non_specific_examples=None, debug=None):
        """Learn specific words given a list of seed specific words.

        Using Linear SVM.

        :param list seed_specific_words: List of seed specific words
        :param int max_non_specific_examples: The number of non-specific words
                                              to sample for training
        :return: List of learned specific words and the classifier object
        """

        if debug is None:
            debug = False

        if max_non_specific_examples is None:
            max_non_specific_examples = MAX_NON_SPECIFIC_EXAMPLES

        data = []
        non_specific_example_count = 0

        for word in self.model.vocab:
            is_specific = word in seed_specific_words

            if not is_specific:
                non_specific_example_count += 1
                if non_specific_example_count <= max_non_specific_examples:
                    data.append((self[word], is_specific))
            else:
                data.append((self[word], is_specific))

        np.random.seed(RANDOM_STATE)
        np.random.shuffle(data)

        X, y = zip(*data)

        X = np.array(X)
        X /= np.linalg.norm(X, axis=1)[:, None]

        y = np.array(y).astype('int')

        clf = LinearSVC(C=1, class_weight='balanced',
                        random_state=RANDOM_STATE)

        clf.fit(X, y)

        full_specific_words = []
        for word in self.model.vocab:
            vector = [normalize(self[word])]
            if clf.predict(vector):
                full_specific_words.append(word)

        if not debug:
            return full_specific_words, clf

        return full_specific_words, clf, X, y
733