Passed
Push — master ( 999448...189cd4 )
by Shlomi
01:46
created

ethically.we.core.BiasWordsEmbedding.__init__()   A

Complexity
    Conditions    2
Size
    Total Lines   15
    Code Lines    10
Duplication
    Lines         0
    Ratio         0 %
Importance
    Changes       0

Metric   Value
cc       2
eloc     10
nop      4
dl       0
loc      15
rs       9.9
c        0
b        0
f        0
import copy
import os
import warnings

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models.keyedvectors import KeyedVectors
from pkg_resources import resource_filename
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from tqdm import tqdm

from tabulate import tabulate

from ..consts import RANDOM_STATE
from .utils import (
    cosine_similarity, normalize, project_reject_vector, project_vector,
    reject_vector, update_word_vector,
)


DIRECTION_METHODS = ['single', 'sum', 'pca']
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
FIRST_PC_THRESHOLD = 0.5
MAX_NON_SPECIFIC_EXAMPLES = 1000


class BiasWordsEmbedding:

    def __init__(self, model, only_lower=True, verbose=False):
        if not isinstance(model, KeyedVectors):
            raise TypeError('model should be of type KeyedVectors, not {}'
                            .format(type(model)))

        self.model = model

        # TODO: write a unit test for when it is False
        self.only_lower = only_lower

        self._verbose = verbose

        self.direction = None
        self.positive_end = None
        self.negative_end = None

    def __copy__(self):
        # pass the configuration flags so the copy keeps the same settings
        bias_words_embedding = self.__class__(self.model,
                                              self.only_lower,
                                              self._verbose)
        bias_words_embedding.direction = copy.deepcopy(self.direction)
        bias_words_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_words_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_words_embedding

    def __deepcopy__(self, memo):
        bias_words_embedding = copy.copy(self)
        bias_words_embedding.model = copy.deepcopy(bias_words_embedding.model)
        return bias_words_embedding

    def __getitem__(self, key):
        return self.model[key]

    def __contains__(self, item):
        return item in self.model

    def _filter_words_by_model(self, words):
        return [word for word in words if word in self]

    def _is_direction_identified(self):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))

    # There is a mistake in the article. Section 5.1 says:
    # "To identify the gender subspace, we took the ten gender pair difference
    # vectors and computed its principal components (PCs)",
    # but in the reference source code the PCA is fit on the difference of
    # each pair word from the pair's center (two vectors per pair), which is
    # also what this method does:
    # https://github.com/tolga-b/debiaswe/blob/10277b23e187ee4bd2b6872b507163ef4198686b/debiaswe/we.py#L235-L245
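    # In other words (a sketch of what the code below computes): for each
    # definitional pair (a, b) with normalized vectors v_a, v_b and center
    # mu = (v_a + v_b) / 2, the rows of the matrix are v_a - mu and v_b - mu,
    # and the bias direction is later taken as the first principal component
    # (see _identify_direction below).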
    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
        matrix = []

        for word1, word2 in definitional_pairs:
            vector1 = normalize(self[word1])
            vector2 = normalize(self[word2])

            center = (vector1 + vector2) / 2

            matrix.append(vector1 - center)
            matrix.append(vector2 - center)

        pca = PCA(n_components=n_components)
        pca.fit(matrix)

        if self._verbose:
            table = enumerate(pca.explained_variance_ratio_, start=1)
            headers = ['Principal Component',
                       'Explained Variance Ratio']
            print(tabulate(table, headers=headers))

        return pca

    # TODO: add the SVD method from section 6, step 1.
    # It seems there is a mistake there; I think it is the same as PCA,
    # just with SVD in its place.
    def _identify_direction(self, positive_end, negative_end,
                            definitional, method='pca'):
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))

        if positive_end == negative_end:
            raise ValueError('positive_end and negative_end'
                             ' should be different, and not the same "{}"'
                             .format(positive_end))

        print('Identify direction using {} method...'.format(method))

        direction = None

        if method == 'single':
            direction = normalize(normalize(self[definitional[0]])
                                  - normalize(self[definitional[1]]))

        elif method == 'sum':
            groups = list(zip(*definitional))

            group1_sum_vector = np.sum([self[word]
                                        for word in groups[0]], axis=0)
            group2_sum_vector = np.sum([self[word]
                                        for word in groups[1]], axis=0)

            diff_vector = (normalize(group1_sum_vector)
                           - normalize(group2_sum_vector))

            direction = normalize(diff_vector)

        elif method == 'pca':
            pca = self._identify_subspace_by_pca(definitional, 10)
            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
                raise RuntimeError('The explained variance'
                                   ' of the first principal component should'
                                   ' be at least {}, but it is {}'
                                   .format(FIRST_PC_THRESHOLD,
                                           pca.explained_variance_ratio_[0]))
            direction = pca.components_[0]

        # flip the direction if it came out opposite
        # (e.g., the sign of what the PCA returns cannot be controlled)
        ends_diff_projection = cosine_similarity((self[positive_end]
                                                  - self[negative_end]),
                                                 direction)
        if ends_diff_projection < 0:
            direction = -direction  # pylint: disable=invalid-unary-operand-type

        self.direction = direction
        self.positive_end = positive_end
        self.negative_end = negative_end

    def project_on_direction(self, word):
        self._is_direction_identified()

        vector = self[word]
        projection_score = self.model.cosine_similarities(self.direction,
                                                          [vector])[0]
        return projection_score

    def _calc_projection_scores(self, words):
        self._is_direction_identified()

        df = pd.DataFrame({'word': words})

        # TODO: maybe use cosine_similarities on all the vectors at once?
        # it might be faster
        df['projection'] = df['word'].apply(self.project_on_direction)
        df = df.sort_values('projection', ascending=False)

        return df

    def plot_projection_scores(self, words,
                               ax=None, axis_projection_step=None):
        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection'] = projections_df['projection'].round(2)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))

        most_extreme_projection = (projections_df['projection']
                                   .abs()
                                   .max()
                                   .round(1))

        sns.barplot(x='projection', y='word', data=projections_df,
                    palette=projections_df['color'])

        plt.xticks(np.arange(-most_extreme_projection, most_extreme_projection,
                             axis_projection_step))
        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))

        plt.xlabel('Direction Projection')
        plt.ylabel('Words')

        return ax

    def plot_dist_projections_on_direction(self, word_groups, ax=None):
        self._is_direction_identified()

        if ax is None:
            _, ax = plt.subplots(1)

        for name, words in word_groups.items():
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
            projections = self.model.cosine_similarities(self.direction,
                                                         vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)

        plt.axvline(0, color='k', linestyle='--')

        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))
        plt.xlabel('Direction Projection')
        plt.ylabel('Density')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        return ax

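    # For reference, the measure computed below follows the article's
    # DirectBias definition (a sketch, in the article's notation):
    #     DirectBias_c = (1 / |N|) * sum over w in N of |cos(w, g)| ** c
    # where N are the neutral words, g is the identified bias direction,
    # and c controls how strict the measure is (c=1 by default here).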
    def calc_direct_bias(self, neutral_words, c=None):
        if c is None:
            c = 1

        projections = self._calc_projection_scores(neutral_words)['projection']
        direct_bias_terms = np.abs(projections) ** c
        direct_bias = direct_bias_terms.sum() / len(neutral_words)

        return direct_bias

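    # For reference, the PairBias / indirect bias of the article is,
    # roughly (a sketch of what the method below computes):
    #     beta(w, v) = (w . v - cos(w_perp, v_perp)) / (w . v)
    # where w, v are the normalized word vectors and w_perp, v_perp are
    # their rejections on (components orthogonal to) the bias direction.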
    def calc_indirect_bias(self, word1, word2):
        """Also known in the article as PairBias."""
        self._is_direction_identified()

        vector1 = normalize(self[word1])
        vector2 = normalize(self[word2])

        perpendicular_vector1 = reject_vector(vector1, self.direction)
        perpendicular_vector2 = reject_vector(vector2, self.direction)

        inner_product = vector1 @ vector2
        perpendicular_similarity = cosine_similarity(perpendicular_vector1,
                                                     perpendicular_vector2)

        indirect_bias = ((inner_product - perpendicular_similarity)
                         / inner_product)
        return indirect_bias

    def _extract_neutral_words(self, specific_words):
        extended_specific_words = set()

        # because our specific_full data was trained on a partial words embedding
        for word in specific_words:
            extended_specific_words.add(word)
            extended_specific_words.add(word.lower())
            extended_specific_words.add(word.upper())
            extended_specific_words.add(word.title())

        neutral_words = [word for word in self.model.vocab
                         if word not in extended_specific_words]

        return neutral_words

    def _neutralize(self, neutral_words):
        self._is_direction_identified()

        if self._verbose:
            neutral_words_iter = tqdm(neutral_words)
        else:
            neutral_words_iter = iter(neutral_words)

        for word in neutral_words_iter:
            neutralized_vector = reject_vector(self[word],
                                               self.direction)
            update_word_vector(self.model, word, neutralized_vector)

        self.model.init_sims(replace=True)

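    # For reference, the equalize step of the article is, roughly
    # (a sketch; this is what the inner loop below computes): for every
    # word w in an equality set with center mu,
    #     w := nu + sqrt(1 - ||nu||^2) * (w_B - mu_B) / ||w_B - mu_B||
    # where nu is the rejection of mu on the bias direction, and w_B, mu_B
    # are the projections of w and mu on the bias direction.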
    def _equalize(self, equality_sets):
        for equality_set_words in equality_sets:
            equality_set_vectors = [normalize(self[word])
                                    for word in equality_set_words]
            center = np.mean(equality_set_vectors, axis=0)
            (projected_center,
             rejected_center) = project_reject_vector(center,
                                                      self.direction)

            for word, vector in zip(equality_set_words, equality_set_vectors):
                projected_vector = project_vector(vector, self.direction)

                projected_part = normalize(projected_vector - projected_center)
                scaling = np.sqrt(1 - np.linalg.norm(rejected_center)**2)

                # TODO: in the original debiaswe code it is different - why?
                # equalized_vector = rejected_center + scaling * self.direction
                # https://github.com/tolga-b/debiaswe/blob/10277b23e187ee4bd2b6872b507163ef4198686b/debiaswe/debias.py#L36-L37
                equalized_vector = rejected_center + scaling * projected_part

                update_word_vector(self.model, word, equalized_vector)

        self.model.init_sims(replace=True)

    def debias(self, method='hard', neutral_words=None, equality_sets=None,
               inplace=True):
        # pylint: disable=W0212
        # TODO: the 'soft' debias method is accepted but not implemented yet
        if inplace:
            bias_words_embedding = self
        else:
            bias_words_embedding = copy.deepcopy(self)

        if method not in DEBIAS_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DEBIAS_METHODS, method))

        if method in ['hard', 'neutralize']:
            if self._verbose:
                print('Neutralize...')
            bias_words_embedding._neutralize(neutral_words)

        if method == 'hard':
            if self._verbose:
                print('Equalize...')
            bias_words_embedding._equalize(equality_sets)

        if inplace:
            return None
        else:
            return bias_words_embedding

    def evaluate_words_embedding(self):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=FutureWarning)

            if self._verbose:
                print('Evaluate word pairs...')
            word_pairs_path = resource_filename(__name__,
                                                os.path.join('data',
                                                             'evaluation',
                                                             'wordsim353.tsv'))
            word_pairs_result = self.model.evaluate_word_pairs(word_pairs_path)

            if self._verbose:
                print('Evaluate analogies...')
            analogies_path = resource_filename(__name__,
                                               os.path.join('data',
                                                            'evaluation',
                                                            'questions-words.txt'))  # pylint: disable=C0301
            analogies_result = self.model.evaluate_word_analogies(analogies_path)  # pylint: disable=C0301

        if self._verbose:
            print()
        print('From Gensim')
        print()
        print('-' * 30)
        print()
        print('Word Pairs Result - WordSimilarity-353:')
        print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        print('Pearson correlation coefficient:', word_pairs_result[0])
        print('Spearman rank-order correlation coefficient'
              ' between the similarities from the dataset'
              ' and the similarities produced by the model itself:',
              word_pairs_result[1])
        print('Ratio of pairs with unknown words:', word_pairs_result[2])
        print()
        print('-' * 30)
        print()
        print('Analogies Result')
        print('~~~~~~~~~~~~~~~~')
        print('Overall evaluation score:', analogies_result[0])

    def learn_full_specific_words(self, seed_specific_words,
                                  max_non_specific_examples=None, debug=None):

        if debug is None:
            debug = False

        if max_non_specific_examples is None:
            max_non_specific_examples = MAX_NON_SPECIFIC_EXAMPLES

        data = []
        non_specific_example_count = 0

        for word in self.model.vocab:
            is_specific = word in seed_specific_words

            if not is_specific:
                non_specific_example_count += 1
                if non_specific_example_count <= max_non_specific_examples:
                    data.append((self[word], is_specific))
            else:
                data.append((self[word], is_specific))

        np.random.seed(RANDOM_STATE)
        np.random.shuffle(data)

        X, y = zip(*data)

        X = np.array(X)
        X /= np.linalg.norm(X, axis=1)[:, None]

        y = np.array(y).astype('int')

        clf = LinearSVC(C=1, class_weight='balanced',
                        random_state=RANDOM_STATE)

        clf.fit(X, y)

        full_specific_words = []
        for word in self.model.vocab:
            vector = [normalize(self[word])]
            if clf.predict(vector):
                full_specific_words.append(word)

        if not debug:
            return full_specific_words, clf

        return full_specific_words, clf, X, y
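
A minimal usage sketch of the class above; the embedding file path and the
word lists below are illustrative assumptions, not part of the module:

    from gensim.models import KeyedVectors

    # any word2vec-format embedding file (the path here is hypothetical)
    model = KeyedVectors.load_word2vec_format('embedding.bin', binary=True)

    bias_we = BiasWordsEmbedding(model, verbose=True)

    # identify a gender direction from definitional pairs
    # (the PCA variant needs at least five pairs, since it fits 10 components)
    bias_we._identify_direction('she', 'he',
                                definitional=[('woman', 'man'),
                                              ('she', 'he'),
                                              ('her', 'his'),
                                              ('daughter', 'son'),
                                              ('mother', 'father'),
                                              ('girl', 'boy')],
                                method='pca')

    # projection of a single word and direct bias of a word list
    print(bias_we.project_on_direction('nurse'))
    print(bias_we.calc_direct_bias(['nurse', 'engineer', 'teacher']))

    # hard debias on a copy: neutralize the given words, equalize the pairs
    debiased = bias_we.debias(method='hard',
                              neutral_words=['nurse', 'engineer', 'teacher'],
                              equality_sets=[('she', 'he'), ('her', 'his')],
                              inplace=False)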