Completed
Push — master (748d0e...582254)
by Wouter
created 03:55

iwe_kernel_mean_matching() (grade: B)

Complexity

Conditions 5

Size

Total Lines 65

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 18
CRAP Score 5.0729

Importance

Changes 0
Metric Value
cc 5
c 0
b 0
f 0
dl 0
loc 65
ccs 18
cts 21
cp 0.8571
crap 5.0729
rs 8.2662
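
For reference, the reported CRAP value is consistent with the standard CRAP formula, crap = cc^2 * (1 - coverage)^3 + cc. A quick sanity check in Python, assuming the ccs and cts metrics above are covered and total statements (so the coverage fraction is 18/21, reported as cp = 0.8571):

cc = 5             # cyclomatic complexity (metric 'cc' above)
cov = 18.0 / 21.0  # assumed statement coverage: ccs / cts, i.e. cp = 0.8571
crap = cc**2 * (1 - cov)**3 + cc
print(round(crap, 4))  # 5.0729, matching the reported CRAP score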

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, if a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.

Commonly applied refactorings include:

- Extract Method (a minimal sketch follows)
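
A minimal before/after sketch of Extract Method; the function names here are illustrative, not taken from the project below:

# Before: one method doing two things, with a comment marking the second.
def invoice_total(orders):
    total = sum(order.amount for order in orders)
    # apply bulk discount for large orders
    if len(orders) > 10:
        total *= 0.95
    return total

# After: the commented block becomes its own function, named after the comment.
def apply_bulk_discount(total, n_orders):
    return total * 0.95 if n_orders > 10 else total

def invoice_total(orders):
    total = sum(order.amount for order in orders)
    return apply_bulk_discount(total, len(orders))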

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename
from cvxopt import matrix, solvers

from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        Examples
        --------
        >>> clf = ImportanceWeightedClassifier()

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained; kept private so the attribute does
        # not shadow the is_trained() method below
        self._is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_ratio_gaussians(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check that both covariance matrices are positive-definite
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_densities(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute probabilities based on kernel density estimates per domain
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_logistic_discrimination(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor (note: scikit-learn's C is the inverse of
        # the regularization strength)
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to target using cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0],
                                  method='predict_proba')[:, 1]

        # Return predictions for source samples
        return preds[:N]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_nearest_neighbours(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distance between samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell: for each
        # target sample, find its nearest source sample and histogram over
        # the source indices
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_mean_matching(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Check for non-negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions on squared distances
            KXX = np.exp(-KXX**2 / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ**2 / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = float(N)/M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT, which solves min_w 1/2 w'Qw + p'w subject to
        # Gw <= h; kernel mean matching minimizes 1/2 w'KXX w - KXZ'w, so
        # the linear term enters with a minus sign
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1.*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N/np.sqrt(N) + N], ndmin=2),
                                   np.array([N/np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train the chosen classifier with sample weights; the loss function
        # was already validated in __init__
        self.clf.fit(X, y, sample_weight=w)

        # Mark classifier as trained
        self._is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)
        u_pred = clf.predict(Z)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self._is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss, threshold the regression outputs to labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self._is_trained
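
Returning to the report's finding: the Extract Method advice above applies directly to the flagged iwe_kernel_mean_matching(), whose body already falls into commented phases (kernel computation, QP setup, solving). A minimal sketch of one possible decomposition, written here as free functions for brevity; the helper names and the simplified sqrt(N) bounds are illustrative, not part of the library:

import numpy as np
from scipy.spatial.distance import cdist
from cvxopt import matrix, solvers


def kmm_kernels(X, Z, bandwidth=1.0):
    """Compute the source RBF kernel and the collapsed source-target term."""
    N, M = X.shape[0], Z.shape[0]
    KXX = np.exp(-cdist(X, X, 'sqeuclidean') / (2*bandwidth**2))
    KXZ = np.exp(-cdist(X, Z, 'sqeuclidean') / (2*bandwidth**2))
    return KXX, float(N)/M * np.sum(KXZ, axis=1)


def solve_kmm_qp(KXX, kappa):
    """Solve the kernel mean matching QP: min 1/2 w'Kw - kappa'w."""
    N = KXX.shape[0]
    Q = matrix(KXX, tc='d')
    p = matrix(-kappa, tc='d')
    # Constraints: |sum(w) - N| <= sqrt(N) and w >= 0
    G = matrix(np.concatenate((np.ones((1, N)), -np.ones((1, N)),
                               -np.eye(N)), axis=0), tc='d')
    h = matrix(np.concatenate((np.array([[N + np.sqrt(N)]]),
                               np.array([[np.sqrt(N) - N]]),
                               np.zeros((N, 1))), axis=0), tc='d')
    return np.array(solvers.qp(Q, p, G, h)['x'])[:, 0]


def iwe_kernel_mean_matching(X, Z, bandwidth=1.0):
    """Kernel mean matching split into two short, nameable steps."""
    if X.shape[1] != Z.shape[1]:
        raise ValueError('Dimensionalities of X and Z should be equal.')
    KXX, kappa = kmm_kernels(X, Z, bandwidth)
    return solve_kmm_qp(KXX, kappa)

Each helper stays well under the 65-line size flagged above, can be tested in isolation, and gives the comments in the original body a place to become names.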