Inspection completed — push to master ( f3d068...88fa67 ) by Wouter, created 03:44

iwe_kernel_mean_matching()   B

Complexity:     Conditions 5
Size:           Total Lines 58
Duplication:    Lines 0, Ratio 0 %
Code Coverage:  Tests 18, CRAP Score 5.0729
Importance:     Changes 0

Raw metrics:

  Metric   Value
  cc       5
  c        0
  b        0
  f        0
  dl       0
  loc      58
  ccs      18
  cts      21
  cp       0.8571
  crap     5.0729
  rs       8.392
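
For context, the CRAP score above is consistent with the commonly used formula CRAP(m) = cc(m)² · (1 − coverage(m))³ + cc(m); a quick check with the reported cc = 5 and cp = 0.8571 (assuming that is the formula behind this report):

cc, cp = 5, 0.8571
crap = cc ** 2 * (1 - cp) ** 3 + cc
print(round(crap, 4))  # ~5.073, matching the reported 5.0729 up to rounding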

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, when a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment as a starting point for the new method's name.

Commonly applied refactorings include Extract Method, as sketched below for the flagged method.
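
As an illustration (not part of the library; the helper names _compute_kmm_kernels and _solve_kmm_qp are hypothetical), the flagged iwe_kernel_mean_matching() could be shortened by extracting the kernel construction and the quadratic-program setup into helpers, roughly as follows:

# Hypothetical Extract Method sketch; mirrors the code in the listing below.
import numpy as np
from scipy.spatial.distance import cdist
from cvxopt import matrix, solvers


def _compute_kmm_kernels(X, Z, kernel_type='rbf', bandwidth=1):
    """Pairwise kernels between source samples and source/target samples."""
    KXX = cdist(X, X, metric='euclidean')
    KXZ = cdist(X, Z, metric='euclidean')
    if kernel_type == 'rbf':
        KXX = np.exp(-KXX / (2 * bandwidth ** 2))
        KXZ = np.exp(-KXZ / (2 * bandwidth ** 2))
    return KXX, KXZ


def _solve_kmm_qp(KXX, kappa, N):
    """Solve the kernel mean matching quadratic program for the weights."""
    Q = matrix(KXX, tc='d')
    p = matrix(kappa, tc='d')
    G = matrix(np.concatenate((np.ones((1, N)), -np.ones((1, N)),
                               -np.eye(N)), axis=0), tc='d')
    h = matrix(np.concatenate((np.array([N / np.sqrt(N) + N], ndmin=2),
                               np.array([N / np.sqrt(N) - N], ndmin=2),
                               np.zeros((N, 1))), axis=0), tc='d')
    sol = solvers.qp(Q, p, G, h)
    return np.array(sol['x'])[:, 0]

The method itself would then reduce to its shape checks, a call to _compute_kmm_kernels, the collapse of KXZ into kappa, and a call to _solve_kmm_qp, which also makes the QP construction testable in isolation. The full source listing follows.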

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename
from cvxopt import matrix, solvers

from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.

    Examples
    --------
    >>> X = np.random.randn(10, 2)
    >>> y = np.hstack((-np.ones((5,)), np.ones((5,))))
    >>> Z = np.random.randn(10, 2)
    >>> clf = ImportanceWeightedClassifier()
    >>> clf.fit(X, y, Z)
    >>> u_pred = clf.predict(Z)

    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check that both covariance matrices are positive-definite
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

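    # In other words, the returned weights are the Gaussian density ratio
    #   w(x) = N(x; mu_Z, Si_Z) / N(x; mu_X, Si_X),
    # i.e. how much more probable each source sample is under the fitted
    # target Gaussian than under the fitted source Gaussian.
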
    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute probabilities of source samples under target and source
        # kernel density estimates
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to the target domain using
        # cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0], method='predict_proba')

        # Return target-domain probabilities for the source samples
        return preds[:N, 1]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distances between source and target samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell: for every
        # target sample, find its nearest source sample and histogram the
        # resulting indices over the N source samples
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Check for negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = N/M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT
        Q = matrix(KXX, tc='d')
        p = matrix(KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N/np.sqrt(N) + N], ndmin=2),
                                   np.array([N/np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

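    # Spelled out, the quadratic program solved above (CVXOPT's solvers.qp
    # minimizes (1/2)*w'Qw + p'w subject to Gw <= h) is:
    #
    #   minimize    (1/2) * w' K_XX w  +  kappa' w
    #   subject to  w >= 0,
    #               N - sqrt(N) <= sum_i w_i <= N + sqrt(N),
    #
    # where kappa_i = (N/M) * sum_j k(x_i, z_j). The weights are therefore
    # non-negative and their total mass stays close to the number of source
    # samples N.
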
    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train a weighted classifier
        if self.loss == 'logistic':
            # Logistic regression model with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'quadratic':
            # Least-squares model with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'hinge':
            # Linear support vector machine with sample weights
            self.clf.fit(X, y, w)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Call scikit's predict function
        preds = self.clf.predict(Z)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.is_trained
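
For context, a minimal end-to-end use of the estimator this report flags, on synthetic data (assumes the ImportanceWeightedClassifier class from the listing above is importable; adjust the import to your package layout):

import numpy as np
# from <your_package> import ImportanceWeightedClassifier  # adjust to your layout

X = np.random.randn(20, 2)                        # source samples
y = np.concatenate((np.zeros(10), np.ones(10)))   # source labels
Z = np.random.randn(30, 2) + 0.5                  # shifted target samples

clf = ImportanceWeightedClassifier(iwe='kmm', kernel_type='rbf', bandwidth=1)
clf.fit(X, y, Z)        # fit() calls iwe_kernel_mean_matching(X, Z) internally
preds = clf.predict(Z)  # predicted labels for the target samples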