Completed
Push — master (f3d068...88fa67) by Wouter, created 03:44

combine_class_covariances()   grade: B

Complexity:    Conditions 3
Size:          Total Lines 33
Duplication:   Lines 0, Ratio 0 %
Code Coverage: Tests 6, CRAP Score 3.0261
Importance:    Changes 1, Bugs 0, Features 0
Metric   Value
cc       3
c        1
b        0
f        0
dl       0
loc      33
ccs      6
cts      7
cp       0.8571
crap     3.0261
rs       8.8571
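The CRAP figure is consistent with the commonly used formula crap = cc^2 * (1 - coverage)^3 + cc; a quick check against the numbers above (assuming that formula, which the report does not state explicitly):

# Sanity check of the reported CRAP score, assuming the standard formula
# crap = cc^2 * (1 - coverage)^3 + cc.
cc = 3           # cyclomatic complexity ('cc')
cp = 6.0 / 7.0   # coverage proportion ('cp' = ccs / cts)
print(cc ** 2 * (1 - cp) ** 3 + cc)   # ~3.026, close to the reported 3.0261
                                      # (small differences presumably from
                                      # rounding inside the tool)

The full module under review follows.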
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import numpy.linalg as al
from scipy.stats import multivariate_normal as mvn
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import one_hot, regularize_matrix


class TargetContrastivePessimisticClassifier(object):
    """
    Classifiers based on Target Contrastive Pessimistic Risk minimization.

    Methods contain models, risk functions, parameter estimation, etc.
    """

    def __init__(self, loss='lda', l2=1.0, max_iter=500, tolerance=1e-12,
                 learning_rate=1.0, rate_decay='linear', verbosity=0):
        """
        Select a particular type of TCPR classifier.

        Parameters
        ----------
        loss : str
            loss function for TCP Risk, options: 'lda',
            'linear discriminant analysis', 'qda',
            'quadratic discriminant analysis' (def: 'lda')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        max_iter : int
            maximum number of iterations for optimization (def: 500)
        tolerance : float
            convergence criterion on the TCP parameters (def: 1e-12)
        learning_rate : float
            parameter for size of update of gradient (def: 1.0)
        rate_decay : str
            type of learning rate decay, options: 'linear', 'quadratic',
            'geometric', 'exponential' (def: 'linear')
        verbosity : int
            amount of progress reporting (def: 0)

        Returns
        -------
        None

        """
        # Classifier options
        self.loss = loss
        self.l2 = l2

        # Optimization parameters
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.learning_rate = learning_rate
        self.rate_decay = rate_decay
        self.verbosity = verbosity

        if self.loss in ['linear discriminant analysis', 'lda']:

            # Set to short name
            self.loss = 'lda'

        elif self.loss in ['quadratic discriminant analysis', 'qda']:

            # Set to short name
            self.loss = 'qda'

        else:
            # Other loss functions are not implemented
            raise ValueError('Model not implemented.')

        # Initialize classifier and classes parameters
        self.parameters = []
        self.classes = []

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

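For orientation, constructing the classifier looks like this (a sketch; the import assumes the class is exposed by an installed libtlda package, which the relative imports above suggest but the listing does not confirm):

# Construct a TCPR classifier; the import path is an assumption about the
# packaging of this module.
from libtlda import TargetContrastivePessimisticClassifier

clf = TargetContrastivePessimisticClassifier(loss='qda', l2=0.1,
                                             rate_decay='exponential')
print(clf.loss, clf.is_trained)   # qda False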
    def add_intercept(self, X):
        """Add a column of 1's to the data as the last feature."""
        # Data shape
        N, D = X.shape

        # Check if there's already an intercept column
        if np.any(np.sum(X, axis=0) == N):

            # Report
            print('Intercept is not the last feature. Swapping..')

            # Find which column contains the intercept
            intercept_index = np.argwhere(np.sum(X, axis=0) == N)

            # Remove existing intercept column; it is re-added as the last
            # column below
            X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Add intercept as last column
        X = np.hstack((X, np.ones((N, 1))))

        # Return augmented data and its actual new dimensionality
        return X, X.shape[1]

    def remove_intercept(self, X):
        """Remove intercept column of 1's from the data."""
        # Data shape
        N, D = X.shape

        # Find which column contains the intercept
        intercept_index = np.argwhere(np.sum(X, axis=0) == N)

        # Remove the intercept column
        X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Return reduced data and its actual dimensionality
        return X, X.shape[1]

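A minimal standalone sketch of what the intercept helper does (re-implemented here so the snippet runs without the package):

# Append a column of 1's and report the new dimensionality.
import numpy as np

def add_intercept(X):
    N, D = X.shape
    X = np.hstack((X, np.ones((N, 1))))
    return X, X.shape[1]

Xb, D = add_intercept(np.random.randn(4, 2))
print(Xb.shape, D)   # (4, 3) 3, with the last column all ones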
    def project_simplex(self, v, z=1.0):
        """
        Project vector onto simplex using sorting.

        Reference: "Efficient Projections onto the L1-Ball for Learning in
        High Dimensions" (Duchi, Shalev-Shwartz, Singer & Chandra, 2008).

        Parameters
        ----------
        v : array
            vector to be projected (n dimensions)
        z : float
            constant (def: 1.0)

        Returns
        -------
        w : array
            projected vector (n dimensions)

        """
        # Number of dimensions
        n = v.shape[0]

        # Sort vector
        mu = np.sort(v, axis=0)[::-1]

        # Find rho
        C = np.cumsum(mu) - z
        j = np.arange(n) + 1
        rho = j[mu - C/j > 0][-1]

        # Define theta
        theta = C[mu - C/j > 0][-1] / float(rho)

        # Subtract theta from original vector and cap at 0
        w = np.maximum(v - theta, 0)

        # Return projected vector
        return w

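The projection can be checked in isolation; a standalone sketch with a small vector, using the same sorting-based construction as the method above:

# Standalone sketch of the simplex projection.
import numpy as np

def project_simplex(v, z=1.0):
    mu = np.sort(v)[::-1]                       # sort in descending order
    C = np.cumsum(mu) - z
    j = np.arange(v.shape[0]) + 1
    rho = j[mu - C / j > 0][-1]
    theta = C[mu - C / j > 0][-1] / float(rho)
    return np.maximum(v - theta, 0)

w = project_simplex(np.array([0.4, 1.2, -0.3]))
print(w, w.sum())   # [0.1 0.9 0. ] 1.0 -- non-negative and sums to z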
    def learning_rate_t(self, t):
        """
        Compute current learning rate after decay.

        Parameters
        ----------
        t : int
            current iteration

        Returns
        -------
        alpha : float
            current learning rate

        """
        # Select rate decay
        if self.rate_decay == 'linear':

            # Linear dropoff between t=0 and t=T
            alpha = (self.max_iter - t)/(self.learning_rate*self.max_iter)

        elif self.rate_decay == 'quadratic':

            # Quadratic dropoff between t=0 and t=T
            alpha = ((self.max_iter - t)/(self.learning_rate*self.max_iter))**2

        elif self.rate_decay == 'geometric':

            # Drop rate off inversely to time; t+1 avoids division by zero
            # at the first iteration (t=0)
            alpha = 1 / (self.learning_rate * (t + 1))

        elif self.rate_decay == 'exponential':

            # Exponential dropoff
            alpha = np.exp(-self.learning_rate * t)

        else:
            raise ValueError('Rate decay type unknown.')

        return alpha

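To see how the four schedules differ, a quick standalone comparison (toy settings; only the shapes of the curves matter):

# Compare the four decay schedules for max_iter=500 and learning_rate=1.0.
import numpy as np

T, rate = 500, 1.0
for t in [0, 1, 100, 499]:
    linear = (T - t) / (rate * T)
    quadratic = linear ** 2
    geometric = 1 / (rate * (t + 1))   # t+1 mirrors the guard added above
    exponential = np.exp(-rate * t)
    print(t, round(linear, 3), round(quadratic, 3),
          round(geometric, 3), round(exponential, 3))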
    def risk(self, Z, theta, q):
        """
        Compute target contrastive pessimistic risk.

        Parameters
        ----------
        Z : array
            target samples (M samples by D features)
        theta : tuple(array, array, array)
            classifier parameters: class proportions 'pi', class means 'mu'
            and class covariances 'Si'
        q : array
            soft labels (M samples by K classes)

        Returns
        -------
        float
            Value of risk function.

        """
        # Number of classes
        K = q.shape[1]

        # Compute negative log-likelihood
        L = self.neg_log_likelihood(Z, theta)

        # Weight loss by soft labels
        for k in range(K):
            L[:, k] *= q[:, k]

        # Sum over weighted losses
        L = np.sum(L, axis=1)

        # Risk is average loss
        return np.mean(L, axis=0)

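The risk is simply the soft-label-weighted average loss; with toy numbers:

# Toy computation of the risk: per-class losses L weighted by soft labels q,
# summed over classes and averaged over samples.
import numpy as np

L = np.array([[0.5, 2.0],
              [1.5, 0.2]])   # losses (2 samples by 2 classes)
q = np.array([[0.8, 0.2],
              [0.1, 0.9]])   # soft labels on the simplex

print(np.mean(np.sum(L * q, axis=1)))   # 0.565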
    def neg_log_likelihood(self, X, theta):
        """
        Compute negative log-likelihood under Gaussian distributions.

        Parameters
        ----------
        X : array
            data (N samples by D features)
        theta : tuple(array, array, array)
            tuple containing class proportions 'pi', class means 'mu',
            and class-covariances 'Si'

        Returns
        -------
        L : array
            loss (N samples by K classes)

        """
        # Unpack parameters
        pi, mu, Si = theta

        # Check if parameter sets match
        if not pi.shape[1] == mu.shape[0]:
            raise ValueError('Number of proportions does not match number '
                             'of means.')

        if not mu.shape[1] == Si.shape[0] == Si.shape[1]:
            raise ValueError('''Dimensions of mean do not match dimensions of
                              covariance matrix.''')

        # Number of classes
        K = pi.shape[1]

        # Data shape
        N, D = X.shape

        # Preallocate loss array
        L = np.zeros((N, K))

        for k in range(K):

            # Check for linear or quadratic
            if self.loss == 'lda':

                try:
                    # Probability under k-th Gaussian with shared covariance
                    probs = mvn.pdf(X, mu[k, :], Si)

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            elif self.loss == 'qda':

                try:
                    # Probability under k-th Gaussian with own covariance
                    probs = mvn.pdf(X, mu[k, :], Si[:, :, k])

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            else:
                raise ValueError('Loss unknown.')

            # Negative log-likelihood
            L[:, k] = -np.log(pi[0, k]) - np.log(probs)

        return L

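A standalone sketch of the LDA branch: per-class negative log-likelihood under Gaussians with a shared covariance, using toy parameters:

# Per-class negative log-likelihood under a shared-covariance (LDA) model.
import numpy as np
from scipy.stats import multivariate_normal as mvn

X = np.random.randn(5, 2)
pi = np.array([[0.5, 0.5]])                  # class proportions
mu = np.array([[-1.0, 0.0], [1.0, 0.0]])     # class means
Si = np.eye(2)                               # shared covariance

L = np.zeros((5, 2))
for k in range(2):
    L[:, k] = -np.log(pi[0, k]) - np.log(mvn.pdf(X, mu[k, :], Si))
print(L.shape)   # (5, 2): one loss per sample per class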
    def discriminant_parameters(self, X, Y):
        """
        Estimate parameters of Gaussian distribution for discriminant analysis.

        Parameters
        ----------
        X : array
            data array (N samples by D features)
        Y : array
            label array (N samples by K classes)

        Returns
        -------
        pi : array
            class proportions (1 by K classes)
        mu : array
            class means (K classes by D features)
        Si : array
            class covariances (D features by D features by K classes)

        """
        # Number of classes
        K = Y.shape[1]

        # Check for sufficient labels
        if not K > 1:
            raise ValueError('Number of classes should be larger than 1.')

        # Data shape
        N, D = X.shape

        # Preallocate parameter arrays
        pi = np.zeros((1, K))
        mu = np.zeros((K, D))
        Si = np.zeros((D, D, K))

        # For each class
        for k in range(K):

            # Number of samples for current class
            Nk = np.sum(Y[:, k], axis=0)

            # Check for no samples assigned to certain class
            if Nk == 0:

                # Proportion of samples for current class
                pi[0, k] = 0

                # Mean of current class
                mu[k, :] = np.zeros((1, D))

                # Covariance of current class
                Si[:, :, k] = np.eye(D, D)

            else:

                # Proportion of samples for current class
                pi[0, k] = Nk / N

                # Mean of current class
                mu[k, :] = np.dot(Y[:, k].T, X) / Nk

                # Subtract mean from data
                X_ = X - mu[k, :]

                # Diagonalize current label vector
                dYk = np.diag(Y[:, k])

                # Covariance of current class
                Si[:, :, k] = np.dot(np.dot(X_.T, dYk), X_) / Nk

                # Regularization
                Si[:, :, k] = regularize_matrix(Si[:, :, k], a=self.l2)

        # Check for linear or quadratic discriminant analysis
        if self.loss == 'lda':

            # In LDA, the class-covariance matrices are combined
            Si = self.combine_class_covariances(Si, pi)

        # Check for numerical problems
        if np.any(np.isnan(mu)) or np.any(np.isnan(Si)):
            raise RuntimeError('Parameter estimate is NaN.')

        return pi, mu, Si

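These estimates are ordinary weighted moments; a standalone sketch on toy one-hot labels (a plain ridge term stands in for the package's regularize_matrix helper, whose exact behaviour is not shown in this listing):

# Weighted moment estimates for two Gaussian classes; the diagonal ridge
# term is an assumed stand-in for regularize_matrix.
import numpy as np

X = np.vstack((np.random.randn(20, 2) - 1, np.random.randn(20, 2) + 1))
Y = np.zeros((40, 2))
Y[:20, 0], Y[20:, 1] = 1, 1                        # one-hot labels

N, D = X.shape
K = Y.shape[1]
pi = np.sum(Y, axis=0, keepdims=True) / N
mu = np.dot(Y.T, X) / np.sum(Y, axis=0)[:, None]
Si = np.zeros((D, D, K))
for k in range(K):
    X_ = X - mu[k, :]
    Si[:, :, k] = X_.T @ np.diag(Y[:, k]) @ X_ / np.sum(Y[:, k])
    Si[:, :, k] += 1.0 * np.eye(D)                 # stand-in regularizer
print(pi.shape, mu.shape, Si.shape)                # (1, 2) (2, 2) (2, 2, 2)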
    def combine_class_covariances(self, Si, pi):
        """
        Linear combination of class covariance matrices.

        Parameters
        ----------
        Si : array
            class covariance matrices (D features by D features by K classes)
        pi : array
            class proportions (1 by K classes)

        Returns
        -------
        Si : array
            Combined covariance matrix (D by D)

        """
        # Number of classes
        K = Si.shape[2]

        # Check whether pi has an entry for each class
        if not pi.shape[1] == K:
            raise ValueError('''Number of proportions does not match number of
                             classes of covariance matrix.''')

        # For each class
        for k in range(K):

            # Weight each class-covariance
            Si[:, :, k] = Si[:, :, k] * pi[0, k]

        # Sum over weighted class-covariances
        return np.sum(Si, axis=2)

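The pooled LDA covariance is just the prior-weighted sum of the class covariances; with toy matrices (note that the method above scales Si in place, so callers should not reuse the input array afterwards):

# Prior-weighted pooling of class covariances, as in LDA.
import numpy as np

Si = np.stack((np.eye(2), 3 * np.eye(2)), axis=2)   # (D, D, K) = (2, 2, 2)
pi = np.array([[0.75, 0.25]])

pooled = np.sum(Si * pi[0][None, None, :], axis=2)
print(pooled)   # 0.75*I + 0.25*(3I) = 1.5*I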
    def tcpr_da(self, X, y, Z):
        """
        Target Contrastive Pessimistic Risk - discriminant analysis.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        theta : tuple(array, array, array)
            estimated classifier parameters: class proportions 'pi', class
            means 'mu' and class covariances 'Si'

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Remove intercept columns, if present
        X, DX = self.remove_intercept(X)
        Z, DZ = self.remove_intercept(Z)

        # Map labels to one-hot-encoding
        Y, classes = one_hot(y)

        # Number of classes
        K = len(classes)

        # Check for at least 2 classes
        if not K > 1:
            raise ValueError('Number of classes should be larger than 1.')

        # Estimate parameters of source model
        theta_ref = self.discriminant_parameters(X, Y)

        # Loss is negative log-likelihood under reference parameters
        L_ref = self.neg_log_likelihood(Z, theta_ref)

        # Initialize target posterior
        q = np.ones((M, K)) / K

        print('Starting TCP optimization')

        TCPRt = np.inf
        for t in range(self.max_iter):

            # Maximization phase

            # Estimate parameters using TCP risk
            theta_tcp = self.discriminant_parameters(Z, q)

            # Minimization phase

            # Compute loss under new parameters
            L_tcp = self.neg_log_likelihood(Z, theta_tcp)

            # Gradient is difference in losses
            Dq = L_tcp - L_ref

            # Update learning rate
            alpha = self.learning_rate_t(t)

            # Steepest descent step
            q -= alpha*Dq

            # Project back onto simplex
            for m in range(M):
                q[m, :] = self.project_simplex(q[m, :])

            # Monitor progress

            # Risks of current parameters
            R_tcp = self.risk(Z, theta_tcp, q)
            R_ref = self.risk(Z, theta_ref, q)

            # Assert no numerical problems
            if np.isnan(R_tcp) or np.isnan(R_ref):
                raise RuntimeError('Objective is NaN.')

            # Current TCP risk
            TCPRt_ = R_tcp - R_ref

            # Change in risk difference for this iteration
            dR = al.norm(TCPRt - TCPRt_)

            # Check for change smaller than tolerance
            if (dR < self.tolerance):
                print('Broke at iteration '+str(t)+', TCP Risk = '+str(TCPRt_))
                break

            # Report progress
            if (t % 100) == 1:
                print('Iteration ' + str(t) + '/' + str(self.max_iter) +
                      ', TCP Risk = ' + str(TCPRt_))

            # Update
            TCPRt = TCPRt_

        # Return estimated parameters
        return theta_tcp

    def fit(self, X, y, Z):
        """
        Fit/train a target contrastive pessimistic classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        if self.loss in ['lda', 'qda']:

            # Discriminant analysis model for TCPR
            self.parameters = self.tcpr_da(X, y, Z)

        else:
            # Other loss functions are not implemented
            raise ValueError('Loss function unknown.')

        # Extract and store classes
        self.classes = np.unique(y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        if self.loss in ['lda', 'qda']:

            # Compute negative log-likelihood under each class distribution
            losses = self.neg_log_likelihood(Z_, self.parameters)

            # Lowest negative log-likelihood corresponds to the most
            # probable class
            preds = np.argmin(losses, axis=1)

        else:
            # Other loss functions are not implemented
            raise ValueError('Loss function unknown.')

        # Map predictions back to original labels
        preds = self.classes[preds]

        # Return predictions array
        return preds

    def get_params(self):
        """Return classifier parameters."""
        # Check if classifier is trained
        if self.is_trained:
            return self.parameters

        else:
            # Report error softly instead of raising
            print('Classifier is not trained yet.')
            return []

    def error_rate(self, preds, u_):
        """Compute classification error rate."""
        return np.mean(preds != u_, axis=0)
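Putting it together, a usage sketch on synthetic source and target samples (the import path and the installed libtlda package are assumptions about how this module is packaged; the data and the shift are made up for illustration):

# End-to-end sketch: train on labeled source data, adapt to unlabeled target
# data, then evaluate against target labels if any are available.
import numpy as np
from libtlda import TargetContrastivePessimisticClassifier

rng = np.random.RandomState(0)
X = np.vstack((rng.randn(50, 2) - 1, rng.randn(50, 2) + 1))       # source
y = np.repeat([0, 1], 50)                                         # labels
Z = np.vstack((rng.randn(50, 2) - 0.5, rng.randn(50, 2) + 1.5))   # target

clf = TargetContrastivePessimisticClassifier(loss='lda', max_iter=100)
clf.fit(X, y, Z)
preds = clf.predict(Z)

u = np.repeat([0, 1], 50)              # target labels, if available
print(clf.error_rate(preds, u))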