Inspection completed — push to master (f3d068...88fa67) by Wouter

TransferComponentClassifier.kernel() — rating B

Complexity
    Conditions: 6

Size
    Total lines: 47

Duplication
    Duplicated lines: 0 (ratio: 0 %)

Code Coverage
    Tests: 8
    CRAP score: 8.8343

Importance
    Changes: 0

Metric                        Value
cc   (cyclomatic complexity)  6
c                             0
b                             0
f                             0
dl   (duplicated lines)       0
loc  (lines of code)          47
ccs  (covered statements)     8
cts  (total statements)       14
cp   (coverage, ccs/cts)      0.5714
crap (CRAP score)             8.8343
rs                            7.6129
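
The CRAP score above is consistent with the standard CRAP formula applied to
the metrics in this table, CRAP(m) = comp(m)^2 * (1 - cov(m))^3 + comp(m),
with comp = cc = 6 and cov = cp = 0.5714. A minimal check in Python (the
formula is an assumption; the report does not state which variant its tooling
uses):

def crap_score(complexity, coverage):
    # Change Risk Anti-Patterns score for a single method
    return complexity ** 2 * (1.0 - coverage) ** 3 + complexity

print(crap_score(6, 0.5714))  # ~8.834, consistent with the reported 8.8343
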
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from scipy.sparse.linalg import eigs
from scipy.spatial.distance import cdist
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression

from .util import is_pos_def


class TransferComponentClassifier(object):
    """
    Class of classifiers based on Transfer Component Analysis.

    Methods include the transfer component analysis and general utilities.
    """

    def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
                 kernel_type='rbf', bandwidth=1.0, order=2.0):
        """
        Select a particular type of transfer component classifier.

        Parameters
        ----------
        loss : str
            loss function for the classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        mu : float
            trade-off parameter (def: 1.0)
        num_components : int
            number of transfer components to maintain (def: 1)
        kernel_type : str
            type of kernel to use, options: 'linear', 'polynomial', 'rbf',
            'sigmoid' (def: 'rbf')
        bandwidth : float
            kernel bandwidth for transfer component analysis (def: 1.0)
        order : float
            order of polynomial for kernel (def: 2.0)

        Returns
        -------
        None

        Attributes
        ----------
        loss
            which loss function to use
        is_trained
            whether the classifier has been trained on data already

        """
        self.loss = loss
        self.l2 = l2
        self.mu = mu
        self.num_components = num_components

        self.kernel_type = kernel_type
        self.bandwidth = bandwidth
        self.order = order

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Maintain source and target data for computing kernels
        self.XZ = None

        # Maintain transfer components
        self.C = None

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

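    # For instance (hypothetical values, purely illustrative):
    # TransferComponentClassifier(loss='hinge', mu=0.5, num_components=5,
    # bandwidth=2.0) wraps a LinearSVC trained on a 5-dimensional
    # transfer-component mapping of the data.
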
    def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
        """
        Compute kernel for given data sets.

        Parameters
        ----------
        X : array
            data set (N samples by D features)
        Z : array
            data set (M samples by D features)
        type : str
            type of kernel, options: 'linear', 'polynomial', 'rbf',
            'sigmoid' (def: 'rbf')
        order : float
            degree for the polynomial kernel (def: 2.0)
        bandwidth : float
            kernel bandwidth (def: 1.0)

        Returns
        -------
        array
            kernel matrix (N by M)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Select type of kernel to compute
        if type == 'linear':
            # Linear kernel is the matrix of data inner products
            return np.dot(X, Z.T)
        elif type == 'polynomial':
            # Polynomial kernel is an exponentiated data inner product
            return (np.dot(X, Z.T) + 1)**order
        elif type == 'rbf':
            # Radial basis function kernel on squared Euclidean distances
            return np.exp(-cdist(X, Z, 'sqeuclidean') / (2.*bandwidth**2))
        elif type == 'sigmoid':
            # Logistic sigmoid of the data inner products
            return 1./(1 + np.exp(-np.dot(X, Z.T)))
        else:
            raise NotImplementedError('Kernel type not implemented.')
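
    # As a shape sketch (hypothetical inputs, not part of the class): for X
    # of shape (10, 3) and Z of shape (5, 3), kernel(X, Z, type='rbf',
    # bandwidth=1.0) returns a (10, 5) matrix with entries
    # exp(-||x_i - z_j||**2 / (2*bandwidth**2)). fit() calls this with the
    # stacked data XZ on both sides, yielding an (N+M, N+M) Gram matrix.
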
    def transfer_component_analysis(self, X, Z):
        """
        Transfer Component Analysis.

        Parameters
        ----------
        X : array
            source data set (N samples by D features)
        Z : array
            target data set (M samples by D features)

        Returns
        -------
        C : array
            transfer components (N+M samples by num_components)
        K : array
            kernel matrix of the stacked source and target data
            (N+M by N+M)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute kernel matrix of the stacked source and target data
        XZ = np.concatenate((X, Z), axis=0)
        K = self.kernel(XZ, XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Ensure positive-definiteness
        if not is_pos_def(K):
            print('Warning: kernel matrix is not positive definite.')

            regct = -6
            while not is_pos_def(K):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization to the diagonal
                K += np.eye(N + M)*10.**regct

                # Increment regularization counter
                regct += 1

        # Normalization matrix (MMD coefficients)
        L = np.vstack((np.hstack((np.ones((N, N))/N**2,
                                  -1*np.ones((N, M))/(N*M))),
                       np.hstack((-1*np.ones((M, N))/(N*M),
                                  np.ones((M, M))/M**2))))

        # Centering matrix
        H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)

        # Matrix from the Lagrangian objective: (I + mu*K*L*K)^{-1}*K*H*K
        J = np.dot(np.linalg.inv(np.eye(N + M) +
                   self.mu*np.dot(np.dot(K, L), K)),
                   np.dot(np.dot(K, H), K))

        # Eigenvector decomposition as solution to trace minimization
        _, C = eigs(J, k=self.num_components)

        # Discard imaginary parts (possible numerical artifacts)
        return np.real(C), K
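
    # The closed form above follows the TCA optimization of Pan et al.
    # (2011), "Domain Adaptation via Transfer Component Analysis": minimize
    # tr(C.T K L K C) + mu*tr(C.T C) subject to C.T K H K C = I, whose
    # solution consists of the leading eigenvectors of the matrix J (up to
    # where the trade-off parameter mu is placed).
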
    def fit(self, X, y, Z):
        """
        Fit/train a classifier on data mapped onto transfer components.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Assert correct number of components for given dataset
        if not self.num_components <= N + M - 1:
            raise ValueError('''Number of components must be at most the
                             combined source and target sample size minus
                             1.''')

        # Maintain source and target data for later kernel computations
        self.XZ = np.concatenate((X, Z), axis=0)

        # Transfer component analysis
        self.C, K = self.transfer_component_analysis(X, Z)

        # Map source data onto transfer components
        X = np.dot(K[:N, :], self.C)

        # Train the classifier chosen at initialization on the mapped data
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Compute kernel between new data and stored training data
        K = self.kernel(Z, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Map new data onto transfer components
        Z = np.dot(K, self.C)

        # Call scikit-learn's predict function
        preds = self.clf.predict(Z)

        # For quadratic loss, threshold regression outputs to {0, 1}
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get parameters of the underlying classifier."""
        return self.clf.get_params()
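
For context, a minimal usage sketch with synthetic data. The import path is
hypothetical; the module must live in a package that also provides
util.is_pos_def, which it imports relatively:

import numpy as np
from libtlda.tca import TransferComponentClassifier  # hypothetical path

# Synthetic source data with binary labels, plus unlabeled target data
rng = np.random.RandomState(42)
X = rng.randn(50, 3)                 # source: N=50 samples, D=3 features
y = (X[:, 0] > 0).astype(int)        # source labels
Z = rng.randn(40, 3) + 0.5           # target: M=40 samples, shifted

# Fit on source and target jointly, then predict target labels
clf = TransferComponentClassifier(loss='logistic', num_components=2)
clf.fit(X, y, Z)
preds = clf.predict(Z)
print(preds.shape)                   # (40,)

Note that predict recomputes the kernel between the new samples and the
stored training data (self.XZ), so fit must be called before predict.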