Completed
Push — master ( f3d068...88fa67 )
by Wouter
03:44
created

transfer_component_analysis()   B

Complexity

Conditions 4

Size

Total Lines 65

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 4.7999

Importance

Changes 0
Metric Value
cc 4
c 0
b 0
f 0
dl 0
loc 65
ccs 12
cts 19
cp 0.6316
crap 4.7999
rs 8.8507

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4 1
import numpy as np
5 1
import scipy.stats as st
6 1
from scipy.sparse.linalg import eigs
7 1
from scipy.spatial.distance import cdist
8 1
import sklearn as sk
9 1
from sklearn.svm import LinearSVC
10 1
from sklearn.linear_model import LogisticRegression, LinearRegression
11 1
from sklearn.model_selection import cross_val_predict
12 1
from os.path import basename
13
14 1
from .util import is_pos_def
15
16
17 1
class TransferComponentClassifier(object):
18
    """
19
    Class of classifiers based on Transfer Component Analysis.
20
21
    Methods contain component analysis and general utilities.
22
    """
23
24 1
    def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
25
                 kernel_type='rbf', bandwidth=1.0, order=2.0):
26
        """
27
        Select a particular type of transfer component classifier.
28
29
        Parameters
30
        ----------
31
        loss : str
32
            loss function for weighted classifier, options: 'logistic',
33
            'quadratic', 'hinge' (def: 'logistic')
34
        l2 : float
35
            l2-regularization parameter value (def:0.01)
36
        mu : float
37
            trade-off parameter (def: 1.0)
38
        num_components : int
39
            number of transfer components to maintain (def: 1)
40
        kernel_type : str
41
            type of kernel to use, options: 'rbf' (def: 'rbf')
42
        bandwidth : float
43
            kernel bandwidth for transfer component analysis (def: 1.0)
44
        order : float
45
            order of polynomial for kernel (def: 2.0)
46
47
        Returns
48
        -------
49
        None
50
51
        Attributes
52
        ----------
53
        loss
54
            which loss function to use
55
        is_trained
56
            whether the classifier has been trained on data already
57
58
        """
59 1
        self.loss = loss
60 1
        self.l2 = l2
61 1
        self.mu = mu
62 1
        self.num_components = num_components
63
64 1
        self.kernel_type = kernel_type
65 1
        self.bandwidth = bandwidth
66 1
        self.order = order
67
68
        # Initialize untrained classifiers
69 1
        if self.loss == 'logistic':
70
            # Logistic regression model
71 1
            self.clf = LogisticRegression()
72
        elif self.loss == 'quadratic':
73
            # Least-squares model
74
            self.clf = LinearRegression()
75
        elif self.loss == 'hinge':
76
            # Linear support vector machine
77
            self.clf = LinearSVC()
78
        else:
79
            # Other loss functions are not implemented
80
            raise NotImplementedError
81
82
        # Maintain source and transfer data for computing kernels
83 1
        self.XZ = ''
84
85
        # Maintain transfer components
86 1
        self.C = ''
87
88
        # Whether model has been trained
89 1
        self.is_trained = False
90
91
        # Dimensionality of training data
92 1
        self.train_data_dim = ''
93
94 1
    def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
95
        """
96
        Compute kernel for given data set.
97
98
        Parameters
99
        ----------
100
        X : array
101
            data set (N samples by D features)
102
        Z : array
103
            data set (M samples by D features)
104
        type : str
105
            type of kernel, options: 'linear', 'polynomial', 'rbf',
106
            'sigmoid' (def: 'linear')
107
        order : float
108
            degree for the polynomial kernel (def: 2.0)
109
        bandwidth : float
110
            kernel bandwidth (def: 1.0)
111
112
        Returns
113
        -------
114
        array
115
            kernel matrix (N+M by N+M)
116
117
        """
118
        # Data shapes
119 1
        N, DX = X.shape
120 1
        M, DZ = Z.shape
121
122
        # Assert equivalent dimensionalities
123 1
        if not DX == DZ:
124
            raise ValueError('Dimensionalities of X and Z should be equal.')
125
126
        # Select type of kernel to compute
127 1
        if type == 'linear':
128
            # Linear kernel is data outer product
129
            return np.dot(X, Z.T)
130 1
        elif type == 'polynomial':
131
            # Polynomial kernel is an exponentiated data outer product
132
            return (np.dot(X, Z.T) + 1)**p
133 1
        elif type == 'rbf':
134
            # Radial basis function kernel
135 1
            return np.exp(-cdist(X, Z) / (2.*bandwidth**2))
136
        elif type == 'sigmoid':
137
            # Sigmoidal kernel
138
            return 1./(1 + np.exp(np.dot(X, Z.T)))
139
        else:
140
            raise NotImplementedError('Loss not implemented yet.')
141
142 1
    def transfer_component_analysis(self, X, Z):
143
        """
144
        Transfer Component Analysis.
145
146
        Parameters
147
        ----------
148
        X : array
149
            source data set (N samples by D features)
150
        Z : array
151
            target data set (M samples by D features)
152
153
        Returns
154
        -------
155
        C : array
156
            transfer components (D features by num_components)
157
        K : array
158
            source and target data kernel distances
159
160
        """
161
        # Data shapes
162 1
        N, DX = X.shape
163 1
        M, DZ = Z.shape
164
165
        # Assert equivalent dimensionalities
166 1
        if not DX == DZ:
167
            raise ValueError('Dimensionalities of X and Z should be equal.')
168
169
        # Compute kernel matrix
170 1
        XZ = np.concatenate((X, Z), axis=0)
171 1
        K = self.kernel(XZ, XZ, type=self.kernel_type,
172
                        bandwidth=self.bandwidth)
173
174
        # Ensure positive-definiteness
175 1
        if not is_pos_def(K):
176
            print('Warning: covariate matrices not PSD.')
177
178
            regct = -6
179
            while not is_pos_def(K):
180
                print('Adding regularization: ' + str(10**regct))
181
182
                # Add regularization
183
                K += np.eye(N + M)*10.**regct
184
185
                # Increment regularization counter
186
                regct += 1
187
188
        # Normalization matrix
189 1
        L = np.vstack((np.hstack((np.ones((N, N))/N**2,
190
                                  -1*np.ones((N, M))/(N*M))),
191
                       np.hstack((-1*np.ones((M, N))/(N*M),
192
                                  np.ones((M, M))/M**2))))
193
194
        # Centering matrix
195 1
        H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)
196
197
        # Matrix Lagrangian objective function: (I + mu*K*L*K)^{-1}*K*H*K
198 1
        J = np.dot(np.linalg.inv(np.eye(N + M) +
199
                   self.mu*np.dot(np.dot(K, L), K)),
200
                   np.dot(np.dot(K, H), K))
201
202
        # Eigenvector decomposition as solution to trace minimization
203 1
        _, C = eigs(J, k=self.num_components)
204
205
        # Discard imaginary numbers (possible computation issue)
206 1
        return np.real(C), K
207
208 1
    def fit(self, X, y, Z):
209
        """
210
        Fit/train a classifier on data mapped onto transfer components.
211
212
        Parameters
213
        ----------
214
        X : array
215
            source data (N samples by D features)
216
        y : array
217
            source labels (N samples by 1)
218
        Z : array
219
            target data (M samples by D features)
220
221
        Returns
222
        -------
223
        None
224
225
        """
226
        # Data shapes
227 1
        N, DX = X.shape
228 1
        M, DZ = Z.shape
229
230
        # Assert equivalent dimensionalities
231 1
        if not DX == DZ:
232
            raise ValueError('Dimensionalities of X and Z should be equal.')
233
234
        # Assert correct number of components for given dataset
235 1
        if not self.num_components <= N + M - 1:
236
            raise ValueError('''Number of components must be smaller than or
237
                             equal to the source sample size plus target sample
238
                             size plus 1.''')
239
240
        # Maintain source and target data for later kernel computations
241 1
        self.XZ = np.concatenate((X, Z), axis=0)
242
243
        # Transfer component analysis
244 1
        self.C, K = self.transfer_component_analysis(X, Z)
245
246
        # Map source data onto transfer components
247 1
        X = np.dot(K[:N, :], self.C)
248
249
        # Train a weighted classifier
250 1
        if self.loss == 'logistic':
251
            # Logistic regression model with sample weights
252 1
            self.clf.fit(X, y)
253
        elif self.loss == 'quadratic':
254
            # Least-squares model with sample weights
255
            self.clf.fit(X, y)
256
        elif self.loss == 'hinge':
257
            # Linear support vector machine with sample weights
258
            self.clf.fit(X, y)
259
        else:
260
            # Other loss functions are not implemented
261
            raise NotImplementedError('Loss not implemented yet.')
262
263
        # Mark classifier as trained
264 1
        self.is_trained = True
265
266
        # Store training data dimensionality
267 1
        self.train_data_dim = DX
268
269 1
    def predict(self, Z):
270
        """
271
        Make predictions on new dataset.
272
273
        Parameters
274
        ----------
275
        Z : array
276
            new data set (M samples by D features)
277
278
        Returns
279
        -------
280
        preds : array
281
            label predictions (M samples by 1)
282
283
        """
284
        # Data shape
285 1
        M, D = Z.shape
286
287
        # If classifier is trained, check for same dimensionality
288 1
        if self.is_trained:
289 1
            if not self.train_data_dim == D:
290
                raise ValueError('''Test data is of different dimensionality
291
                                 than training data.''')
292
293
        # Compute kernel for new data
294 1
        K = self.kernel(Z, self.XZ, type=self.kernel_type,
295
                        bandwidth=self.bandwidth, order=self.order)
296
297
        # Map new data onto transfer components
298 1
        Z = np.dot(K, self.C)
299
300
        # Call scikit's predict function
301 1
        preds = self.clf.predict(Z)
302
303
        # For quadratic loss function, correct predictions
304 1
        if self.loss == 'quadratic':
305
            preds = (np.sign(preds)+1)/2.
306
307
        # Return predictions array
308 1
        return preds
309
310 1
    def get_params(self):
311
        """Get classifier parameters."""
312
        return self.clf.get_params()
313
314 1
    def is_trained(self):
315
        """Check whether classifier is trained."""
316
        return self.is_trained
317