TransferComponentClassifier.transfer_component_analysis() - Code Metrics - Inspection of "Added sphinx and readthedocs." - wmkouw/libTLDA - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f3d068...88fa67 )

by Wouter

created 2018-06-17 15:54 UTC

transfer_component_analysis() B

↳ Parent: TransferComponentClassifier

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	12
CRAP Score	4.7999

Importance

Changes

Metric	Value
cc	4
c	0
b	0
f	0
dl	0
loc	65
ccs	12
cts	19
cp	0.6316
crap	4.7999
rs	8.8507

How to fix Long Method

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.sparse.linalg import eigs
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def


class TransferComponentClassifier(object):
    """
    Class of classifiers based on Transfer Component Analysis.

    Source and target data are embedded with a kernel, transfer components
    are extracted from the joint kernel matrix, and a scikit-learn linear
    model is trained on the source data mapped onto those components.

    Methods contain component analysis and general utilities.
    """

    def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
                 kernel_type='rbf', bandwidth=1.0, order=2.0):
        """
        Select a particular type of transfer component classifier.

        Parameters
        ----------
        loss : str
            loss function for the classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0). NOTE: the value is
            stored on the instance but is not forwarded to the underlying
            scikit-learn estimator, which is built with its own defaults.
        mu : float
            trade-off parameter (def: 1.0)
        num_components : int
            number of transfer components to maintain (def: 1)
        kernel_type : str
            type of kernel to use, options: 'linear', 'polynomial', 'rbf',
            'sigmoid' (def: 'rbf')
        bandwidth : float
            kernel bandwidth for transfer component analysis (def: 1.0)
        order : float
            order of polynomial for kernel (def: 2.0)

        Returns
        -------
        None

        Raises
        ------
        NotImplementedError
            if `loss` is not one of the supported options

        Attributes
        ----------
        loss
            which loss function to use
        clf
            untrained scikit-learn estimator matching `loss`
        XZ
            stacked source and target data, None until `fit` is called
        C
            transfer components, None until `fit` is called
        is_trained
            whether the classifier has been trained on data already
        train_data_dim
            dimensionality of the training data, None until `fit` is called

        """
        self.loss = loss
        self.l2 = l2
        self.mu = mu
        self.num_components = num_components

        self.kernel_type = kernel_type
        self.bandwidth = bandwidth
        self.order = order

        # Initialize untrained classifiers
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss not implemented yet.')

        # Maintain source and target data for computing kernels
        self.XZ = None

        # Maintain transfer components
        self.C = None

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

    def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
        """
        Compute the kernel matrix between two data sets.

        Parameters
        ----------
        X : array
            data set (N samples by D features)
        Z : array
            data set (M samples by D features)
        type : str
            type of kernel, options: 'linear', 'polynomial', 'rbf',
            'sigmoid' (def: 'rbf')
        order : float
            degree for the polynomial kernel (def: 2)
        bandwidth : float
            kernel bandwidth, only used by the 'rbf' kernel (def: 1.0)

        Returns
        -------
        array
            kernel matrix (N by M)

        Raises
        ------
        ValueError
            if X and Z do not have the same number of features
        NotImplementedError
            if `type` is not one of the supported kernels

        """
        # Data shapes (unpacking also enforces that both inputs are 2-D)
        _, DX = X.shape
        _, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Select type of kernel to compute
        if type == 'linear':
            # Linear kernel is the matrix of inner products
            return np.dot(X, Z.T)
        elif type == 'polynomial':
            # Polynomial kernel is an exponentiated, shifted inner product
            return (np.dot(X, Z.T) + 1)**order
        elif type == 'rbf':
            # Radial basis function kernel.
            # NOTE(review): cdist returns the (unsquared) Euclidean distance,
            # so this is exp(-||x - z|| / (2*bandwidth^2)); the textbook
            # Gaussian RBF squares the distance. Kept as is to preserve
            # existing results - confirm which form is intended.
            return np.exp(-cdist(X, Z) / (2.*bandwidth**2))
        elif type == 'sigmoid':
            # Sigmoidal kernel.
            # NOTE(review): this evaluates 1 / (1 + exp(<x, z>)); confirm
            # the sign of the exponent is intended.
            return 1./(1 + np.exp(np.dot(X, Z.T)))
        else:
            raise NotImplementedError('Kernel type not implemented yet.')

    def transfer_component_analysis(self, X, Z):
        """
        Transfer Component Analysis.

        Parameters
        ----------
        X : array
            source data set (N samples by D features)
        Z : array
            target data set (M samples by D features)

        Returns
        -------
        C : array
            transfer components (N+M by num_components), real part of the
            eigenvectors returned by the sparse eigensolver
        K : array
            kernel matrix of the stacked source and target data
            (N+M by N+M), including any regularization that was added

        Raises
        ------
        ValueError
            if X and Z do not have the same number of features

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute kernel matrix with the same settings that predict() uses
        XZ = np.concatenate((X, Z), axis=0)
        K = self.kernel(XZ, XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Ensure positive-definiteness
        if not is_pos_def(K):
            print('Warning: covariate matrices not PSD.')

            identity = np.eye(N + M)
            regct = -6
            while not is_pos_def(K):
                print('Adding regularization: ' + str(10**regct))

                # Add regularization (out-of-place, so an integer-valued
                # kernel matrix is promoted to float instead of failing)
                K = K + identity*10.**regct

                # Increment regularization counter
                regct += 1

        # Normalization matrix: outer product of the vector holding 1/N for
        # source samples and -1/M for target samples, which yields the
        # blocks 1/N^2, -1/(N*M) and 1/M^2
        e = np.concatenate((np.ones(N) / float(N), -np.ones(M) / float(M)))
        L = np.outer(e, e)

        # Centering matrix
        H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)

        # Matrix Lagrangian objective function: (I + mu*K*L*K)^{-1}*K*H*K,
        # obtained by solving the linear system rather than forming the
        # explicit inverse
        A = np.eye(N + M) + self.mu*np.dot(np.dot(K, L), K)
        J = np.linalg.solve(A, np.dot(np.dot(K, H), K))

        # Eigenvector decomposition as solution to trace minimization
        _, C = eigs(J, k=self.num_components)

        # Discard imaginary numbers (possible computation issue)
        return np.real(C), K

    def fit(self, X, y, Z):
        """
        Fit/train a classifier on data mapped onto transfer components.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            if X and Z differ in dimensionality, or if `num_components`
            exceeds N + M - 1
        NotImplementedError
            if `loss` was changed to an unsupported value after construction

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Assert correct number of components for given dataset
        if not self.num_components <= N + M - 1:
            raise ValueError('Number of components must be smaller than or '
                             'equal to the source sample size plus target '
                             'sample size minus 1.')

        # Guard against a loss that was altered after construction
        if self.loss not in ('logistic', 'quadratic', 'hinge'):
            raise NotImplementedError('Loss not implemented yet.')

        # Maintain source and target data for later kernel computations
        self.XZ = np.concatenate((X, Z), axis=0)

        # Transfer component analysis
        self.C, K = self.transfer_component_analysis(X, Z)

        # Map source data onto transfer components (first N rows of the
        # joint kernel matrix belong to the source samples)
        X = np.dot(K[:N, :], self.C)

        # Train the classifier; every supported loss uses the same call
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        Raises
        ------
        sklearn.exceptions.NotFittedError
            if the classifier has not been trained yet (subclass of both
            ValueError and AttributeError)
        ValueError
            if Z has a different dimensionality than the training data

        """
        # Data shape
        M, D = Z.shape

        # Without stored training data and components nothing can be mapped
        if not self.is_trained:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('Classifier has not been trained yet; '
                                 'call fit before predict.')

        # Check for same dimensionality as the training data
        if not self.train_data_dim == D:
            raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Compute kernel for new data
        K = self.kernel(Z, self.XZ, type=self.kernel_type,
                        bandwidth=self.bandwidth, order=self.order)

        # Map new data onto transfer components
        Z = np.dot(K, self.C)

        # Call scikit's predict function
        preds = self.clf.predict(Z)

        # For quadratic loss function, threshold the regression output at
        # zero and rescale the sign from {-1, 0, 1} to {0, 0.5, 1}.
        # NOTE(review): presumably labels are expected in {-1, +1} during
        # training for this loss - confirm against callers.
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get the parameters of the underlying scikit-learn estimator."""
        return self.clf.get_params()

    # NOTE: the training state is exposed through the `is_trained` instance
    # attribute set in __init__ and fit. A method with the same name cannot
    # coexist with it: the attribute shadows the method on every instance,
    # so calling it would fail with "'bool' object is not callable".


1		#!/usr/bin/env python
2		# -*- coding: utf-8 -*-
3
4	1	import numpy as np
5	1	import scipy.stats as st
6	1	from scipy.sparse.linalg import eigs
7	1	from scipy.spatial.distance import cdist
8	1	import sklearn as sk
9	1	from sklearn.svm import LinearSVC
10	1	from sklearn.linear_model import LogisticRegression, LinearRegression
11	1	from sklearn.model_selection import cross_val_predict
12	1	from os.path import basename
13
14	1	from .util import is_pos_def
15
16
17	1	class TransferComponentClassifier(object):
18		"""
19		Class of classifiers based on Transfer Component Analysis.
20
21		Methods contain component analysis and general utilities.
22		"""
23
24	1	def __init__(self, loss='logistic', l2=1.0, mu=1.0, num_components=1,
25		kernel_type='rbf', bandwidth=1.0, order=2.0):
26		"""
27		Select a particular type of transfer component classifier.
28
29		Parameters
30		----------
31		loss : str
32		loss function for weighted classifier, options: 'logistic',
33		'quadratic', 'hinge' (def: 'logistic')
34		l2 : float
35		l2-regularization parameter value (def:0.01)
36		mu : float
37		trade-off parameter (def: 1.0)
38		num_components : int
39		number of transfer components to maintain (def: 1)
40		kernel_type : str
41		type of kernel to use, options: 'rbf' (def: 'rbf')
42		bandwidth : float
43		kernel bandwidth for transfer component analysis (def: 1.0)
44		order : float
45		order of polynomial for kernel (def: 2.0)
46
47		Returns
48		-------
49		None
50
51		Attributes
52		----------
53		loss
54		which loss function to use
55		is_trained
56		whether the classifier has been trained on data already
57
58		"""
59	1	self.loss = loss
60	1	self.l2 = l2
61	1	self.mu = mu
62	1	self.num_components = num_components
63
64	1	self.kernel_type = kernel_type
65	1	self.bandwidth = bandwidth
66	1	self.order = order
67
68		# Initialize untrained classifiers
69	1	if self.loss == 'logistic':
70		# Logistic regression model
71	1	self.clf = LogisticRegression()
72		elif self.loss == 'quadratic':
73		# Least-squares model
74		self.clf = LinearRegression()
75		elif self.loss == 'hinge':
76		# Linear support vector machine
77		self.clf = LinearSVC()
78		else:
79		# Other loss functions are not implemented
80		raise NotImplementedError
81
82		# Maintain source and transfer data for computing kernels
83	1	self.XZ = ''
84
85		# Maintain transfer components
86	1	self.C = ''
87
88		# Whether model has been trained
89	1	self.is_trained = False
90
91		# Dimensionality of training data
92	1	self.train_data_dim = ''
93
94	1	def kernel(self, X, Z, type='rbf', order=2, bandwidth=1.0):
95		"""
96		Compute kernel for given data set.
97
98		Parameters
99		----------
100		X : array
101		data set (N samples by D features)
102		Z : array
103		data set (M samples by D features)
104		type : str
105		type of kernel, options: 'linear', 'polynomial', 'rbf',
106		'sigmoid' (def: 'linear')
107		order : float
108		degree for the polynomial kernel (def: 2.0)
109		bandwidth : float
110		kernel bandwidth (def: 1.0)
111
112		Returns
113		-------
114		array
115		kernel matrix (N+M by N+M)
116
117		"""
118		# Data shapes
119	1	N, DX = X.shape
120	1	M, DZ = Z.shape
121
122		# Assert equivalent dimensionalities
123	1	if not DX == DZ:
124		raise ValueError('Dimensionalities of X and Z should be equal.')
125
126		# Select type of kernel to compute
127	1	if type == 'linear':
128		# Linear kernel is data outer product
129		return np.dot(X, Z.T)
130	1	elif type == 'polynomial':
131		# Polynomial kernel is an exponentiated data outer product
132		return (np.dot(X, Z.T) + 1)**p
133	1	elif type == 'rbf':
134		# Radial basis function kernel
135	1	return np.exp(-cdist(X, Z) / (2.*bandwidth**2))
136		elif type == 'sigmoid':
137		# Sigmoidal kernel
138		return 1./(1 + np.exp(np.dot(X, Z.T)))
139		else:
140		raise NotImplementedError('Loss not implemented yet.')
141
142	1	def transfer_component_analysis(self, X, Z):
143		"""
144		Transfer Component Analysis.
145
146		Parameters
147		----------
148		X : array
149		source data set (N samples by D features)
150		Z : array
151		target data set (M samples by D features)
152
153		Returns
154		-------
155		C : array
156		transfer components (D features by num_components)
157		K : array
158		source and target data kernel distances
159
160		"""
161		# Data shapes
162	1	N, DX = X.shape
163	1	M, DZ = Z.shape
164
165		# Assert equivalent dimensionalities
166	1	if not DX == DZ:
167		raise ValueError('Dimensionalities of X and Z should be equal.')
168
169		# Compute kernel matrix
170	1	XZ = np.concatenate((X, Z), axis=0)
171	1	K = self.kernel(XZ, XZ, type=self.kernel_type,
172		bandwidth=self.bandwidth)
173
174		# Ensure positive-definiteness
175	1	if not is_pos_def(K):
176		print('Warning: covariate matrices not PSD.')
177
178		regct = -6
179		while not is_pos_def(K):
180		print('Adding regularization: ' + str(10**regct))
181
182		# Add regularization
183		K += np.eye(N + M)*10.**regct
184
185		# Increment regularization counter
186		regct += 1
187
188		# Normalization matrix
189	1	L = np.vstack((np.hstack((np.ones((N, N))/N**2,
190		-1*np.ones((N, M))/(N*M))),
191		np.hstack((-1*np.ones((M, N))/(N*M),
192		np.ones((M, M))/M**2))))
193
194		# Centering matrix
195	1	H = np.eye(N + M) - np.ones((N + M, N + M)) / float(N + M)
196
197		# Matrix Lagrangian objective function: (I + mu*K*L*K)^{-1}*K*H*K
198	1	J = np.dot(np.linalg.inv(np.eye(N + M) +
199		self.mu*np.dot(np.dot(K, L), K)),
200		np.dot(np.dot(K, H), K))
201
202		# Eigenvector decomposition as solution to trace minimization
203	1	_, C = eigs(J, k=self.num_components)
204
205		# Discard imaginary numbers (possible computation issue)
206	1	return np.real(C), K
207
208	1	def fit(self, X, y, Z):
209		"""
210		Fit/train a classifier on data mapped onto transfer components.
211
212		Parameters
213		----------
214		X : array
215		source data (N samples by D features)
216		y : array
217		source labels (N samples by 1)
218		Z : array
219		target data (M samples by D features)
220
221		Returns
222		-------
223		None
224
225		"""
226		# Data shapes
227	1	N, DX = X.shape
228	1	M, DZ = Z.shape
229
230		# Assert equivalent dimensionalities
231	1	if not DX == DZ:
232		raise ValueError('Dimensionalities of X and Z should be equal.')
233
234		# Assert correct number of components for given dataset
235	1	if not self.num_components <= N + M - 1:
236		raise ValueError('''Number of components must be smaller than or
237		equal to the source sample size plus target sample
238		size plus 1.''')
239
240		# Maintain source and target data for later kernel computations
241	1	self.XZ = np.concatenate((X, Z), axis=0)
242
243		# Transfer component analysis
244	1	self.C, K = self.transfer_component_analysis(X, Z)
245
246		# Map source data onto transfer components
247	1	X = np.dot(K[:N, :], self.C)
248
249		# Train a weighted classifier
250	1	if self.loss == 'logistic':
251		# Logistic regression model with sample weights
252	1	self.clf.fit(X, y)
253		elif self.loss == 'quadratic':
254		# Least-squares model with sample weights
255		self.clf.fit(X, y)
256		elif self.loss == 'hinge':
257		# Linear support vector machine with sample weights
258		self.clf.fit(X, y)
259		else:
260		# Other loss functions are not implemented
261		raise NotImplementedError('Loss not implemented yet.')
262
263		# Mark classifier as trained
264	1	self.is_trained = True
265
266		# Store training data dimensionality
267	1	self.train_data_dim = DX
268
269	1	def predict(self, Z):
270		"""
271		Make predictions on new dataset.
272
273		Parameters
274		----------
275		Z : array
276		new data set (M samples by D features)
277
278		Returns
279		-------
280		preds : array
281		label predictions (M samples by 1)
282
283		"""
284		# Data shape
285	1	M, D = Z.shape
286
287		# If classifier is trained, check for same dimensionality
288	1	if self.is_trained:
289	1	if not self.train_data_dim == D:
290		raise ValueError('''Test data is of different dimensionality
291		than training data.''')
292
293		# Compute kernel for new data
294	1	K = self.kernel(Z, self.XZ, type=self.kernel_type,
295		bandwidth=self.bandwidth, order=self.order)
296
297		# Map new data onto transfer components
298	1	Z = np.dot(K, self.C)
299
300		# Call scikit's predict function
301	1	preds = self.clf.predict(Z)
302
303		# For quadratic loss function, correct predictions
304	1	if self.loss == 'quadratic':
305		preds = (np.sign(preds)+1)/2.
306
307		# Return predictions array
308	1	return preds
309
310	1	def get_params(self):
311		"""Get classifier parameters."""
312		return self.clf.get_params()
313
314	1	def is_trained(self):
315		"""Check whether classifier is trained."""
316		return self.is_trained
317

wmkouw / libTLDA

Push — master ( f3d068...88fa67 )

transfer_component_analysis() B

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like