Completed
Push — master (748d0e...582254)
by Wouter
created 03:55

iwe_kernel_mean_matching() (grade: B)

Complexity

Conditions 5

Size

Total Lines 65

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 18
CRAP Score 5.0729

Importance

Changes 0
Metric Value
cc 5
c 0
b 0
f 0
dl 0
loc 65
ccs 18
cts 21
cp 0.8571
crap 5.0729
rs 8.2662
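
For reference, the reported CRAP value is consistent with the standard CRAP formula, crap = cc^2 * (1 - coverage)^3 + cc. A quick sanity check in Python, assuming the ccs and cts metrics above are covered and total statements (so the coverage fraction is 18/21, reported as cp = 0.8571):

cc = 5             # cyclomatic complexity (metric 'cc' above)
cov = 18.0 / 21.0  # assumed statement coverage: ccs / cts, i.e. cp = 0.8571
crap = cc**2 * (1 - cov)**3 + cc
print(round(crap, 4))  # 5.0729, matching the reported CRAP score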

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, if a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment serving as a starting point for the new method's name.

Commonly applied refactorings include:

- Extract Method (a minimal sketch follows)
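
A minimal before/after sketch of Extract Method; the function names here are illustrative, not taken from the project below:

# Before: one method doing two things, with a comment marking the second.
def invoice_total(orders):
    total = sum(order.amount for order in orders)
    # apply bulk discount for large orders
    if len(orders) > 10:
        total *= 0.95
    return total

# After: the commented block becomes its own function, named after the comment.
def apply_bulk_discount(total, n_orders):
    return total * 0.95 if n_orders > 10 else total

def invoice_total(orders):
    total = sum(order.amount for order in orders)
    return apply_bulk_discount(total, len(orders))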

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename
from cvxopt import matrix, solvers

from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        Examples
        --------
        >>> clf = ImportanceWeightedClassifier()

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained; kept private so the attribute does
        # not shadow the is_trained() method below
        self._is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_ratio_gaussians(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check that both covariance matrices are positive-definite
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_densities(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute probabilities based on kernel density estimates per domain
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_logistic_discrimination(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor (note: scikit-learn's C is the inverse of
        # the regularization strength)
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to target using cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0],
                                  method='predict_proba')[:, 1]

        # Return predictions for source samples
        return preds[:N]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_nearest_neighbours(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distance between samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell: for each
        # target sample, find its nearest source sample and histogram over
        # the source indices
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_mean_matching(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Check for non-negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions on squared distances
            KXX = np.exp(-KXX**2 / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ**2 / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = float(N)/M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT, which solves min_w 1/2 w'Qw + p'w subject to
        # Gw <= h; kernel mean matching minimizes 1/2 w'KXX w - KXZ'w, so
        # the linear term enters with a minus sign
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1.*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N/np.sqrt(N) + N], ndmin=2),
                                   np.array([N/np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train the chosen classifier with sample weights; the loss function
        # was already validated in __init__
        self.clf.fit(X, y, sample_weight=w)

        # Mark classifier as trained
        self._is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)
        u_pred = clf.predict(Z)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self._is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss, threshold the regression outputs to labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self._is_trained
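
Returning to the report's finding: the Extract Method advice above applies directly to the flagged iwe_kernel_mean_matching(), whose body already falls into commented phases (kernel computation, QP setup, solving). A minimal sketch of one possible decomposition, written here as free functions for brevity; the helper names and the simplified sqrt(N) bounds are illustrative, not part of the library:

import numpy as np
from scipy.spatial.distance import cdist
from cvxopt import matrix, solvers


def kmm_kernels(X, Z, bandwidth=1.0):
    """Compute the source RBF kernel and the collapsed source-target term."""
    N, M = X.shape[0], Z.shape[0]
    KXX = np.exp(-cdist(X, X, 'sqeuclidean') / (2*bandwidth**2))
    KXZ = np.exp(-cdist(X, Z, 'sqeuclidean') / (2*bandwidth**2))
    return KXX, float(N)/M * np.sum(KXZ, axis=1)


def solve_kmm_qp(KXX, kappa):
    """Solve the kernel mean matching QP: min 1/2 w'Kw - kappa'w."""
    N = KXX.shape[0]
    Q = matrix(KXX, tc='d')
    p = matrix(-kappa, tc='d')
    # Constraints: |sum(w) - N| <= sqrt(N) and w >= 0
    G = matrix(np.concatenate((np.ones((1, N)), -np.ones((1, N)),
                               -np.eye(N)), axis=0), tc='d')
    h = matrix(np.concatenate((np.array([[N + np.sqrt(N)]]),
                               np.array([[np.sqrt(N) - N]]),
                               np.zeros((N, 1))), axis=0), tc='d')
    return np.array(solvers.qp(Q, p, G, h)['x'])[:, 0]


def iwe_kernel_mean_matching(X, Z, bandwidth=1.0):
    """Kernel mean matching split into two short, nameable steps."""
    if X.shape[1] != Z.shape[1]:
        raise ValueError('Dimensionalities of X and Z should be equal.')
    KXX, kappa = kmm_kernels(X, Z, bandwidth)
    return solve_kmm_qp(KXX, kappa)

Each helper stays well under the 65-line size flagged above, can be tested in isolation, and gives the comments in the original body a place to become names.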