| Metric | Value |
| --- | --- |
| Total Complexity | 47 |
| Total Lines | 467 |
| Duplicated Lines | 13.7% |
| Coverage | 68.38% |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places; each common duplication problem has a corresponding refactoring that resolves it.

Complex classes like ImportanceWeightedClassifier often do many different things. To break such a class down, we need to identify a cohesive component within it. A common way to find such a component is to look for fields and methods that share the same prefix or suffix. Once you have determined which members belong together, you can apply the Extract Class refactoring, as sketched below. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often quicker to apply.
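For illustration, here is a minimal sketch of what Extract Class could look like for this file: the `iwe_*` methods share a prefix and the smoothing/clipping settings, so they could move into a dedicated weight-estimator class that the classifier delegates to. The class and method names below (`ImportanceWeightEstimator`, `estimate`) are hypothetical and not part of the library; only one estimator is shown.

```python
# Hypothetical Extract Class sketch: pull the iwe_* estimators out of
# ImportanceWeightedClassifier into their own small class.
import numpy as np
import scipy.stats as st


class ImportanceWeightEstimator(object):
    """Estimate importance weights p_target(x) / p_source(x)."""

    def __init__(self, method='kde', smoothing=True, clip=-1):
        self.method = method
        self.smoothing = smoothing
        self.clip = clip

    def estimate(self, X, Z):
        """Return one weight per source sample in X, given target samples Z."""
        if self.method == 'kde':
            # Ratio of kernel density estimates, as in iwe_kernel_densities
            pT = st.gaussian_kde(Z.T).pdf(X.T)
            pS = st.gaussian_kde(X.T).pdf(X.T)
            iw = pT / pS
        else:
            raise NotImplementedError('Estimator not implemented.')
        # Optional weight clipping, mirroring the classifier's clip parameter
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))
        return iw


# The classifier would then delegate instead of owning five iwe_* methods:
#     w = ImportanceWeightEstimator(method='kde').estimate(X, Z)
#     self.clf.fit(X, y, sample_weight=w)
```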
#!/usr/bin/env python

# Imports used by the class below (not shown in the original report excerpt)
import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from cvxopt import matrix, solvers

# is_pos_def: positive-definiteness check from the package's utilities
from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        Examples
        --------
        >>> clf = ImportanceWeightedClassifier()

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained (underscored to avoid shadowing the
        # is_trained() method)
        self._is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_ratio_gaussians(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check for positive-definiteness of covariance matrices
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not positive-definite.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_densities(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute probabilities based on kernel density estimates
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_logistic_discrimination(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to target using cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0],
                                  method='predict_proba')[:, 1]

        # Return predictions for source samples
        return preds[:N]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_nearest_neighbours(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distance between samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_mean_matching(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Check for negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = N/M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT, which minimizes (1/2)*w'*Q*w + p'*w; the cross
        # term of the kernel mean matching objective enters with a minus sign
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N/np.sqrt(N) + N], ndmin=2),
                                   np.array([N/np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train a weighted classifier
        if self.loss == 'logistic':
            # Logistic regression model with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'quadratic':
            # Least-squares model with sample weights
            self.clf.fit(X, y, w)
        elif self.loss == 'hinge':
            # Linear support vector machine with sample weights
            self.clf.fit(X, y, w)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Mark classifier as trained
        self._is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on a new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)
        u_pred = clf.predict(Z)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self._is_trained:
            if not self.train_data_dim == D:
                raise ValueError('Test data is of different dimensionality '
                                 'than training data.')

        # Call scikit-learn's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss, map real-valued predictions back to class labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self._is_trained
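For reference, a minimal usage sketch of the class listed above, following its fit/predict docstring examples. The import path and the toy data are illustrative assumptions, not taken from the report.

```python
# Minimal usage sketch; the import path assumes the libTLDA package layout.
import numpy as np
from libtlda.iw import ImportanceWeightedClassifier

# Labelled source data and unlabelled target data shifted by +1
X = np.random.randn(20, 2)
y = np.concatenate((np.zeros(10), np.ones(10)))
Z = np.random.randn(20, 2) + 1

# Fit with the logistic-regression weight estimator, then predict on the target
clf = ImportanceWeightedClassifier(loss='logistic', iwe='lr')
clf.fit(X, y, Z)
preds = clf.predict(Z)
```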