|
1
|
|
|
import numpy as np |
|
|
|
|
|
|
2
|
|
|
|
|
3
|
|
|
import sklearn.cross_validation as skl_cross_validation |
|
|
|
|
|
|
4
|
|
|
|
|
5
|
|
|
from Orange.data import Table |
|
6
|
|
|
|
|
7
|
|
|
# Names exported by ``from <module> import *``.
__all__ = [
    "Results",
    "CrossValidation",
    "LeaveOneOut",
    "TestOnTrainingData",
    "ShuffleSplit",
    "TestOnTestData",
    "sample",
]
|
9
|
|
|
|
|
10
|
|
|
|
|
11
|
|
|
class Results:
    """
    Class for storing predictions in model testing.

    .. attribute:: data

        Data used for testing (optional; can be `None`). When data is stored,
        this is typically not a copy but a reference.

    .. attribute:: row_indices

        Indices of rows in :obj:`data` that were used in testing, stored as
        a numpy vector of length `nrows`. Values of `actual[i]`, `predicted[i]`
        and `probabilities[i]` refer to the target value of instance
        `data[row_indices[i]]`. `None` if the number of rows is not known.

    .. attribute:: nrows

        The number of test instances (including duplicates); `None` if it is
        not known.

    .. attribute:: models

        A list of induced models (optional; can be `None`).

    .. attribute:: actual

        Actual values of target variable; a numpy vector of length `nrows` and
        of the same type as `data` (or `np.float32` if the type of data cannot
        be determined). `None` if the number of rows is not known.

    .. attribute:: predicted

        Predicted values of target variable; a numpy array of shape
        (number-of-methods, `nrows`) and of the same type as `data` (or
        `np.float32` if the type of data cannot be determined). `None` if the
        shape is not known.

    .. attribute:: probabilities

        Predicted probabilities (for discrete target variables); a numpy array
        of shape (number-of-methods, `nrows`, number-of-classes) of type
        `np.float32`. `None` if not applicable or if the shape is not known.

    .. attribute:: folds

        A list of indices (or slice objects) corresponding to rows of each
        fold; `None` if not applicable.

    .. attribute:: failed

        A list with one element per method: `False`, or the exception that
        was raised when training that method.
    """

    # noinspection PyBroadException
    # noinspection PyNoneFunctionAssignment
    def __init__(self, data=None, nmethods=0, nrows=None, nclasses=None,
                 store_data=False, store_models=False, domain=None,
                 actual=None, row_indices=None,
                 predicted=None, probabilities=None,
                 preprocessor=None, callback=None):
        """
        Construct an instance with default values: `None` for :obj:`data` and
        :obj:`models`.

        If the number of rows, methods and/or classes is not given, it is
        inferred from :obj:`data` and from the given arrays, if provided.
        The data type for :obj:`actual` and :obj:`predicted` is determined
        from the data; if it cannot be determined, `np.float32` is used.

        Attribute :obj:`actual` and :obj:`row_indices` are constructed as empty
        (uninitialized) arrays of the appropriate size, if the number of rows
        is known. Attribute :obj:`predicted` is constructed if the number of
        rows and of methods is given; :obj:`probabilities` also requires
        knowing the number of classes. Arrays that cannot be constructed are
        set to `None`.

        :param data: Data or domain
        :type data: Orange.data.Table or Orange.data.Domain
        :param nmethods: The number of methods that will be tested; 0 (the
            default) means "infer from `predicted`/`probabilities` if given"
        :type nmethods: int
        :param nrows: The number of test instances (including duplicates)
        :type nrows: int
        :param nclasses: The number of class values
        :type nclasses: int
        :param store_data: A flag that tells whether to store the data;
            this argument can be given only as keyword argument
        :type store_data: bool
        :param store_models: A flag that tells whether to store the models;
            this argument can be given only as keyword argument
        :type store_models: bool
        :param domain: Domain of the data; must match `data.domain` if both
            are given
        :param actual: Precomputed vector of actual values
        :param row_indices: Precomputed vector of row indices
        :param predicted: Precomputed array of predictions
        :param probabilities: Precomputed array of predicted probabilities
        :param preprocessor: Preprocessor for training data
        :type preprocessor: Orange.preprocess.Preprocess
        :param callback: Function for reporting back the progress as a value
            between 0 and 1
        :type callback: callable
        :raises ValueError: if the given sizes, shapes or domains contradict
            each other, or if probabilities are given for a continuous class
        """
        self.store_data = store_data
        self.store_models = store_models
        self.data = data if store_data else None
        self.models = None
        self.folds = None
        self.preprocessor = preprocessor
        self.callback = callback
        # Define every result array up front so that later code (e.g.
        # `get_fold`) can test them against `None` instead of hitting
        # AttributeError when a shape is unknown or the class is continuous.
        self.actual = None
        self.row_indices = None
        self.predicted = None
        self.probabilities = None
        dtype = np.float32

        def set_or_raise(value, exp_values, msg):
            # `False` and `None` both mean "this source gives no information".
            for exp_value in exp_values:
                if exp_value is False or exp_value is None:
                    continue
                if value is None:
                    value = exp_value
                elif value != exp_value:
                    raise ValueError(msg)
            return value

        domain = self.domain = set_or_raise(
            domain, [data is not None and data.domain],
            "mismatching domain")
        nrows = self.nrows = set_or_raise(
            nrows, [data is not None and len(data),
                    actual is not None and len(actual),
                    row_indices is not None and len(row_indices),
                    predicted is not None and predicted.shape[1],
                    probabilities is not None and probabilities.shape[1]],
            "mismatching number of rows")

        if domain is not None and domain.has_continuous_class:
            # A continuous target has neither class values nor probabilities.
            if nclasses is not None:
                raise ValueError("mismatching number of class values")
            if probabilities is not None:
                raise ValueError(
                    "regression results cannot have 'probabilities'")
        nclasses = set_or_raise(
            nclasses, [len(domain.class_var.values)
                       if domain is not None and domain.has_discrete_class
                       else None,
                       probabilities is not None and probabilities.shape[2]],
            "mismatching number of class values")

        # The default `nmethods=0` must not contradict the shapes of given
        # arrays, so zero is treated as "unspecified" while inferring.
        inferred_nmethods = set_or_raise(
            nmethods if nmethods else None,
            [predicted is not None and predicted.shape[0],
             probabilities is not None and probabilities.shape[0]],
            "mismatching number of methods")
        if inferred_nmethods is not None:
            nmethods = inferred_nmethods

        try:
            dtype = data.Y.dtype
        except AttributeError:  # no data or no Y or not numpy
            pass

        if nmethods is not None:
            self.failed = [False] * nmethods

        if actual is not None:
            self.actual = actual
        elif nrows is not None:
            self.actual = np.empty(nrows, dtype=dtype)

        if row_indices is not None:
            self.row_indices = row_indices
        elif nrows is not None:
            self.row_indices = np.empty(nrows, dtype=np.int32)

        if predicted is not None:
            self.predicted = predicted
        elif nmethods is not None and nrows is not None:
            self.predicted = np.empty((nmethods, nrows), dtype=dtype)

        if probabilities is not None:
            self.probabilities = probabilities
        elif nmethods is not None and nrows is not None and \
                nclasses is not None:
            self.probabilities = \
                np.empty((nmethods, nrows, nclasses), dtype=np.float32)

    def train_if_succ(self, learner_index, learner, data):
        """
        Train `learner` on `data` unless the learner has already failed.

        Any exception raised by the learner is stored in
        `self.failed[learner_index]` so that the remaining learners can
        still be evaluated.

        :return: the trained model, or `False` if the learner failed now or
            in an earlier call
        """
        if self.failed[learner_index]:
            return False
        try:
            return learner(data)
        except Exception as ex:
            self.failed[learner_index] = ex
            return False

    def call_callback(self, progress):
        """Report `progress` to the callback, if one was given."""
        if self.callback:
            self.callback(progress)

    def get_fold(self, fold):
        """
        Return a new :obj:`Results` instance with the results of one fold.

        :param fold: index into :obj:`folds`
        :raises ValueError: if this instance has no folds
        """
        if self.folds is None:
            raise ValueError("This 'Results' instance does not have folds.")

        rows = self.folds[fold]
        results = Results()
        results.data = self.data
        results.domain = self.domain

        if self.models is not None:
            results.models = self.models[fold]

        results.row_indices = self.row_indices[rows]
        results.actual = self.actual[rows]
        results.predicted = self.predicted[:, rows]
        results.nrows = len(results.actual)

        # Regression results carry no probabilities; tolerate instances
        # on which the attribute was never set.
        probabilities = getattr(self, "probabilities", None)
        if probabilities is not None:
            results.probabilities = probabilities[:, rows]

        return results
|
204
|
|
|
|
|
205
|
|
|
|
|
206
|
|
|
class CrossValidation(Results):
    """
    K-fold cross validation.

    Constructing an instance with the data and a list of learning algorithms
    runs the cross validation; the instance then holds the predicted values
    and, for a discrete class, the predicted probabilities.

    .. attribute:: k

        The number of folds.

    .. attribute:: random_state

        Random state passed to the scikit-learn fold generator.
    """

    def __init__(self, data, learners, k=10, random_state=0, store_data=False,
                 store_models=False, preprocessor=None, callback=None):
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        self.k = k
        self.random_state = random_state

        class_values = data.Y.copy().flatten()
        # Folds are stratified by class value when the class is discrete.
        if data.domain.has_discrete_class:
            splitter = skl_cross_validation.StratifiedKFold(
                class_values, self.k, shuffle=True,
                random_state=self.random_state
            )
        else:
            splitter = skl_cross_validation.KFold(
                len(class_values), self.k, shuffle=True,
                random_state=self.random_state
            )

        self.folds = []
        if self.store_models:
            self.models = []

        n_learners = len(learners)
        total_steps = n_learners * self.k
        offset = 0  # position of the current fold in the result arrays
        for fold_no, (train_rows, test_rows) in enumerate(splitter):
            train_data = data[train_rows]
            test_data = data[test_rows]
            if self.preprocessor is not None:
                train_data = self.preprocessor(train_data)
            if len(test_data) == 0:
                raise RuntimeError("One of the test folds is empty.")

            rows = slice(offset, offset + len(test_rows))
            offset += len(test_rows)
            self.folds.append(rows)
            self.row_indices[rows] = test_rows
            self.actual[rows] = test_data.Y.flatten()

            if self.store_models:
                fold_models = [None] * n_learners
                self.models.append(fold_models)

            for idx, learner in enumerate(learners):
                model = self.train_if_succ(idx, learner, train_data)
                self.call_callback(
                    (fold_no * n_learners + idx) / total_steps)
                if not model:
                    # Learner failed; its rows stay uninitialized.
                    continue
                if self.store_models:
                    fold_models[idx] = model

                if data.domain.has_discrete_class:
                    values, probs = model(test_data, model.ValueProbs)
                    self.predicted[idx][rows] = values
                    self.probabilities[idx][rows, :] = probs
                elif data.domain.has_continuous_class:
                    values = model(test_data, model.Value)
                    self.predicted[idx][rows] = values

        self.call_callback(1)
|
273
|
|
|
|
|
274
|
|
|
|
|
275
|
|
|
class LeaveOneOut(Results):
    """
    Leave-one-out testing.

    Every instance is used once as a single-row test set while the models
    are trained on all the remaining instances.
    """

    def __init__(self, data, learners, store_data=False, store_models=False,
                 preprocessor=None, callback=None):
        """
        Run leave-one-out testing of `learners` on `data`.

        :param data: data to test on
        :param learners: a list of learning algorithms
        :param store_data: a flag that tells whether to store the data
        :param store_models: a flag that tells whether to store the models
        :param preprocessor: preprocessor applied to each training set
        :param callback: function for reporting progress as a value between
            0 and 1
        """
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        domain = data.domain
        # Work on copies: rows are swapped in place in the loop below.
        X = data.X.copy()
        Y = data._Y.copy()
        metas = data.metas.copy()

        # Row 0 is the test instance and the rest is the training set. The
        # slices are taken once, before the loop; the loop only swaps rows
        # of the underlying arrays.
        teX, trX = X[:1], X[1:]
        teY, trY = Y[:1], Y[1:]
        te_metas, tr_metas = metas[:1], metas[1:]
        if data.has_weights():
            W = data.W.copy()
            teW, trW = W[:1], W[1:]
        else:
            W = teW = trW = None

        self.row_indices = np.arange(len(data))
        if self.store_models:
            self.models = []
        # `flatten` copies, so `actual` keeps the original row order.
        self.actual = Y.flatten()
        nmethods = len(learners)
        n_callbacks = nmethods * len(data)
        for test_idx in self.row_indices:
            # Position `test_idx` has not been touched by earlier swaps, so
            # this moves original row `test_idx` to position 0.
            X[[0, test_idx]] = X[[test_idx, 0]]
            Y[[0, test_idx]] = Y[[test_idx, 0]]
            metas[[0, test_idx]] = metas[[test_idx, 0]]
            # Compare with None: the truth value of a multi-element numpy
            # array is ambiguous and raises ValueError.
            if W is not None:
                W[[0, test_idx]] = W[[test_idx, 0]]
            test_data = Table.from_numpy(domain, teX, teY, te_metas, teW)
            train_data = Table.from_numpy(domain, trX, trY, tr_metas, trW)
            if self.preprocessor is not None:
                train_data = self.preprocessor(train_data)
            if self.store_models:
                fold_models = [None] * nmethods
                self.models.append(fold_models)
            for i, learner in enumerate(learners):
                model = self.train_if_succ(i, learner, train_data)
                self.call_callback((test_idx * nmethods + i) / n_callbacks)
                if not model:
                    continue
                if self.store_models:
                    fold_models[i] = model

                if data.domain.has_discrete_class:
                    values, probs = model(test_data, model.ValueProbs)
                    self.predicted[i][test_idx] = values
                    self.probabilities[i][test_idx, :] = probs
                elif data.domain.has_continuous_class:
                    values = model(test_data, model.Value)
                    self.predicted[i][test_idx] = values
        self.call_callback(1)
|
331
|
|
|
|
|
332
|
|
|
|
|
333
|
|
|
class TestOnTrainingData(Results):
    """
    Train and test on the same data.

    The preprocessor, if given, is applied to the training data only;
    predictions are made on `data` as passed in.
    """

    def __init__(self, data, learners, store_data=False, store_models=False,
                 preprocessor=None, callback=None):
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        self.row_indices = np.arange(len(data))
        n_learners = len(learners)
        if self.store_models:
            # A single "fold" holding one model slot per learner.
            models = [None] * n_learners
            self.models = [models]
        self.actual = data.Y.flatten()

        train_data = data
        if self.preprocessor is not None:
            train_data = self.preprocessor(data)

        for idx, learner in enumerate(learners):
            model = self.train_if_succ(idx, learner, train_data)
            self.call_callback(idx / n_learners)
            if not model:
                continue
            if self.store_models:
                models[idx] = model

            if data.domain.has_discrete_class:
                values, probs = model(data, model.ValueProbs)
                self.predicted[idx] = values
                self.probabilities[idx] = probs
            elif data.domain.has_continuous_class:
                values = model(data, model.Value)
                self.predicted[idx] = values

        self.call_callback(1)
|
366
|
|
|
|
|
367
|
|
|
|
|
368
|
|
|
class ShuffleSplit(Results):
    """
    Repeated random splitting of the data into training and test sets.

    The results of all resamples are concatenated; `folds` holds one slice
    per resample. Predictions (and probabilities) of a learner that failed
    are filled with zeros.
    """

    def __init__(self, data, learners, n_resamples=10, train_size=None,
                 test_size=0.1, random_state=0, store_data=False,
                 store_models=False, preprocessor=None, callback=None):
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        self.store_models = store_models
        self.n_resamples = n_resamples
        self.train_size = train_size
        self.test_size = test_size
        self.random_state = random_state

        splitter = skl_cross_validation.ShuffleSplit(
            len(data), n_iter=self.n_resamples, train_size=self.train_size,
            test_size=test_size, random_state=self.random_state
        )

        self.folds = []
        if self.store_models:
            self.models = []

        # Results are gathered per resample and stacked after the loop.
        all_rows = []
        all_actual = []
        all_predicted = [[] for _ in learners]
        all_probs = [[] for _ in learners]
        n_learners = len(learners)
        total_steps = self.n_resamples * n_learners
        offset = 0
        for resample_no, (train_rows, test_rows) in enumerate(splitter):
            train_data = data[train_rows]
            test_data = data[test_rows]
            if preprocessor is not None:
                train_data = self.preprocessor(train_data)
            self.folds.append(slice(offset, offset + len(test_rows)))
            all_rows.append(test_rows)
            all_actual.append(test_data.Y.flatten())
            if self.store_models:
                fold_models = [None] * n_learners
                self.models.append(fold_models)

            for idx, learner in enumerate(learners):
                model = self.train_if_succ(idx, learner, train_data)
                self.call_callback(
                    (resample_no * n_learners + idx) / total_steps)
                if not model:
                    # Keep the arrays aligned for failed learners.
                    all_predicted[idx].append(np.zeros((len(test_data),)))
                    if data.domain.has_discrete_class:
                        all_probs[idx].append(
                            np.zeros((len(test_data),
                                      len(data.domain.class_var.values))))
                    continue

                if self.store_models:
                    fold_models[idx] = model
                if data.domain.has_discrete_class:
                    values, probs = model(test_data, model.ValueProbs)
                    all_predicted[idx].append(values)
                    all_probs[idx].append(probs)
                elif data.domain.has_continuous_class:
                    values = model(test_data, model.Value)
                    all_predicted[idx].append(values)

            offset += len(test_rows)

        stacked_actual = np.hstack(all_actual)
        stacked_predicted = np.array(
            [np.hstack(pred) for pred in all_predicted])
        n_rows = len(stacked_actual)
        n_models = len(stacked_predicted)

        self.nrows = n_rows
        self.row_indices = np.hstack(all_rows)
        self.actual = stacked_actual
        self.predicted = stacked_predicted.reshape(n_models, n_rows)
        if data.domain.has_discrete_class:
            self.probabilities = np.array(
                [np.vstack(prob) for prob in all_probs])
        self.call_callback(1)
|
445
|
|
|
|
|
446
|
|
|
|
|
447
|
|
|
class TestOnTestData(Results):
    """
    Train on one data set and test on a separate test data set.

    The preprocessor, if given, is applied to the training data only.
    """

    def __init__(self, train_data, test_data, learners, store_data=False,
                 store_models=False, preprocessor=None, callback=None):
        super().__init__(test_data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        n_learners = len(learners)
        if self.store_models:
            self.models = [None] * n_learners

        self.row_indices = np.arange(len(test_data))
        self.actual = test_data.Y.flatten()

        if self.preprocessor is not None:
            train_data = self.preprocessor(train_data)

        for idx, learner in enumerate(learners):
            model = self.train_if_succ(idx, learner, train_data)
            self.call_callback(idx / n_learners)
            if not model:
                continue

            # The class type is taken from the training domain.
            if train_data.domain.has_discrete_class:
                values, probs = model(test_data, model.ValueProbs)
                self.predicted[idx] = values
                self.probabilities[idx][:, :] = probs
            elif train_data.domain.has_continuous_class:
                values = model(test_data, model.Value)
                self.predicted[idx] = values

            if self.store_models:
                self.models[idx] = model

        self.nrows = len(test_data)
        # All test rows form a single fold.
        self.folds = [slice(0, len(test_data))]
        self.call_callback(1)
|
483
|
|
|
|
|
484
|
|
|
|
|
485
|
|
|
def sample(table, n=0.7, stratified=False, replace=False,
           random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a data set from input data table that are not in the sample. Also
    uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets. Ignored when sampling with replacement.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """

    # isinstance also accepts float subclasses such as np.float64.
    if isinstance(n, float):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        elif isinstance(random_state, np.random.RandomState):
            # Use the given generator directly instead of seeding with it.
            rgen = random_state
        else:
            rgen = np.random.RandomState(random_state)
        # `randint` excludes the upper bound, so this draws from
        # 0..len(table)-1 like the deprecated
        # `random_integers(0, len(table) - 1, n)`.
        chosen = rgen.randint(0, len(table), n)
        not_chosen = np.ones(len(table), dtype=bool)
        not_chosen[chosen] = False
        others = np.nonzero(not_chosen)[0]
        return table[chosen], table[others]

    # The splitters are parameterized by the size of the *test* part, which
    # here is the complement of the requested sample.
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        # The test part is made at least as large as the number of classes.
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(
            len(table), n_iter=1,
            test_size=n, random_state=random_state)
    ind = next(iter(ind))
    return table[ind[0]], table[ind[1]]
|
540
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.