1
|
|
|
import numpy as np |
|
|
|
|
2
|
|
|
|
3
|
|
|
import sklearn.cross_validation as skl_cross_validation |
|
|
|
|
4
|
|
|
|
5
|
|
|
from Orange.data import Table |
6
|
|
|
|
7
|
|
|
__all__ = ["Results", "CrossValidation", "LeaveOneOut", "TestOnTrainingData", |
8
|
|
|
"ShuffleSplit", "TestOnTestData", "sample"] |
9
|
|
|
|
10
|
|
|
|
11
|
|
|
class Results: |
12
|
|
|
""" |
13
|
|
|
Class for storing predictions in model testing. |
14
|
|
|
|
15
|
|
|
.. attribute:: data |
16
|
|
|
|
17
|
|
|
Data used for testing (optional; can be `None`). When data is stored, |
18
|
|
|
this is typically not a copy but a reference. |
19
|
|
|
|
20
|
|
|
.. attribute:: row_indices |
21
|
|
|
|
22
|
|
|
Indices of rows in :obj:`data` that were used in testing, stored as |
23
|
|
|
a numpy vector of length `nrows`. Values of `actual[i]`, `predicted[i]` |
24
|
|
|
and `probabilities[i]` refer to the target value of instance |
25
|
|
|
`data[row_indices[i]]`. |
26
|
|
|
|
27
|
|
|
.. attribute:: nrows |
28
|
|
|
|
29
|
|
|
The number of test instances (including duplicates). |
30
|
|
|
|
31
|
|
|
.. attribute:: models |
32
|
|
|
|
33
|
|
|
A list of induced models (optional; can be `None`). |
34
|
|
|
|
35
|
|
|
.. attribute:: actual |
36
|
|
|
|
37
|
|
|
Actual values of target variable; a numpy vector of length `nrows` and |
38
|
|
|
of the same type as `data` (or `np.float32` if the type of data cannot |
39
|
|
|
be determined). |
40
|
|
|
|
41
|
|
|
.. attribute:: predicted |
42
|
|
|
|
43
|
|
|
Predicted values of target variable; a numpy array of shape |
44
|
|
|
(number-of-methods, `nrows`) and of the same type as `data` (or |
45
|
|
|
`np.float32` if the type of data cannot be determined). |
46
|
|
|
|
47
|
|
|
.. attribute:: probabilities |
48
|
|
|
|
49
|
|
|
Predicted probabilities (for discrete target variables); a numpy array |
50
|
|
|
of shape (number-of-methods, `nrows`, number-of-classes) of type |
51
|
|
|
`np.float32`. |
52
|
|
|
|
53
|
|
|
.. attribute:: folds |
54
|
|
|
|
55
|
|
|
A list of indices (or slice objects) corresponding to rows of each |
56
|
|
|
fold; `None` if not applicable. |
57
|
|
|
""" |
58
|
|
|
|
59
|
|
|
# noinspection PyBroadException |
60
|
|
|
# noinspection PyNoneFunctionAssignment |
61
|
|
|
def __init__(self, data=None, nmethods=0, nrows=None, nclasses=None, |
62
|
|
|
store_data=False, store_models=False, domain=None, |
63
|
|
|
actual=None, row_indices=None, |
64
|
|
|
predicted=None, probabilities=None, |
65
|
|
|
preprocessor=None, callback=None): |
66
|
|
|
""" |
67
|
|
|
Construct an instance with default values: `None` for :obj:`data` and |
68
|
|
|
:obj:`models`. |
69
|
|
|
|
70
|
|
|
If the number of rows and/or the number of classes is not given, it is |
71
|
|
|
inferred from :obj:`data`, if provided. The data type for |
72
|
|
|
:obj:`actual` and :obj:`predicted` is determined from the data; if the |
73
|
|
|
latter cannot be find, `np.float32` is used. |
74
|
|
|
|
75
|
|
|
Attribute :obj:`actual` and :obj:`row_indices` are constructed as empty |
76
|
|
|
(uninitialized) arrays of the appropriate size, if the number of rows |
77
|
|
|
is known. Attribute :obj:`predicted` is constructed if the number of |
78
|
|
|
rows and of methods is given; :obj:`probabilities` also requires |
79
|
|
|
knowing the number of classes. |
80
|
|
|
|
81
|
|
|
:param data: Data or domain |
82
|
|
|
:type data: Orange.data.Table or Orange.data.Domain |
83
|
|
|
:param nmethods: The number of methods that will be tested |
84
|
|
|
:type nmethods: int |
85
|
|
|
:param nrows: The number of test instances (including duplicates) |
86
|
|
|
:type nrows: int |
87
|
|
|
:param nclasses: The number of class values |
88
|
|
|
:type nclasses: int |
89
|
|
|
:param store_data: A flag that tells whether to store the data; |
90
|
|
|
this argument can be given only as keyword argument |
91
|
|
|
:type store_data: bool |
92
|
|
|
:param store_models: A flag that tells whether to store the models; |
93
|
|
|
this argument can be given only as keyword argument |
94
|
|
|
:type store_models: bool |
95
|
|
|
:param preprocessor: Preprocessor for training data |
96
|
|
|
:type preprocessor: Orange.preprocess.Preprocess |
97
|
|
|
:param callback: Function for reporting back the progress as a value |
98
|
|
|
between 0 and 1 |
99
|
|
|
:type callback: callable |
100
|
|
|
""" |
101
|
|
|
self.store_data = store_data |
102
|
|
|
self.store_models = store_models |
103
|
|
|
self.data = data if store_data else None |
104
|
|
|
self.models = None |
105
|
|
|
self.folds = None |
106
|
|
|
dtype = np.float32 |
107
|
|
|
self.preprocessor = preprocessor |
108
|
|
|
self.callback = callback |
109
|
|
|
|
110
|
|
|
def set_or_raise(value, exp_values, msg): |
111
|
|
|
for exp_value in exp_values: |
112
|
|
|
if exp_value is False: |
113
|
|
|
continue |
114
|
|
|
if value is None: |
115
|
|
|
value = exp_value |
116
|
|
|
elif value != exp_value: |
117
|
|
|
raise ValueError(msg) |
118
|
|
|
return value |
119
|
|
|
|
120
|
|
|
domain = self.domain = set_or_raise( |
121
|
|
|
domain, [data is not None and data.domain], |
122
|
|
|
"mismatching domain") |
123
|
|
|
nrows = set_or_raise( |
124
|
|
|
nrows, [data is not None and len(data), |
125
|
|
|
actual is not None and len(actual), |
126
|
|
|
row_indices is not None and len(row_indices), |
127
|
|
|
predicted is not None and predicted.shape[1], |
128
|
|
|
probabilities is not None and probabilities.shape[1]], |
129
|
|
|
"mismatching number of rows") |
130
|
|
|
nclasses = set_or_raise( |
131
|
|
|
nclasses, [domain and (len(domain.class_var.values) |
132
|
|
|
if domain.has_discrete_class |
133
|
|
|
else None), |
134
|
|
|
probabilities is not None and probabilities.shape[2]], |
135
|
|
|
"mismatching number of class values") |
136
|
|
|
if nclasses is not None and probabilities is not None: |
137
|
|
|
raise ValueError("regression results cannot have 'probabilities'") |
138
|
|
|
nmethods = set_or_raise( |
139
|
|
|
nmethods, [predicted is not None and predicted.shape[0], |
140
|
|
|
probabilities is not None and probabilities.shape[0]], |
141
|
|
|
"mismatching number of methods") |
142
|
|
|
try: |
143
|
|
|
dtype = data.Y.dtype |
144
|
|
|
except AttributeError: # no data or no Y or not numpy |
|
|
|
|
145
|
|
|
pass |
146
|
|
|
|
147
|
|
|
if nmethods is not None: |
148
|
|
|
self.failed = [False] * nmethods |
149
|
|
|
|
150
|
|
|
if actual is not None: |
151
|
|
|
self.actual = actual |
152
|
|
|
elif nrows is not None: |
153
|
|
|
self.actual = np.empty(nrows, dtype=dtype) |
154
|
|
|
|
155
|
|
|
if row_indices is not None: |
156
|
|
|
self.row_indices = row_indices |
157
|
|
|
elif nrows is not None: |
158
|
|
|
self.row_indices = np.empty(nrows, dtype=np.int32) |
|
|
|
|
159
|
|
|
|
160
|
|
|
if predicted is not None: |
161
|
|
|
self.predicted = predicted |
162
|
|
|
elif nmethods is not None and nrows is not None: |
163
|
|
|
self.predicted = np.empty((nmethods, nrows), dtype=dtype) |
164
|
|
|
|
165
|
|
|
if probabilities is not None: |
166
|
|
|
self.probabilities = probabilities |
167
|
|
|
elif nmethods is not None and nrows is not None and \ |
168
|
|
|
nclasses is not None: |
169
|
|
|
self.probabilities = \ |
170
|
|
|
np.empty((nmethods, nrows, nclasses), dtype=np.float32) |
171
|
|
|
|
172
|
|
|
def train_if_succ(self, learner_index, learner, data): |
173
|
|
|
if self.failed[learner_index]: |
174
|
|
|
return False |
175
|
|
|
try: |
176
|
|
|
return learner(data) |
177
|
|
|
except Exception as ex: |
|
|
|
|
178
|
|
|
self.failed[learner_index] = ex |
179
|
|
|
return False |
180
|
|
|
|
181
|
|
|
def call_callback(self, progress): |
182
|
|
|
if self.callback: |
183
|
|
|
self.callback(progress) |
184
|
|
|
|
185
|
|
|
def get_fold(self, fold): |
186
|
|
|
results = Results() |
187
|
|
|
results.data = self.data |
188
|
|
|
|
189
|
|
|
if self.folds is None: |
190
|
|
|
raise ValueError("This 'Results' instance does not have folds.") |
191
|
|
|
|
192
|
|
|
if self.models is not None: |
193
|
|
|
results.models = self.models[fold] |
194
|
|
|
|
195
|
|
|
results.row_indices = self.row_indices[self.folds[fold]] |
196
|
|
|
results.actual = self.actual[self.folds[fold]] |
197
|
|
|
results.predicted = self.predicted[:, self.folds[fold]] |
198
|
|
|
results.domain = self.domain |
199
|
|
|
|
200
|
|
|
if self.probabilities is not None: |
201
|
|
|
results.probabilities = self.probabilities[:, self.folds[fold]] |
202
|
|
|
|
203
|
|
|
return results |
204
|
|
|
|
205
|
|
|
|
206
|
|
|
class CrossValidation(Results):
    """
    K-fold cross validation.

    If the constructor is given the data and a list of learning algorithms, it
    runs cross validation and returns an instance of `Results` containing the
    predicted values and probabilities.

    .. attribute:: k

        The number of folds.

    .. attribute:: random_state

        Seed forwarded to scikit-learn's fold splitters, so the shuffled
        fold assignment is reproducible.
    """
    def __init__(self, data, learners, k=10, random_state=0, store_data=False,
                 store_models=False, preprocessor=None, callback=None):
        # Results.__init__ allocates actual/predicted/probabilities arrays
        # sized for all of `data`; the loop below fills them slice by slice.
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        self.k = k
        self.random_state = random_state
        Y = data.Y.copy().flatten()
        # Stratify the folds by class value when the class is discrete.
        if data.domain.has_discrete_class:
            indices = skl_cross_validation.StratifiedKFold(
                Y, self.k, shuffle=True, random_state=self.random_state
            )
        else:
            indices = skl_cross_validation.KFold(
                len(Y), self.k, shuffle=True, random_state=self.random_state
            )

        self.folds = []
        if self.store_models:
            self.models = []
        ptr = 0  # start of the current fold's slice in the result arrays
        nmethods = len(learners)
        n_callbacks = nmethods * self.k
        for fold_idx, (train, test) in enumerate(indices):
            train_data, test_data = data[train], data[test]
            # Preprocess only the training part; the test fold stays raw.
            if self.preprocessor is not None:
                train_data = self.preprocessor(train_data)
            if len(test_data) == 0:
                raise RuntimeError("One of the test folds is empty.")
            # Each fold occupies a contiguous slice of the result arrays.
            fold_slice = slice(ptr, ptr + len(test))
            self.folds.append(fold_slice)
            self.row_indices[fold_slice] = test
            self.actual[fold_slice] = test_data.Y.flatten()
            if self.store_models:
                fold_models = [None] * nmethods
                self.models.append(fold_models)
            for i, learner in enumerate(learners):
                # train_if_succ returns False (and records the exception) on
                # failure, so one bad learner cannot abort the whole run;
                # its slots in `predicted`/`probabilities` stay unfilled.
                model = self.train_if_succ(i, learner, train_data)
                self.call_callback((fold_idx * nmethods + i) / n_callbacks)
                if not model:
                    continue
                if self.store_models:
                    fold_models[i] = model
                if data.domain.has_discrete_class:
                    values, probs = model(test_data, model.ValueProbs)
                    self.predicted[i][fold_slice] = values
                    self.probabilities[i][fold_slice, :] = probs
                elif data.domain.has_continuous_class:
                    values = model(test_data, model.Value)
                    self.predicted[i][fold_slice] = values
            ptr += len(test)
        self.call_callback(1)
273
|
|
|
|
274
|
|
|
|
275
|
|
|
class LeaveOneOut(Results):
    """Leave-one-out testing: each instance is once the (single) test set."""

    def __init__(self, data, learners, store_data=False, store_models=False,
                 preprocessor=None, callback=None):
        """
        :param data: data on which leave-one-out is run
        :type data: Orange.data.Table
        :param learners: learning algorithms to be tested
        :param store_data: store a reference to `data` on the results
        :param store_models: store the model trained for each left-out row
        :param preprocessor: applied to each training set before learning
        :param callback: progress callback receiving values between 0 and 1
        """
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        domain = data.domain
        # Work on copies: rows are swapped in place below.
        X = data.X.copy()
        Y = data._Y.copy()
        metas = data.metas.copy()

        # Views into the copies: row 0 is always the test instance,
        # rows 1.. form the training set.
        teX, trX = X[:1], X[1:]
        teY, trY = Y[:1], Y[1:]
        te_metas, tr_metas = metas[:1], metas[1:]
        if data.has_weights():
            W = data.W.copy()
            teW, trW = W[:1], W[1:]
        else:
            W = teW = trW = None

        self.row_indices = np.arange(len(data))
        if self.store_models:
            self.models = []
        # Y.flatten() copies, so later in-place swaps don't disturb `actual`.
        self.actual = Y.flatten()
        nmethods = len(learners)
        n_callbacks = nmethods * len(data)
        for test_idx in self.row_indices:
            # Swap row `test_idx` into position 0; the previously tested row
            # is swapped back out to position `test_idx`, so rows 1.. always
            # hold exactly the other instances.
            X[[0, test_idx]] = X[[test_idx, 0]]
            Y[[0, test_idx]] = Y[[test_idx, 0]]
            metas[[0, test_idx]] = metas[[test_idx, 0]]
            # BUG FIX: `if W:` raises "truth value of an array is ambiguous"
            # for numpy arrays with more than one element; test for None.
            if W is not None:
                W[[0, test_idx]] = W[[test_idx, 0]]
            test_data = Table.from_numpy(domain, teX, teY, te_metas, teW)
            train_data = Table.from_numpy(domain, trX, trY, tr_metas, trW)
            if self.preprocessor is not None:
                train_data = self.preprocessor(train_data)
            if self.store_models:
                fold_models = [None] * nmethods
                self.models.append(fold_models)
            for i, learner in enumerate(learners):
                model = self.train_if_succ(i, learner, train_data)
                self.call_callback((test_idx * nmethods + i) / n_callbacks)
                if not model:
                    continue
                if self.store_models:
                    fold_models[i] = model
                if data.domain.has_discrete_class:
                    values, probs = model(test_data, model.ValueProbs)
                    self.predicted[i][test_idx] = values
                    self.probabilities[i][test_idx, :] = probs
                elif data.domain.has_continuous_class:
                    values = model(test_data, model.Value)
                    self.predicted[i][test_idx] = values
        self.call_callback(1)
331
|
|
|
|
332
|
|
|
|
333
|
|
|
class TestOnTrainingData(Results):
    """Trains each learner on the data and evaluates it on that same data."""

    def __init__(self, data, learners, store_data=False, store_models=False,
                 preprocessor=None, callback=None):
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        # Every row serves as both a training and a test instance.
        self.row_indices = np.arange(len(data))
        n_learners = len(learners)
        if self.store_models:
            trained = [None] * n_learners
            self.models = [trained]  # a single pseudo-fold
        self.actual = data.Y.flatten()

        train_data = data if self.preprocessor is None \
            else self.preprocessor(data)

        for idx, algorithm in enumerate(learners):
            model = self.train_if_succ(idx, algorithm, train_data)
            self.call_callback(idx / n_learners)
            if not model:
                continue
            if self.store_models:
                trained[idx] = model
            # Predictions are always made on the raw (unpreprocessed) data.
            if data.domain.has_discrete_class:
                values, probs = model(data, model.ValueProbs)
                self.predicted[idx] = values
                self.probabilities[idx] = probs
            elif data.domain.has_continuous_class:
                self.predicted[idx] = model(data, model.Value)
        self.call_callback(1)
366
|
|
|
|
367
|
|
|
|
368
|
|
|
class ShuffleSplit(Results):
    """
    Repeated random subsampling: draws `n_resamples` random train/test splits
    of the data and evaluates every learner on each split. Results of all
    resamples are concatenated, with :obj:`folds` holding one slice per split.
    """

    def __init__(self, data, learners, n_resamples=10, train_size=None,
                 test_size=0.1, random_state=0, store_data=False,
                 store_models=False, preprocessor=None, callback=None):
        """
        :param data: data on which the sampling is run
        :type data: Orange.data.Table
        :param learners: learning algorithms to be tested
        :param n_resamples: number of random train/test splits
        :param train_size: train-set size forwarded to the scikit splitter
        :param test_size: test-set size forwarded to the scikit splitter
        :param random_state: seed for reproducible splits
        :param store_data: store a reference to `data` on the results
        :param store_models: store models trained in each resample
        :param preprocessor: applied to each training set before learning
        :param callback: progress callback receiving values between 0 and 1
        """
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        # NOTE: the original re-assigned self.store_models here, but
        # Results.__init__ already sets it to the same value.
        self.n_resamples = n_resamples
        self.train_size = train_size
        self.test_size = test_size
        self.random_state = random_state

        indices = skl_cross_validation.ShuffleSplit(
            len(data), n_iter=self.n_resamples, train_size=self.train_size,
            test_size=test_size, random_state=self.random_state
        )

        self.folds = []
        if self.store_models:
            self.models = []

        # Per-resample pieces; concatenated into flat arrays after the loop.
        row_indices = []
        actual = []
        predicted = [[] for _ in learners]
        probabilities = [[] for _ in learners]
        fold_start = 0
        nmethods = len(learners)
        n_callbacks = self.n_resamples * nmethods
        for samp_idx, (train, test) in enumerate(indices):
            train_data, test_data = data[train], data[test]
            # CONSISTENCY FIX: test self.preprocessor (same object as the
            # parameter) like every other Results subclass in this module.
            if self.preprocessor is not None:
                train_data = self.preprocessor(train_data)
            self.folds.append(slice(fold_start, fold_start + len(test)))
            row_indices.append(test)
            actual.append(test_data.Y.flatten())
            if self.store_models:
                fold_models = [None] * nmethods
                self.models.append(fold_models)

            for i, learner in enumerate(learners):
                model = self.train_if_succ(i, learner, train_data)
                self.call_callback((samp_idx * nmethods + i) / n_callbacks)
                if model:
                    if self.store_models:
                        fold_models[i] = model
                    if data.domain.has_discrete_class:
                        values, probs = model(test_data, model.ValueProbs)
                        predicted[i].append(values)
                        probabilities[i].append(probs)
                    elif data.domain.has_continuous_class:
                        values = model(test_data, model.Value)
                        predicted[i].append(values)
                else:
                    # Failed learners contribute zero predictions so array
                    # shapes stay consistent across methods.
                    predicted[i].append(np.zeros((len(test_data),)))
                    if data.domain.has_discrete_class:
                        probabilities[i].append(
                            np.zeros((len(test_data),
                                      len(data.domain.class_var.values))))

            fold_start += len(test)

        row_indices = np.hstack(row_indices)
        actual = np.hstack(actual)
        predicted = np.array([np.hstack(pred) for pred in predicted])
        if data.domain.has_discrete_class:
            probabilities = np.array(
                [np.vstack(prob) for prob in probabilities])
        nrows = len(actual)
        nmodels = len(predicted)

        self.nrows = nrows
        self.row_indices = row_indices
        self.actual = actual
        self.predicted = predicted.reshape(nmodels, nrows)
        if data.domain.has_discrete_class:
            self.probabilities = probabilities
        self.call_callback(1)
445
|
|
|
|
446
|
|
|
|
447
|
|
|
class TestOnTestData(Results):
    """
    Test on a separate test data set.
    """
    def __init__(self, train_data, test_data, learners, store_data=False,
                 store_models=False, preprocessor=None, callback=None):
        super().__init__(test_data, len(learners), store_data=store_data,
                         store_models=store_models, preprocessor=preprocessor,
                         callback=callback)
        n_learners = len(learners)
        if self.store_models:
            self.models = [None] * n_learners

        self.row_indices = np.arange(len(test_data))
        self.actual = test_data.Y.flatten()

        # Only the training data is preprocessed; test data stays raw.
        if self.preprocessor is not None:
            train_data = self.preprocessor(train_data)

        for idx, algorithm in enumerate(learners):
            model = self.train_if_succ(idx, algorithm, train_data)
            self.call_callback(idx / n_learners)
            if not model:
                continue
            # The class type is read off the (possibly preprocessed)
            # training domain.
            domain = train_data.domain
            if domain.has_discrete_class:
                values, probs = model(test_data, model.ValueProbs)
                self.predicted[idx] = values
                self.probabilities[idx][:, :] = probs
            elif domain.has_continuous_class:
                self.predicted[idx] = model(test_data, model.Value)
            if self.store_models:
                self.models[idx] = model

        self.nrows = len(test_data)
        # All predictions form one single fold.
        self.folds = [slice(0, len(test_data))]
        self.call_callback(1)
483
|
|
|
|
484
|
|
|
|
485
|
|
|
def sample(table, n=0.7, stratified=False, replace=False,
           random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a data set from input data table that are not in the sample. Also
    uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If true, sampling will try to consider class values and
        match distribution of class values
        in train and test subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """

    if isinstance(n, float):
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            # Public constructor instead of the private np.random.mtrand path.
            rgen = np.random.RandomState(random_state)
        # BUG FIX: random_integers() was deprecated and then removed from
        # numpy; randint with an exclusive upper bound draws the same values.
        chosen = rgen.randint(0, len(table), n)
        mask = np.ones(len(table))
        mask[chosen] = 0
        others = np.nonzero(mask)[0]
        return table[chosen], table[others]

    # The scikit splitters are handed the size of the *complement*.
    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(), n_iter=1,
            test_size=test_size, train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(
            len(table), n_iter=1,
            test_size=n, random_state=random_state)
    ind = next(iter(ind))
    return table[ind[0]], table[ind[1]]
540
|
|
|
|
This can be caused by one of the following:
1. Missing dependencies
This error may indicate a Pylint configuration issue. Make sure your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error can also result from missing __init__.py files in your module folders. Make sure you place one such file in each sub-folder.