1
|
|
|
import csv |
2
|
|
|
import re |
3
|
|
|
import sys |
4
|
|
|
import pickle |
5
|
|
|
from itertools import chain |
6
|
|
|
|
7
|
|
|
import os |
8
|
|
|
from collections import namedtuple |
9
|
|
|
|
10
|
|
|
import bottlechest as bn |
|
|
|
|
11
|
|
|
import numpy as np |
|
|
|
|
12
|
|
|
from scipy import sparse |
|
|
|
|
13
|
|
|
# We are not loading openpyxl here since it takes some time |
14
|
|
|
|
15
|
|
|
from Orange.data import Domain |
16
|
|
|
from Orange.data.variable import * |
|
|
|
|
17
|
|
|
|
18
|
|
|
|
19
|
|
|
# A singleton simulated with a class |
20
|
|
|
class FileFormats:
    """Registry of file-format handler classes, keyed by file extension.

    Simulates a singleton: all state lives in class-level containers.
    Format classes register themselves with the :meth:`register` class
    decorator and are then discoverable through the dictionaries below.
    """
    formats = []        # all registered format classes, in registration order
    names = {}          # extension -> human-readable format name
    writers = {}        # extension -> class providing write_file
    readers = {}        # extension -> class providing read_file
    img_writers = {}    # extension -> class providing write_image
    graph_writers = {}  # extension -> class providing write_graph

    @classmethod
    def register(cls, name, extension):
        """Return a class decorator that registers a file-format class.

        :param name: human-readable name of the format
        :param extension: file extension, including the leading dot
        """
        def f(format):
            # Attach the display name to the registered format class itself.
            # (The original assigned to `cls.NAME`, i.e. to FileFormats,
            # which merely kept the name of the most recently registered
            # format and left the format classes unnamed.)
            format.NAME = name
            cls.formats.append(format)
            cls.names[extension] = name
            # Capability-based registration: a format advertises what it
            # can do simply by defining the corresponding method.
            if hasattr(format, "write_file"):
                cls.writers[extension] = format
            if hasattr(format, "read_file"):
                cls.readers[extension] = format
            if hasattr(format, "write_image"):
                cls.img_writers[extension] = format
            if hasattr(format, "write_graph"):
                cls.graph_writers[extension] = format
            return format

        return f
45
|
|
|
|
46
|
|
|
|
47
|
|
|
class FileReader:
    """Mixin with helpers for pre-scanning delimited text files."""

    def prescan_file(self, f, delim, nvars, disc_cols, cont_cols):
        """Scan the file and collect column statistics.

        For the discrete columns, gather the set of distinct string
        values; for the continuous columns, track the maximal number of
        decimal places seen.

        :param f: iterable of text lines
        :param delim: field delimiter to split each line on
        :param nvars: total number of variables (columns)
        :param disc_cols: indices of discrete columns
        :param cont_cols: indices of continuous columns
        :return: tuple (values, decimals); `values` is a list of `nvars`
            sets (paired positionally with `disc_cols`), `decimals[col]`
            is the largest decimal count seen in column `col`, or -1
        """
        values = [set() for _ in range(nvars)]
        decimals = [-1] * nvars
        for lne in f:
            lne = lne.split(delim)
            for vs, col in zip(values, disc_cols):
                # Accumulate distinct values into the set itself.
                # (The original wrote `vs[col].add(...)`, indexing a set,
                # which raises TypeError on the first line.)
                vs.add(lne[col])
            for col in cont_cols:
                val = lne[col]
                # Skip unknown-value markers before counting decimals.
                # (The original tested `col` -- an int index -- against a
                # set of strings, so the guard was always true.)
                if val not in Variable._DefaultUnknownStr and "." in val:
                    decs = len(val) - val.find(".") - 1
                    if decs > decimals[col]:
                        decimals[col] = decs
        return values, decimals
62
|
|
|
|
63
|
|
|
|
64
|
|
|
@FileFormats.register("Tab-delimited file", ".tab")
class TabDelimFormat:
    """Reader/writer for Orange's tab-delimited format with a three-line
    header (variable names, types, flags)."""

    # Matches runs of spaces NOT preceded by a backslash; "\ " escapes a
    # literal space inside flags and declared discrete values.
    non_escaped_spaces = re.compile(r"(?<!\\) +")

    def read_header(self, f):
        """Parse the three header lines and return the resulting Domain.

        Side effects: sets self.n_columns and the per-role column maps
        (attribute_columns, classvar_columns, meta_columns as lists of
        (column index, string-to-value converter) pairs) plus
        weight_column and basket_column (-1 when absent).

        :raises ValueError: on malformed or contradictory headers
        """
        f.seek(0)
        names = [x.strip() for x in f.readline().strip("\n\r").split("\t")]
        types = [x.strip() for x in f.readline().strip("\n\r").split("\t")]
        flags = [x.strip() for x in f.readline().strip("\n\r").split("\t")]
        self.n_columns = len(names)
        if len(types) != self.n_columns:
            raise ValueError("File contains %i variable names and %i types" %
                             (len(names), len(types)))
        if len(flags) > self.n_columns:
            raise ValueError("There are more flags than variables")
        else:
            # Pad so every column has (at least) an empty flag entry.
            flags += [""] * self.n_columns

        attributes = []
        class_vars = []
        metas = []

        self.attribute_columns = []
        self.classvar_columns = []
        self.meta_columns = []
        self.weight_column = -1
        self.basket_column = -1

        for col, (name, tpe, flag) in enumerate(zip(names, types, flags)):
            tpe = tpe.strip()
            flag = self.non_escaped_spaces.split(flag)
            flag = [f.replace("\\ ", " ") for f in flag]
            if "i" in flag or "ignore" in flag:
                continue
            if "b" in flag or "basket" in flag:
                self.basket_column = col
                continue
            is_class = "class" in flag
            is_meta = "m" in flag or "meta" in flag or tpe in ["s", "string"]
            is_weight = "w" in flag or "weight" in flag \
                or tpe in ["w", "weight"]

            # key=value entries among the flags become variable attributes.
            attrs = [f.split("=", 1) for f in flag if "=" in f]

            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col))
                self.weight_column = col
                continue

            if tpe in ["c", "continuous"]:
                var = ContinuousVariable.make(name)
            elif tpe in ["w", "weight"]:
                var = None
            elif tpe in ["d", "discrete"]:
                var = DiscreteVariable()  # no name to bypass caching
                var.name = name
                # Value order is unknown yet; fixed later in reorder_values.
                var.fix_order = True
            elif tpe in ["s", "string"]:
                var = StringVariable.make(name)
            else:
                # The type cell lists the allowed discrete values itself.
                values = [v.replace("\\ ", " ")
                          for v in self.non_escaped_spaces.split(tpe)]
                var = DiscreteVariable.make(name, values, True)
            var.attributes.update(attrs)

            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(name, col))
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))

        domain = Domain(attributes, class_vars, metas)
        return domain

    def count_lines(self, file):
        """Return the number of lines after the three header lines.
        Blank data lines are still counted here (and skipped later in
        read_data, which then shrinks the table)."""
        file.seek(0)
        i = -3
        for _ in file:
            i += 1
        return i

    def read_data(self, f, table):
        """Fill the preallocated `table` with the file's data rows.

        `table` must have been created via from_domain with count_lines
        rows; if blank lines reduced the actual row count, the arrays
        are resized in place at the end.
        """
        X, Y = table.X, table._Y
        # W is only used when the table actually carries weights.
        W = table.W if table.W.shape[-1] else None
        f.seek(0)
        # Skip the three header lines.
        f.readline()
        f.readline()
        f.readline()
        padding = [""] * self.n_columns
        if self.basket_column >= 0:
            # TODO how many columns?!
            table._Xsparse = sparse.lil_matrix(len(X), 100)
        table.metas = metas = (
            np.empty((len(X), len(self.meta_columns)), dtype=object))
        line_count = 0
        Xr = None
        for lne in f:
            values = lne
            if not values.strip():
                # Skip blank lines entirely.
                continue
            values = values.split("\t")
            if len(values) > self.n_columns:
                raise ValueError("Too many columns in line {}".
                                 format(4 + line_count))
            elif len(values) < self.n_columns:
                # Short rows are padded with empty strings.
                values += padding
            if self.attribute_columns:
                Xr = X[line_count]
                for i, (col, reader) in enumerate(self.attribute_columns):
                    Xr[i] = reader(values[col].strip())
            for i, (col, reader) in enumerate(self.classvar_columns):
                Y[line_count, i] = reader(values[col].strip())
            if W is not None:
                W[line_count] = float(values[self.weight_column])
            for i, (col, reader) in enumerate(self.meta_columns):
                metas[line_count, i] = reader(values[col].strip())
            line_count += 1
        if line_count != len(X):
            # Fewer rows than preallocated (blank lines were skipped):
            # drop local references first so ndarray.resize does not
            # refuse to shrink arrays that are still referenced.
            del Xr, X, Y, W, metas
            table.X.resize(line_count, len(table.domain.attributes))
            table._Y.resize(line_count, len(table.domain.class_vars))
            if table.W.ndim == 1:
                table.W.resize(line_count)
            else:
                table.W.resize((line_count, 0))
            table.metas.resize((line_count, len(self.meta_columns)))
        table.n_rows = line_count

    def reorder_values_array(self, arr, variables):
        """Remap the codes in `arr` for variables whose value order was
        deferred (fix_order); return the possibly replaced variables."""
        newvars = []
        for col, var in enumerate(variables):
            if getattr(var, "fix_order", False):
                # make() may return a cached variable with a different
                # (canonical) value order.
                nvar = var.make(var.name, var.values, var.ordered)
                nvar.attributes = var.attributes
                move = len(var.values)
                if nvar.values != var.values:
                    # Shift old codes out of range first to avoid
                    # collisions, then map each to its new index.
                    arr[:, col] += move
                    for i, val in enumerate(var.values):
                        bn.replace(arr[:, col], move + i, nvar.values.index(val))
                var = nvar
            newvars.append(var)
        return newvars

    def reorder_values(self, table):
        """Fix deferred value orders in X, Y and metas and rebuild the
        table's domain with the resulting variables."""
        attrs = self.reorder_values_array(table.X, table.domain.attributes)
        classes = self.reorder_values_array(table._Y, table.domain.class_vars)
        metas = self.reorder_values_array(table.metas, table.domain.metas)
        table.domain = Domain(attrs, classes, metas=metas)

    def read_file(self, filename, cls=None):
        """Read a .tab file and return a table of class `cls`
        (defaults to Orange.data.Table)."""
        with open(filename) as file:
            return self._read_file(file, cls)

    def _read_file(self, file, cls=None):
        # Imported here to avoid a circular import at module load time.
        from ..data import Table

        if cls is None:
            cls = Table
        domain = self.read_header(file)
        nExamples = self.count_lines(file)
        table = cls.from_domain(domain, nExamples, self.weight_column >= 0)
        self.read_data(file, table)
        self.reorder_values(table)
        return table

    @classmethod
    def _write_fast(cls, f, data):
        # Fast path: assumes numpy-backed storage with X, _Y and metas.
        wa = [var.str_val for var in data.domain.variables + data.domain.metas]
        for Xi, Yi, Mi in zip(data.X, data._Y, data.metas):
            f.write("\t".join(w(val) for val, w in zip(chain(Xi, Yi, Mi), wa)))
            f.write("\n")

    @classmethod
    def write_file(cls, filename, data):
        """
        Save data to file.

        Function uses fast implementation in case of numpy data, and slower
        fall-back for general storage.

        :param filename: the name of the file
        :type filename: str
        :param data: the data to be saved
        :type data: Orange.data.Storage
        """
        # Accept either a path or an already-open file-like object.
        if isinstance(filename, str):
            f = open(filename, "w")
        else:
            f = filename
        domain_vars = data.domain.variables + data.domain.metas
        # first line
        f.write("\t".join([str(j.name) for j in domain_vars]))
        f.write("\n")

        # second line
        # TODO Basket column.
        t = {"ContinuousVariable": "c", "DiscreteVariable": "d",
             "StringVariable": "string", "Basket": "basket"}

        f.write("\t".join([t[type(j).__name__] for j in domain_vars]))
        f.write("\n")

        # third line
        m = list(data.domain.metas)
        c = list(data.domain.class_vars)
        r = []
        for i in domain_vars:
            # key=value variable attributes, with spaces escaped.
            r1 = ["{}={}".format(k, v).replace(" ", "\\ ")
                  for k, v in i.attributes.items()]
            if i in m:
                r1.append("m")
            elif i in c:
                r1.append("class")
            r.append(" ".join(r1))
        f.write("\t".join(r))
        f.write("\n")

        # data
        # noinspection PyBroadException
        try:
            cls._write_fast(f, data)
        except:
            # Fall back to the slow per-row path for storages without
            # numpy arrays (deliberate best-effort catch-all).
            domain_vars = [data.domain.index(var) for var in domain_vars]
            for i in data:
                f.write("\t".join(str(i[j]) for j in domain_vars) + "\n")
        f.close()

    def write(self, filename, data):
        self.write_file(filename, data)
303
|
|
|
|
304
|
|
|
|
305
|
|
|
@FileFormats.register("Comma-separated file", ".csv")
class TxtFormat:
    """Reader/writer for plain delimited text files (CSV and friends)."""

    # Strings treated as missing values when sniffing the header and
    # passed to numpy.genfromtxt.
    MISSING_VALUES = frozenset({"", "NA", "?"})

    @staticmethod
    def read_header(file, delimiter=None):
        """Sniff the delimiter and whether a header line is present.

        All variables are created as continuous.

        :return: tuple (domain, header_lines, delimiter), where
            `header_lines` is 0 or 1 and `delimiter` may be None
            (meaning: split on any whitespace)
        """
        first_line = file.readline()
        file.seek(0)
        if delimiter is None:
            # Try candidate delimiters in priority order.
            for delimiter in "\t,; ":
                if delimiter in first_line:
                    break
            else:
                delimiter = None
        if delimiter == " ":
            # str.split(None) splits on any whitespace run.
            delimiter = None
        atoms = first_line.split(delimiter)
        try:
            # If every atom parses as a number, there is no header line
            # and names are generated.
            [float(atom) for atom in set(atoms) - TxtFormat.MISSING_VALUES]
            header_lines = 0
            names = ["Var{:04}".format(i + 1) for i in range(len(atoms))]
        except ValueError:
            names = [atom.strip() for atom in atoms]
            header_lines = 1
        domain = Domain([ContinuousVariable.make(name) for name in names])
        return domain, header_lines, delimiter

    def read_file(self, filename, cls=None):
        """Read a delimited text file into a table of class `cls`
        (defaults to Orange.data.Table) via numpy.genfromtxt."""
        # Imported here to avoid a circular import at module load time.
        from ..data import Table

        if cls is None:
            cls = Table
        with open(filename, "rt") as file:
            domain, header_lines, delimiter = self.read_header(file)
        # genfromtxt wants bytes input; reopen in binary mode.
        with open(filename, "rb") as file:
            arr = np.genfromtxt(file, delimiter=delimiter,
                                skip_header=header_lines,
                                missing_values=self.MISSING_VALUES)
        table = cls.from_numpy(domain, arr)
        return table

    @classmethod
    def csv_saver(cls, filename, data, delimiter='\t'):
        """Write `data` as delimited text.

        Only the tab-delimited variant writes the additional type and
        flag header rows; other delimiters get names + data only.
        """
        with open(filename, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=delimiter)
            all_vars = data.domain.variables + data.domain.metas
            writer.writerow([v.name for v in all_vars])  # write variable names
            if delimiter == '\t':
                # Role flags per column: class variables and metas are
                # marked; plain attributes get an empty flag.
                flags = ([''] * len(data.domain.attributes)) + \
                        (['class'] * len(data.domain.class_vars)) + \
                        (['m'] * len(data.domain.metas))

                for i, var in enumerate(all_vars):
                    # key=value variable attributes with escaped spaces.
                    attrs = ["{0!s}={1!s}".format(*item).replace(" ", "\\ ")
                             for item in var.attributes.items()]
                    if attrs:
                        flags[i] += (" " if flags[i] else "") + (" ".join(attrs))

                writer.writerow([type(v).__name__.replace("Variable", "").lower()
                                 for v in all_vars])  # write variable types
                writer.writerow(flags)  # write flags
            for ex in data:  # write examples
                writer.writerow(ex)

    @classmethod
    def write_file(cls, filename, data):
        """Write `data` as a comma-separated file."""
        cls.csv_saver(filename, data, ',')

    def write(self, filename, data):
        self.write_file(filename, data)
375
|
|
|
|
376
|
|
|
|
377
|
|
|
@FileFormats.register("Basket file", ".basket")
class BasketFormat:
    """Reader for sparse "basket" files, parsed by the C extension
    Orange.data._io."""

    @classmethod
    def read_file(cls, filename, storage_class=None):
        """Read `filename` and return an instance of `storage_class`
        (defaults to Orange.data.Table)."""
        from Orange.data import _io

        if storage_class is None:
            from ..data import Table as storage_class

        def constr_vars(inds):
            # Build continuous variables sorted by their column index.
            # NOTE(review): implicitly returns None (not []) when `inds`
            # is empty -- the `attrs and X`-style expressions below rely
            # on that falsiness.
            if inds:
                return [ContinuousVariable(x.decode("utf-8")) for _, x in
                        sorted((ind, name) for name, ind in inds.items())]

        X, Y, metas, attr_indices, class_indices, meta_indices = \
            _io.sparse_read_float(filename.encode(sys.getdefaultencoding()))

        attrs = constr_vars(attr_indices)
        classes = constr_vars(class_indices)
        meta_attrs = constr_vars(meta_indices)
        domain = Domain(attrs, classes, meta_attrs)
        # NOTE(review): the last argument evaluates to `meta_attrs` (the
        # variable list) rather than `metas` (the data array) whenever
        # `metas` is truthy -- this looks like swapped operands
        # (`meta_attrs and metas` would mirror the other two); confirm
        # against Table.from_numpy before relying on meta data here.
        return storage_class.from_numpy(
            domain, attrs and X, classes and Y, metas and meta_attrs)
400
|
|
|
|
401
|
|
|
|
402
|
|
|
@FileFormats.register("Excel file", ".xlsx")
class ExcelFormat:
    """Reader for Excel .xlsx workbooks via openpyxl.

    Three header layouts are tried in order: Orange's three-line header
    (read_header_3), no header at all (read_header_0), and a single
    header line with optional "flags#name" cells (read_header_1).
    """

    # Matches runs of spaces NOT preceded by a backslash ("\ " escapes
    # a literal space inside flags and declared discrete values).
    non_escaped_spaces = re.compile(r"(?<!\\) +")

    def __init__(self):
        # Per-role lists of (column index, string-to-value converter).
        self.attribute_columns = []
        self.classvar_columns = []
        self.meta_columns = []
        self.weight_column = -1
        self.basket_column = -1

        self.n_columns = self.first_data_row = 0

    def open_workbook(self, f):
        """Open the workbook and return the worksheet to read.

        `f` may be a "path:sheetname" string to select a sheet.
        Also sets self.n_columns.
        """
        from openpyxl import load_workbook

        # f[2:] skips a possible Windows drive-letter colon ("C:...").
        if isinstance(f, str) and ":" in f[2:]:
            f, sheet = f.rsplit(":", 1)
        else:
            sheet = None
        wb = load_workbook(f, use_iterators=True,
                           read_only=True, data_only=True)
        ws = wb.get_sheet_by_name(sheet) if sheet else wb.get_active_sheet()
        self.n_columns = ws.get_highest_column()
        return ws

    # noinspection PyBroadException
    def read_header_3(self, worksheet):
        """Try to parse an Orange-style three-line header.

        :return: the Domain on success, False if the sheet's first three
            rows do not look like names/types/flags
        """
        cols = self.n_columns
        try:
            names, types, flags = [
                [cell.value.strip() if cell.value is not None else ""
                 for cell in row]
                for row in worksheet.get_squared_range(1, 1, cols, 3)]
        except:
            # Any failure (e.g. non-string cells) means "not this layout".
            return False
        # Rows 2 and 3 must contain only known type/flag tokens (or
        # space-separated value lists / key=value pairs).
        if not (all(tpe in ("", "c", "d", "s", "continuous", "discrete",
                            "string", "w", "weight") or " " in tpe
                    for tpe in types) and
                all(flg in ("", "i", "ignore", "m", "meta", "w", "weight",
                            "b", "basket", "class") or "=" in flg
                    for flg in flags)):
            return False
        attributes = []
        class_vars = []
        metas = []
        for col, (name, tpe, flag) in enumerate(zip(names, types, flags)):
            flag = self.non_escaped_spaces.split(flag)
            if "i" in flag or "ignore" in flag:
                continue
            if "b" in flag or "basket" in flag:
                self.basket_column = col
                continue
            is_class = "class" in flag
            is_meta = "m" in flag or "meta" in flag or tpe in ["s", "string"]
            is_weight = "w" in flag or "weight" in flag \
                or tpe in ["w", "weight"]
            # key=value entries among the flags become variable attributes.
            attrs = [f.split("=", 1) for f in flag if "=" in f]
            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col + 1))
                self.weight_column = col
                continue
            if tpe in ["c", "continuous"]:
                var = ContinuousVariable.make(name)
            elif tpe in ["w", "weight"]:
                var = None
            elif tpe in ["d", "discrete"]:
                var = DiscreteVariable.make(name)
                # Value order is resolved later in reorder_values.
                var.fix_order = True
            elif tpe in ["s", "string"]:
                var = StringVariable.make(name)
            else:
                # The type cell lists the allowed discrete values itself.
                values = [v.replace("\\ ", " ")
                          for v in self.non_escaped_spaces.split(tpe)]
                var = DiscreteVariable.make(name, values, True)
            var.attributes.update(attrs)
            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(name, col))
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))
        self.first_data_row = 4
        return Domain(attributes, class_vars, metas)

    # noinspection PyBroadException
    def read_header_0(self, worksheet):
        """Headerless layout: succeeds only if the first row is entirely
        numeric; every column becomes a continuous variable with a
        generated name."""
        try:
            [float(cell.value) if cell.value is not None else None
             for cell in
             worksheet.get_squared_range(1, 1, self.n_columns, 3).__next__()]
        except:
            # Non-numeric cell => not this layout.
            return False
        self.first_data_row = 1
        attrs = [ContinuousVariable.make("Var{:04}".format(i + 1))
                 for i in range(self.n_columns)]
        self.attribute_columns = [(i, var.val_from_str_add)
                                  for i, var in enumerate(attrs)]
        return Domain(attrs)

    def read_header_1(self, worksheet):
        """Single header line; cells may be "flags#name". Types are taken
        from the flags or guessed from the second row's cells."""
        import openpyxl.cell.cell

        if worksheet.get_highest_column() < 2 or \
                worksheet.get_highest_row() < 2:
            return False
        cols = self.n_columns
        names = [cell.value.strip() if cell.value is not None else ""
                 for cell in
                 worksheet.get_squared_range(1, 1, cols, 3).__next__()]
        # First data row, used for type guessing below.
        row2 = list(worksheet.get_squared_range(1, 2, cols, 3).__next__())
        attributes = []
        class_vars = []
        metas = []
        for col, name in enumerate(names):
            if "#" in name:
                flags, name = name.split("#", 1)
            else:
                flags = ""
            if "i" in flags:
                continue
            if "b" in flags:
                self.basket_column = col
                continue
            is_class = "c" in flags
            is_meta = "m" in flags or "s" in flags
            is_weight = "W" in flags or "w" in flags
            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col))
                self.weight_column = col
                continue
            if "C" in flags:
                var = ContinuousVariable.make(name)
            elif is_weight:
                var = None
            elif "D" in flags:
                var = DiscreteVariable.make(name)
                var.fix_order = True
            elif "S" in flags:
                var = StringVariable.make(name)
            elif row2[col].data_type == "n":
                # Numeric cell in the first data row => continuous.
                var = ContinuousVariable.make(name)
            else:
                # Guess: many distinct strings => string meta, otherwise
                # discrete with a deferred value order.
                if len(set(row[col].value for row in worksheet.rows)) > 20:
                    var = StringVariable.make(name)
                    is_meta = True
                else:
                    var = DiscreteVariable.make(name)
                    var.fix_order = True
            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(
                            name, openpyxl.cell.cell.get_column_letter(col + 1))
                    )
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))
        if attributes and not class_vars:
            # Convention: the last attribute becomes the class variable.
            class_vars.append(attributes.pop(-1))
            self.classvar_columns.append(self.attribute_columns.pop(-1))
        self.first_data_row = 2
        return Domain(attributes, class_vars, metas)

    def read_header(self, worksheet):
        """Try the three header layouts in turn and return the Domain.

        :raises ValueError: if no layout matches
        """
        domain = self.read_header_3(worksheet) or \
            self.read_header_0(worksheet) or \
            self.read_header_1(worksheet)
        if domain is False:
            raise ValueError("Invalid header")
        return domain

    # noinspection PyPep8Naming,PyProtectedMember
    def read_data(self, worksheet, table):
        """Fill the preallocated `table` with the worksheet's data rows;
        rows that are entirely empty are skipped and the arrays are
        shrunk afterwards if needed."""
        X, Y = table.X, table._Y
        # W is only used when the table actually carries weights.
        W = table.W if table.W.shape[-1] else None
        if self.basket_column >= 0:
            # TODO how many columns?!
            table._Xsparse = sparse.lil_matrix(len(X), 100)
        table.metas = metas = (
            np.empty((len(X), len(self.meta_columns)), dtype=object))
        sheet_rows = worksheet.rows
        # Skip the header rows.
        for _ in range(1, self.first_data_row):
            sheet_rows.__next__()
        line_count = 0
        Xr = None
        for row in sheet_rows:
            values = [cell.value for cell in row]
            if all(value is None for value in values):
                # Entirely empty row: skip.
                continue
            if self.attribute_columns:
                Xr = X[line_count]
                for i, (col, reader) in enumerate(self.attribute_columns):
                    v = values[col]
                    Xr[i] = reader(v.strip() if isinstance(v, str) else v)
            for i, (col, reader) in enumerate(self.classvar_columns):
                v = values[col]
                Y[line_count, i] = reader(
                    v.strip() if isinstance(v, str) else v)
            if W is not None:
                W[line_count] = float(values[self.weight_column])
            for i, (col, reader) in enumerate(self.meta_columns):
                v = values[col]
                metas[line_count, i] = reader(
                    v.strip() if isinstance(v, str) else v)
            line_count += 1
        if line_count != len(X):
            # Drop local references first so ndarray.resize does not
            # refuse to shrink arrays that are still referenced.
            del Xr, X, Y, W, metas
            table.X.resize(line_count, len(table.domain.attributes))
            table._Y.resize(line_count, len(table.domain.class_vars))
            if table.W.ndim == 1:
                table.W.resize(line_count)
            else:
                table.W.resize((line_count, 0))
            table.metas.resize((line_count, len(self.meta_columns)))
        table.n_rows = line_count

    # noinspection PyUnresolvedReferences
    @staticmethod
    def reorder_values_array(arr, variables):
        """In-place remap of value codes for variables whose value order
        was deferred (fix_order)."""
        for col, var in enumerate(variables):
            if getattr(var, "fix_order", False) and len(var.values) < 1000:
                new_order = var.ordered_values(var.values)
                if new_order == var.values:
                    continue
                # Shift codes by 1000 first to avoid collisions, then map
                # each old code to its index in the new order.
                arr[:, col] += 1000
                for i, val in enumerate(var.values):
                    bn.replace(arr[:, col], 1000 + i, new_order.index(val))
                var.values = new_order
                delattr(var, "fix_order")

    # noinspection PyProtectedMember
    def reorder_values(self, table):
        """Apply reorder_values_array to X, Y and metas."""
        self.reorder_values_array(table.X, table.domain.attributes)
        self.reorder_values_array(table._Y, table.domain.class_vars)
        self.reorder_values_array(table.metas, table.domain.metas)

    def read_file(self, file, cls=None):
        """Read an .xlsx file (optionally "path:sheet") and return a
        table of class `cls` (defaults to Orange.data.Table)."""
        from Orange.data import Table

        if cls is None:
            cls = Table
        worksheet = self.open_workbook(file)
        domain = self.read_header(worksheet)
        table = cls.from_domain(
            domain,
            worksheet.get_highest_row() - self.first_data_row + 1,
            self.weight_column >= 0)
        self.read_data(worksheet, table)
        self.reorder_values(table)
        return table
670
|
|
|
|
671
|
|
|
|
672
|
|
|
@FileFormats.register("Pickled table", ".pickle") |
673
|
|
|
class PickleFormat:
    """Load and store tables with Python's pickle protocol."""

    @classmethod
    def read_file(cls, file, _=None):
        """Unpickle and return the object stored in `file`.

        The second argument (the table class used by other readers) is
        accepted for interface compatibility and ignored.
        """
        with open(file, "rb") as stream:
            return pickle.load(stream)

    @classmethod
    def write_file(cls, filename, table):
        """Pickle `table` into the file named `filename`."""
        with open(filename, "wb") as stream:
            pickle.dump(table, stream)

    def write(self, filename, table):
        """Instance-level alias for :meth:`write_file`."""
        self.write_file(filename, table)
686
|
|
|
|
687
|
|
|
|
688
|
|
|
@FileFormats.register("Dot Tree File", ".dot")
class DotFormat:
    """Export scikit-learn decision trees to GraphViz .dot files."""

    @classmethod
    def write_graph(cls, filename, graph):
        """Write `graph` (a fitted sklearn tree) to `filename` in
        GraphViz dot format."""
        # Imported lazily so sklearn is only required when exporting.
        from sklearn import tree

        tree.export_graphviz(graph, out_file=filename)

    def write(self, filename, tree):
        """Write a tree to `filename`.

        Accepts either the tree object itself or a dict carrying the
        tree under the 'tree' key.
        """
        # isinstance instead of an exact type comparison, so dict
        # subclasses are unwrapped as well.
        if isinstance(tree, dict):
            tree = tree['tree']
        self.write_graph(filename, tree)
700
|
|
|
|
701
|
|
|
|
702
|
|
|
@FileFormats.register("Fixed width textfile", ".fixed") |
703
|
|
|
class FixedWidthFormat(TabDelimFormat): |
704
|
|
|
""" |
705
|
|
|
FixedWidthFormat reads tables from files where the columns have a |
706
|
|
|
fixed width. The cells are space-padded to the left. |
707
|
|
|
See datasets/glass.fixed and tests/test_fixedwidth_reader.py |
708
|
|
|
|
|
|
|
|
709
|
|
|
It is possible to determine the exact cell location of a specific |
710
|
|
|
table cell within the file because of the fixed width columns. |
711
|
|
|
This allows the FixedWidthFormat to be used with the LazyFile |
712
|
|
|
widget to 'read' extremely large files. |
713
|
|
|
|
|
|
|
|
714
|
|
|
TODO: |
715
|
|
|
- Add read_row() without reading entire file. |
716
|
|
|
- Allow spaces in column names and cell values. |
717
|
|
|
- Ensure compatibility with all tables in the tests directory. |
718
|
|
|
- Do metas and class properly. |
719
|
|
|
""" |
720
|
|
|
    def read_ends_columns(self, filename):
        """
        Returns the location where each column ends in a line in the
        file.
        TODO:
        - Cleanup.
        """
        # Per-column metadata: header fields plus character positions.
        ColumnInfo = namedtuple(
            'ColumnInfo',
            ['name', 'tpe', 'flag', 'start', 'end', 'width', 'index'],
        )
        with open(filename) as f:
            f.seek(0)
            l_names = f.readline()
            l_types = f.readline()
            l_flags = f.readline()
        types = l_types.split()
        ends = []
        for n in types:
            # Search starts at the previous column's end so repeated type
            # tokens are matched in order; the line is padded with spaces
            # so a token at the very start/end of the line still matches.
            position_start = ends[-1] if len(ends) else 0
            end = (" "+l_types.replace("\n"," ")).find(" "+n+" ", position_start) + len(n)
            ends.append(end)
        # Slice the name and flag header lines at the computed column
        # boundaries; starts are the previous column's end (0 for the
        # first column).
        info_columns = [
            ColumnInfo(
                name=l_names[start:end].strip(),
                flag=l_flags[start:end].strip(),
                tpe=tpe,
                start=start,
                end=end,
                width=end-start,
                index=inde,
            ) for (inde, (start, end, tpe)) in enumerate(zip(
                [0] + ends[:-1],
                ends,
                types,
            ))
        ]
        return info_columns
758
|
|
|
|
|
|
|
|
759
|
|
|
    def read_header(self, filename):
        """
        Reads the header of the fixed width file and returns the
        Domain of the table.

        TODO:
        - Use read_ends_columns() to determine the width of the
          columns and use that to parse the lines, because this
          will allow the use of spaces in column names.
        """
        # Column names/types/flags derived from the fixed column widths.
        ends = self.read_ends_columns(filename)
        names = [end.name for end in ends]
        types = [end.tpe for end in ends]
        flags = [end.flag for end in ends]
        with open(filename) as f:
            # Function based on read_header from TabDelimReader.
            f.seek(0)
            #names = f.readline().strip("\n\r").split()
            #types = f.readline().strip("\n\r").split()
            #flags = f.readline().strip("\n\r").split()
            # Skip the three header lines (already parsed above).
            f.readline()
            f.readline()
            f.readline()
            # Changed split on "\t" to split on spaces.
            self.n_columns = len(names)
            if len(types) != self.n_columns:
                raise ValueError("File contains %i variable names and %i types" %
                                 (len(names), len(types)))
            if len(flags) > self.n_columns:
                raise ValueError("There are more flags than variables")
            else:
                # Pad so every column has (at least) an empty flag entry.
                flags += [""] * self.n_columns
            attributes = []
            class_vars = []
            metas = []
            # Per-role lists of (column index, string-to-value converter).
            self.attribute_columns = []
            self.classvar_columns = []
            self.meta_columns = []
            self.weight_column = -1
            self.basket_column = -1
            for col, (name, tpe, flag) in enumerate(zip(names, types, flags)):
                tpe = tpe.strip()
                flag = flag.split()
                if "i" in flag or "ignore" in flag:
                    continue
                if "b" in flag or "basket" in flag:
                    self.basket_column = col
                    continue
                is_class = "class" in flag
                is_meta = "m" in flag or "meta" in flag or tpe in ["s", "string"]
                is_weight = "w" in flag or "weight" in flag \
                    or tpe in ["w", "weight"]
                if is_weight:
                    if is_class:
                        raise ValueError("Variable {} (column {}) is marked as "
                                         "class and weight".format(name, col))
                    self.weight_column = col
                    continue
                if tpe in ["c", "continuous"]:
                    var = ContinuousVariable.make(name)
                elif tpe in ["w", "weight"]:
                    var = None
                elif tpe in ["d", "discrete"]:
                    var = DiscreteVariable.make(name)
                elif tpe in ["s", "string"]:
                    var = StringVariable.make(name)
                else:
                    # The type field lists the allowed discrete values.
                    values = [v.replace("\\ ", " ")
                              for v in self.non_escaped_spaces.split(tpe)]
                    var = DiscreteVariable.make(name, values, True)
                # Only discrete variables with no declared values need
                # their value order fixed after reading.
                var.fix_order = (isinstance(var, DiscreteVariable)
                                 and not var.values)
                if is_class:
                    if is_meta:
                        raise ValueError(
                            "Variable {} (column {}) is marked as "
                            "class and meta attribute".format(name, col))
                    class_vars.append(var)
                    self.classvar_columns.append((col, var.val_from_str_add))
                elif is_meta:
                    metas.append(var)
                    self.meta_columns.append((col, var.val_from_str_add))
                else:
                    attributes.append(var)
                    self.attribute_columns.append((col, var.val_from_str_add))
            domain = Domain(attributes, class_vars, metas)
            return domain
846
|
|
|
def count_lines(self, filename):
    """
    Count the number of data lines in the file.

    The file has fixed-width columns, so every line has the same
    length and the line count can be derived from the file size
    without reading the entire file.  The first three lines
    (names, types, flags) are headers and are excluded from the
    count.

    :param filename: path of the fixed-width file
    :return: number of data rows (0 for an empty file)
    """
    len_file = os.stat(filename).st_size
    with open(filename) as f:
        # The first line's length is representative of every line.
        len_line = len(f.readline())
    if not len_line:
        # Empty file: no header, no data (and avoid dividing by zero).
        return 0
    # NOTE(review): st_size is in bytes while len() counts characters;
    # this matches only for single-byte encodings without newline
    # translation -- same assumption as the original code.
    # Integer division, minus the three header lines.
    return len_file // len_line - 3
def read_cell(self, filename, index_row, name_attribute):
    """
    Read one specific cell value without reading the entire file.

    Because the columns have fixed widths, the cell's offset is
    computed directly: (3 header lines + index_row) * line length
    + the column's start position.

    :param filename: path of the fixed-width file
    :param index_row: zero-based data-row index
    :param name_attribute: name of the column to read
    :return: the parsed value, or None when no reader matches

    TODO:
    - Test with discrete and class attributes.
    - Cache the header information.
    """
    info_columns = self.read_ends_columns(filename)
    # Called for its side effects: read_header fills in
    # self.attribute_columns / self.classvar_columns, which are
    # needed below to pick the right value parser.
    self.read_header(filename)
    # Line length is the sum of all column widths plus the newline.
    len_line = sum(ic.width for ic in info_columns) + 1  # for \n
    # Assumes name_attribute exists; raises IndexError otherwise
    # (same behavior as before).
    col = [ic for ic in info_columns if ic.name == name_attribute][0]
    with open(filename) as f:
        # Skip the three header lines, then seek into the row.
        f.seek((3 + index_row) * len_line + col.start)
        value = f.read(col.width)
    value_n = None
    # Parse the string with the column's own reader.  This is a
    # kludge based on code from read_data().  Attribute and class
    # columns are disjoint, so at most one reader matches.
    for coli, reader in chain(self.attribute_columns,
                              self.classvar_columns):
        if coli == col.index:
            value_n = reader(value.strip())
    return value_n
def read_data(self, filename, table):
    """
    Read the data portion of the file into `table`.

    The three header lines are skipped; each remaining non-blank
    line is split on whitespace and parsed into the table's X, Y,
    W and metas arrays by the column readers collected in
    read_header.

    This function is based on the one in TabDelimFormat.
    TODO:
    - Use the actual known width of the columns instead
      of splitting on space, because that will allow spaces
      to be part of the cell values.
      That is, use read_ends_columns.
    """
    with open(filename) as f:
        X, Y = table.X, table._Y
        # A zero-width W column means the table carries no weights.
        W = table.W if table.W.shape[-1] else None
        f.seek(0)
        # Skip the three header lines (names, types, flags).
        f.readline()
        f.readline()
        f.readline()
        padding = [""] * self.n_columns
        if self.basket_column >= 0:
            # TODO how many columns?!
            # BUGFIX: lil_matrix takes the shape as a single tuple;
            # passing two positional ints made the second one be
            # (mis)used as the dtype argument.
            table._Xsparse = sparse.lil_matrix((len(X), 100))
        table.metas = metas = (
            np.empty((len(X), len(self.meta_columns)), dtype=object))
        line_count = 0
        Xr = None
        for lne in f:
            values = lne.strip()
            if not values:
                continue
            # Only difference with TabDelimReader: split on any
            # whitespace instead of on "\t".
            values = values.split()
            if len(values) > self.n_columns:
                raise ValueError("Too many columns in line {}".
                                 format(4 + line_count))
            elif len(values) < self.n_columns:
                values += padding
            if self.attribute_columns:
                Xr = X[line_count]
                for i, (col, reader) in enumerate(self.attribute_columns):
                    Xr[i] = reader(values[col].strip())
            for i, (col, reader) in enumerate(self.classvar_columns):
                Y[line_count, i] = reader(values[col].strip())
            if W is not None:
                W[line_count] = float(values[self.weight_column])
            for i, (col, reader) in enumerate(self.meta_columns):
                metas[line_count, i] = reader(values[col].strip())
            line_count += 1
    if line_count != len(X):
        # Fewer lines than pre-allocated: shrink in place.  Drop the
        # local references first so ndarray.resize's reference check
        # does not fail.
        del Xr, X, Y, W, metas
        table.X.resize(line_count, len(table.domain.attributes))
        table.Y.resize(line_count, len(table.domain.class_vars))
        if table.W.ndim == 1:
            table.W.resize(line_count)
        else:
            table.W.resize((line_count, 0))
        table.metas.resize((line_count, len(self.meta_columns)))
    table.n_rows = line_count
def read_file(self, filename, cls=None):
    """
    Read a whole file and return it as a table of class `cls`.

    The distinction between read_file and _read_file cannot
    be made because we cannot get the length of a stream etc.
    """
    from ..data import Table
    table_class = Table if cls is None else cls
    # Header first: it builds the domain and records which columns
    # feed attributes, class variables, metas and weights.
    domain = self.read_header(filename)
    n_examples = self.count_lines(filename)
    has_weights = self.weight_column >= 0
    table = table_class.from_domain(domain, n_examples, has_weights)
    self.read_data(filename, table)
    self.reorder_values(table)
    return table
# NOTE(review): the text below was an accidentally pasted Pylint/IDE help
# message, not source code; it is kept as a comment so the file parses.
# This can be caused by one of the following:
# 1. Missing dependencies: this error could indicate a Pylint
#    configuration issue. Make sure that your libraries are available
#    by adding the necessary commands.
# 2. Missing __init__.py files: this error could also result from
#    missing __init__.py files in your module folders. Make sure that
#    you place one file in each sub-folder.