|
1
|
|
|
import csv |
|
2
|
|
|
import re |
|
3
|
|
|
import sys |
|
4
|
|
|
import pickle |
|
5
|
|
|
from itertools import chain |
|
6
|
|
|
|
|
7
|
|
|
import os |
|
8
|
|
|
from collections import namedtuple |
|
9
|
|
|
|
|
10
|
|
|
import bottlechest as bn |
|
|
|
|
|
|
11
|
|
|
import numpy as np |
|
|
|
|
|
|
12
|
|
|
from scipy import sparse |
|
|
|
|
|
|
13
|
|
|
# We are not loading openpyxl here since it takes some time |
|
14
|
|
|
|
|
15
|
|
|
from Orange.data import Domain |
|
16
|
|
|
from Orange.data.variable import * |
|
|
|
|
|
|
17
|
|
|
|
|
18
|
|
|
|
|
19
|
|
|
# A singleton simulated with a class |
|
20
|
|
|
class FileFormats:
    """Registry of file-format classes, keyed by file-name extension.

    A singleton simulated with a class: all state lives in class-level
    dictionaries, populated by the ``register`` class decorator.
    """

    formats = []        # every registered format class, in registration order
    names = {}          # extension -> human-readable format name
    writers = {}        # extension -> format class providing write_file()
    readers = {}        # extension -> format class providing read_file()
    img_writers = {}    # extension -> format class providing write_image()
    graph_writers = {}  # extension -> format class providing write_graph()

    @classmethod
    def register(cls, name, extension):
        """Return a class decorator registering a format under `extension`.

        The decorated class is filed into the reader/writer registries
        according to which of write_file/read_file/write_image/write_graph
        it defines, and is returned unchanged apart from a NAME attribute.
        """
        def f(format):
            # Tag the format class itself with its display name.
            # (Was `cls.NAME = name`, which set a single shared attribute
            # on FileFormats that every later registration clobbered.)
            format.NAME = name
            cls.formats.append(format)
            cls.names[extension] = name
            if hasattr(format, "write_file"):
                cls.writers[extension] = format
            if hasattr(format, "read_file"):
                cls.readers[extension] = format
            if hasattr(format, "write_image"):
                cls.img_writers[extension] = format
            if hasattr(format, "write_graph"):
                cls.graph_writers[extension] = format
            return format

        return f
|
45
|
|
|
|
|
46
|
|
|
|
|
47
|
|
|
class FileReader:
    def prescan_file(self, f, delim, nvars, disc_cols, cont_cols):
        """Scan the data lines of `f` to profile its columns.

        Collects the distinct string values seen for discrete columns and
        the maximum number of decimals seen in continuous columns.

        :param f: iterable of data lines (any header already consumed)
        :param delim: field delimiter to split each line on
        :param nvars: total number of variables (columns)
        :param disc_cols: indices of discrete columns
        :param cont_cols: indices of continuous columns
        :return: ``(values, decimals)`` — a list of sets of strings and a
            list of ints (-1 where no decimal point was seen)
        """
        values = [set() for _ in range(nvars)]
        decimals = [-1] * nvars
        for lne in f:
            lne = lne.split(delim)
            # Record the raw string seen in each discrete column.
            # (Was `vs[col].add(lne[col])`, which subscripted the set and
            # raised TypeError on the first line.)
            for vs, col in zip(values, disc_cols):
                vs.add(lne[col])
            for col in cont_cols:
                val = lne[col]
                # Count decimals only for real values containing a point.
                # (Was `not col in Variable._DefaultUnknownStr`, comparing
                # the integer column index — not the value — against the
                # unknown-value markers.)
                if val not in Variable._DefaultUnknownStr and "." in val:
                    decs = len(val) - val.find(".") - 1
                    if decs > decimals[col]:
                        decimals[col] = decs
        return values, decimals
|
62
|
|
|
|
|
63
|
|
|
|
|
64
|
|
|
@FileFormats.register("Tab-delimited file", ".tab")
class TabDelimFormat:
    """Reader/writer for Orange's native tab-delimited format, whose
    files start with a three-line header: names, types, flags."""

    # Splits flag/value strings on runs of spaces not escaped as "\ ".
    non_escaped_spaces = re.compile(r"(?<!\\) +")

    def read_header(self, f):
        """Parse the three header lines of `f` and return the Domain.

        Side effects: records n_columns and, per role, the
        (column index, string-to-value converter) pairs in
        attribute_columns / classvar_columns / meta_columns, plus
        weight_column and basket_column (-1 when absent).
        """
        f.seek(0)
        names = [x.strip() for x in f.readline().strip("\n\r").split("\t")]
        types = [x.strip() for x in f.readline().strip("\n\r").split("\t")]
        flags = [x.strip() for x in f.readline().strip("\n\r").split("\t")]
        self.n_columns = len(names)
        if len(types) != self.n_columns:
            raise ValueError("File contains %i variable names and %i types" %
                             (len(names), len(types)))
        if len(flags) > self.n_columns:
            raise ValueError("There are more flags than variables")
        else:
            # Over-padding is harmless: zip() below truncates to names.
            flags += [""] * self.n_columns

        attributes = []
        class_vars = []
        metas = []

        self.attribute_columns = []
        self.classvar_columns = []
        self.meta_columns = []
        self.weight_column = -1
        self.basket_column = -1

        for col, (name, tpe, flag) in enumerate(zip(names, types, flags)):
            tpe = tpe.strip()
            # NOTE(review): the comprehension variable `f` shadows the
            # file argument; harmless here since the file is not read
            # again in this method.
            flag = self.non_escaped_spaces.split(flag)
            flag = [f.replace("\\ ", " ") for f in flag]
            if "i" in flag or "ignore" in flag:
                continue
            if "b" in flag or "basket" in flag:
                self.basket_column = col
                continue
            is_class = "class" in flag
            is_meta = "m" in flag or "meta" in flag or tpe in ["s", "string"]
            is_weight = "w" in flag or "weight" in flag \
                or tpe in ["w", "weight"]

            # key=value entries in the flag column become var.attributes.
            attrs = [f.split("=", 1) for f in flag if "=" in f]

            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col))
                self.weight_column = col
                continue

            if tpe in ["c", "continuous"]:
                var = ContinuousVariable.make(name)
            elif tpe in ["w", "weight"]:
                var = None
            elif tpe in ["d", "discrete"]:
                var = DiscreteVariable()  # no name to bypass caching
                var.name = name
                # Values get remapped to canonical order after reading;
                # see reorder_values / reorder_values_array.
                var.fix_order = True
            elif tpe in ["s", "string"]:
                var = StringVariable.make(name)
            else:
                # Any other type string is the space-separated list of
                # the discrete variable's values (escaped spaces kept).
                values = [v.replace("\\ ", " ")
                          for v in self.non_escaped_spaces.split(tpe)]
                var = DiscreteVariable.make(name, values, True)
            var.attributes.update(attrs)

            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(name, col))
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))

        domain = Domain(attributes, class_vars, metas)
        return domain

    def count_lines(self, file):
        """Return the number of data lines: total lines minus the
        three-line header."""
        file.seek(0)
        i = -3  # compensate for the three header lines
        for _ in file:
            i += 1
        return i

    def read_data(self, f, table):
        """Fill the pre-allocated `table` from the data lines of `f`,
        then shrink it in place if blank lines were skipped."""
        X, Y = table.X, table._Y
        # Weights only when the table carries a non-empty W array.
        W = table.W if table.W.shape[-1] else None
        f.seek(0)
        # Skip the three header lines.
        f.readline()
        f.readline()
        f.readline()
        padding = [""] * self.n_columns
        if self.basket_column >= 0:
            # TODO how many columns?!
            # NOTE(review): sparse.lil_matrix takes the shape as a tuple;
            # lil_matrix(len(X), 100) looks wrong — verify against scipy.
            table._Xsparse = sparse.lil_matrix(len(X), 100)
        table.metas = metas = (
            np.empty((len(X), len(self.meta_columns)), dtype=object))
        line_count = 0
        Xr = None
        for lne in f:
            values = lne
            if not values.strip():
                continue  # skip blank lines
            values = values.split("\t")
            if len(values) > self.n_columns:
                raise ValueError("Too many columns in line {}".
                                 format(4 + line_count))
            elif len(values) < self.n_columns:
                # Short lines are padded with empty strings.
                values += padding
            if self.attribute_columns:
                Xr = X[line_count]
            for i, (col, reader) in enumerate(self.attribute_columns):
                Xr[i] = reader(values[col].strip())
            for i, (col, reader) in enumerate(self.classvar_columns):
                Y[line_count, i] = reader(values[col].strip())
            if W is not None:
                W[line_count] = float(values[self.weight_column])
            for i, (col, reader) in enumerate(self.meta_columns):
                metas[line_count, i] = reader(values[col].strip())
            line_count += 1
        if line_count != len(X):
            # Blank lines were skipped: drop our local views so the
            # arrays have no other referents, then shrink them in place.
            del Xr, X, Y, W, metas
            table.X.resize(line_count, len(table.domain.attributes))
            table._Y.resize(line_count, len(table.domain.class_vars))
            if table.W.ndim == 1:
                table.W.resize(line_count)
            else:
                table.W.resize((line_count, 0))
            table.metas.resize((line_count, len(self.meta_columns)))
        table.n_rows = line_count

    def reorder_values_array(self, arr, variables):
        """For variables flagged with `fix_order`, remap the codes in
        `arr` to the value order of the canonical variable produced by
        ``make``; return the resulting list of variables."""
        newvars = []
        for col, var in enumerate(variables):
            if getattr(var, "fix_order", False):
                nvar = var.make(var.name, var.values, var.ordered)
                nvar.attributes = var.attributes
                move = len(var.values)
                if nvar.values != var.values:
                    # Shift all codes out of range first so the per-value
                    # replacements below cannot collide with each other.
                    arr[:, col] += move
                    for i, val in enumerate(var.values):
                        bn.replace(arr[:, col], move + i, nvar.values.index(val))
                var = nvar
            newvars.append(var)
        return newvars

    def reorder_values(self, table):
        """Canonicalize discrete value order for all roles and rebuild
        the table's domain accordingly."""
        attrs = self.reorder_values_array(table.X, table.domain.attributes)
        classes = self.reorder_values_array(table._Y, table.domain.class_vars)
        metas = self.reorder_values_array(table.metas, table.domain.metas)
        table.domain = Domain(attrs, classes, metas=metas)

    def read_file(self, filename, cls=None):
        """Read a .tab file and return a table of class `cls`
        (default: Orange.data.Table)."""
        with open(filename) as file:
            return self._read_file(file, cls)

    def _read_file(self, file, cls=None):
        """Read from an already-open file object; see read_file."""
        from ..data import Table

        if cls is None:
            cls = Table
        domain = self.read_header(file)
        nExamples = self.count_lines(file)
        table = cls.from_domain(domain, nExamples, self.weight_column >= 0)
        self.read_data(file, table)
        self.reorder_values(table)
        return table

    @classmethod
    def _write_fast(cls, f, data):
        """Write data rows assuming numpy-backed storage (fast path);
        raises if the table lacks X/_Y/metas arrays."""
        wa = [var.str_val for var in data.domain.variables + data.domain.metas]
        for Xi, Yi, Mi in zip(data.X, data._Y, data.metas):
            f.write("\t".join(w(val) for val, w in zip(chain(Xi, Yi, Mi), wa)))
            f.write("\n")

    @classmethod
    def write_file(cls, filename, data):
        """
        Save data to file.

        Function uses fast implementation in case of numpy data, and slower
        fall-back for general storage.

        :param filename: the name of the file
        :type filename: str
        :param data: the data to be saved
        :type data: Orange.data.Storage
        """
        if isinstance(filename, str):
            f = open(filename, "w")
        else:
            # Assume an already-open, writable file-like object.
            # NOTE(review): it is still closed at the end of this method.
            f = filename
        domain_vars = data.domain.variables + data.domain.metas
        # first line
        f.write("\t".join([str(j.name) for j in domain_vars]))
        f.write("\n")

        # second line
        # TODO Basket column.
        t = {"ContinuousVariable": "c", "DiscreteVariable": "d",
             "StringVariable": "string", "Basket": "basket"}

        f.write("\t".join([t[type(j).__name__] for j in domain_vars]))
        f.write("\n")

        # third line
        m = list(data.domain.metas)
        c = list(data.domain.class_vars)
        r = []
        for i in domain_vars:
            # key=value attributes with spaces escaped, then the role flag.
            r1 = ["{}={}".format(k, v).replace(" ", "\\ ")
                  for k, v in i.attributes.items()]
            if i in m:
                r1.append("m")
            elif i in c:
                r1.append("class")
            r.append(" ".join(r1))
        f.write("\t".join(r))
        f.write("\n")

        # data
        # noinspection PyBroadException
        try:
            cls._write_fast(f, data)
        except:
            # Deliberate broad catch: fall back to the generic per-row
            # path for storages without numpy arrays.
            domain_vars = [data.domain.index(var) for var in domain_vars]
            for i in data:
                f.write("\t".join(str(i[j]) for j in domain_vars) + "\n")
        f.close()

    def write(self, filename, data):
        """Instance-level convenience wrapper around write_file."""
        self.write_file(filename, data)
|
303
|
|
|
|
|
304
|
|
|
|
|
305
|
|
|
@FileFormats.register("Comma-separated file", ".csv")
class TxtFormat:
    """Reader/writer for character-separated text files with at most one
    (optional) header row of column names; all columns are read as
    continuous."""

    # Strings treated as missing values when sniffing/reading.
    MISSING_VALUES = frozenset({"", "NA", "?"})

    @staticmethod
    def read_header(file, delimiter=None):
        """Sniff the delimiter and the optional name row.

        :return: (Domain of continuous variables, number of header lines
            to skip, delimiter — None means any whitespace)
        """
        first_line = file.readline()
        file.seek(0)
        if delimiter is None:
            # Try candidate delimiters in order of preference.
            for delimiter in "\t,; ":
                if delimiter in first_line:
                    break
            else:
                delimiter = None
        if delimiter == " ":
            # str.split(None) splits on runs of any whitespace.
            delimiter = None
        atoms = first_line.split(delimiter)
        try:
            # If every non-missing atom parses as a float, the first line
            # is data, not a header: synthesize VarXXXX names.
            [float(atom) for atom in set(atoms) - TxtFormat.MISSING_VALUES]
            header_lines = 0
            names = ["Var{:04}".format(i + 1) for i in range(len(atoms))]
        except ValueError:
            names = [atom.strip() for atom in atoms]
            header_lines = 1
        domain = Domain([ContinuousVariable.make(name) for name in names])
        return domain, header_lines, delimiter

    def read_file(self, filename, cls=None):
        """Read the file into a table of class `cls` (default Table)
        using numpy.genfromtxt."""
        from ..data import Table

        if cls is None:
            cls = Table
        with open(filename, "rt") as file:
            domain, header_lines, delimiter = self.read_header(file)
        with open(filename, "rb") as file:
            arr = np.genfromtxt(file, delimiter=delimiter,
                                skip_header=header_lines,
                                missing_values=self.MISSING_VALUES)
        table = cls.from_numpy(domain, arr)
        return table

    @classmethod
    def csv_saver(cls, filename, data, delimiter='\t'):
        """Write `data` as character-separated text.

        With a tab delimiter the full .tab-style three-line header
        (names, types, flags) is written; otherwise only the names row.
        """
        with open(filename, 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=delimiter)
            all_vars = data.domain.variables + data.domain.metas
            writer.writerow([v.name for v in all_vars])  # write variable names
            if delimiter == '\t':
                # Role flags per column: attributes, classes, metas.
                flags = ([''] * len(data.domain.attributes)) + \
                        (['class'] * len(data.domain.class_vars)) + \
                        (['m'] * len(data.domain.metas))

                for i, var in enumerate(all_vars):
                    attrs = ["{0!s}={1!s}".format(*item).replace(" ", "\\ ")
                             for item in var.attributes.items()]
                    if attrs:
                        flags[i] += (" " if flags[i] else "") + (" ".join(attrs))

                writer.writerow([type(v).__name__.replace("Variable", "").lower()
                                 for v in all_vars])  # write variable types
                writer.writerow(flags)  # write flags
            for ex in data:  # write examples
                writer.writerow(ex)

    @classmethod
    def write_file(cls, filename, data):
        """Save as comma-separated text (names row only)."""
        cls.csv_saver(filename, data, ',')

    def write(self, filename, data):
        """Instance-level convenience wrapper around write_file."""
        self.write_file(filename, data)
|
375
|
|
|
|
|
376
|
|
|
|
|
377
|
|
|
@FileFormats.register("Basket file", ".basket")
class BasketFormat:
    @classmethod
    def read_file(cls, filename, storage_class=None):
        """Read a sparse .basket file via the C helper Orange.data._io
        and return a table of `storage_class` (default: Table)."""
        from Orange.data import _io

        if storage_class is None:
            from ..data import Table as storage_class

        def constr_vars(inds):
            # `inds` maps byte-string names to column indices; build
            # continuous variables in column order.  Returns None when
            # `inds` is empty/falsy.
            if inds:
                return [ContinuousVariable(x.decode("utf-8")) for _, x in
                        sorted((ind, name) for name, ind in inds.items())]

        X, Y, metas, attr_indices, class_indices, meta_indices = \
            _io.sparse_read_float(filename.encode(sys.getdefaultencoding()))

        attrs = constr_vars(attr_indices)
        classes = constr_vars(class_indices)
        meta_attrs = constr_vars(meta_indices)
        domain = Domain(attrs, classes, meta_attrs)
        # NOTE(review): the last argument passes `meta_attrs` (the
        # variable list), not the `metas` data array returned by the C
        # reader — verify against storage_class.from_numpy's signature.
        return storage_class.from_numpy(
            domain, attrs and X, classes and Y, metas and meta_attrs)
|
400
|
|
|
|
|
401
|
|
|
|
|
402
|
|
|
@FileFormats.register("Excel file", ".xlsx")
class ExcelFormat:
    """Reader for .xlsx workbooks via openpyxl.

    The header can be a .tab-style three-line header (read_header_3),
    absent when the first row is all numeric (read_header_0), or a
    single row of names, optionally prefixed with "flags#"
    (read_header_1)."""

    # Splits flag/value strings on runs of spaces not escaped as "\ ".
    non_escaped_spaces = re.compile(r"(?<!\\) +")

    def __init__(self):
        # (column index, string-to-value converter) pairs per role.
        self.attribute_columns = []
        self.classvar_columns = []
        self.meta_columns = []
        self.weight_column = -1   # -1 when no weight column
        self.basket_column = -1   # -1 when no basket column

        self.n_columns = self.first_data_row = 0

    def open_workbook(self, f):
        """Open `f` read-only and return the worksheet; records
        n_columns.  A string "path:sheet" selects a named sheet."""
        from openpyxl import load_workbook

        # [2:] skips the colon of a Windows drive letter ("C:\...").
        if isinstance(f, str) and ":" in f[2:]:
            f, sheet = f.rsplit(":", 1)
        else:
            sheet = None
        wb = load_workbook(f, use_iterators=True,
                           read_only=True, data_only=True)
        ws = wb.get_sheet_by_name(sheet) if sheet else wb.get_active_sheet()
        self.n_columns = ws.get_highest_column()
        return ws

    # noinspection PyBroadException
    def read_header_3(self, worksheet):
        """Try to parse the first three rows as a .tab-style header
        (names, types, flags); return the Domain, or False when the rows
        do not look like such a header."""
        cols = self.n_columns
        try:
            names, types, flags = [
                [cell.value.strip() if cell.value is not None else ""
                 for cell in row]
                for row in worksheet.get_squared_range(1, 1, cols, 3)]
        except:
            # Deliberate broad catch: any malformed cells mean "not this
            # header layout".
            return False
        if not (all(tpe in ("", "c", "d", "s", "continuous", "discrete",
                            "string", "w", "weight") or " " in tpe
                    for tpe in types) and
                all(flg in ("", "i", "ignore", "m", "meta", "w", "weight",
                            "b", "basket", "class") or "=" in flg
                    for flg in flags)):
            return False
        attributes = []
        class_vars = []
        metas = []
        for col, (name, tpe, flag) in enumerate(zip(names, types, flags)):
            flag = self.non_escaped_spaces.split(flag)
            if "i" in flag or "ignore" in flag:
                continue
            if "b" in flag or "basket" in flag:
                self.basket_column = col
                continue
            is_class = "class" in flag
            is_meta = "m" in flag or "meta" in flag or tpe in ["s", "string"]
            is_weight = "w" in flag or "weight" in flag \
                or tpe in ["w", "weight"]
            # key=value entries in the flag cell become var.attributes.
            attrs = [f.split("=", 1) for f in flag if "=" in f]
            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col + 1))
                self.weight_column = col
                continue
            if tpe in ["c", "continuous"]:
                var = ContinuousVariable.make(name)
            elif tpe in ["w", "weight"]:
                var = None
            elif tpe in ["d", "discrete"]:
                var = DiscreteVariable.make(name)
                # Values get remapped after reading; see reorder_values.
                var.fix_order = True
            elif tpe in ["s", "string"]:
                var = StringVariable.make(name)
            else:
                # Any other type string lists the discrete values.
                values = [v.replace("\\ ", " ")
                          for v in self.non_escaped_spaces.split(tpe)]
                var = DiscreteVariable.make(name, values, True)
            var.attributes.update(attrs)
            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(name, col))
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))
        self.first_data_row = 4
        return Domain(attributes, class_vars, metas)

    # noinspection PyBroadException
    def read_header_0(self, worksheet):
        """Try the headerless layout: if every cell of the first row is
        numeric, synthesize VarXXXX continuous variables; otherwise
        return False."""
        try:
            [float(cell.value) if cell.value is not None else None
             for cell in
             worksheet.get_squared_range(1, 1, self.n_columns, 3).__next__()]
        except:
            # Deliberate broad catch: non-numeric first row means "not
            # this header layout".
            return False
        self.first_data_row = 1
        attrs = [ContinuousVariable.make("Var{:04}".format(i + 1))
                 for i in range(self.n_columns)]
        self.attribute_columns = [(i, var.val_from_str_add)
                                  for i, var in enumerate(attrs)]
        return Domain(attrs)

    def read_header_1(self, worksheet):
        """Parse a single header row of names, optionally prefixed with
        "flags#".  Types are given by the flag letters C/D/S/W, or
        guessed from the second row / value cardinality otherwise."""
        import openpyxl.cell.cell

        if worksheet.get_highest_column() < 2 or \
                worksheet.get_highest_row() < 2:
            return False
        cols = self.n_columns
        names = [cell.value.strip() if cell.value is not None else ""
                 for cell in
                 worksheet.get_squared_range(1, 1, cols, 3).__next__()]
        row2 = list(worksheet.get_squared_range(1, 2, cols, 3).__next__())
        attributes = []
        class_vars = []
        metas = []
        for col, name in enumerate(names):
            if "#" in name:
                flags, name = name.split("#", 1)
            else:
                flags = ""
            if "i" in flags:
                continue
            if "b" in flags:
                self.basket_column = col
                continue
            is_class = "c" in flags
            is_meta = "m" in flags or "s" in flags
            is_weight = "W" in flags or "w" in flags
            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col))
                self.weight_column = col
                continue
            if "C" in flags:
                var = ContinuousVariable.make(name)
            elif is_weight:
                # Unreachable in practice: weights `continue` above.
                var = None
            elif "D" in flags:
                var = DiscreteVariable.make(name)
                var.fix_order = True
            elif "S" in flags:
                var = StringVariable.make(name)
            elif row2[col].data_type == "n":
                # Numeric cell in the first data row -> continuous.
                var = ContinuousVariable.make(name)
            else:
                # Guess by cardinality: many distinct values -> string
                # meta, otherwise discrete.
                if len(set(row[col].value for row in worksheet.rows)) > 20:
                    var = StringVariable.make(name)
                    is_meta = True
                else:
                    var = DiscreteVariable.make(name)
                    var.fix_order = True
            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(
                            name, openpyxl.cell.cell.get_column_letter(col + 1))
                    )
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))
        if attributes and not class_vars:
            # Convention: with no explicit class, the last column is it.
            class_vars.append(attributes.pop(-1))
            self.classvar_columns.append(self.attribute_columns.pop(-1))
        self.first_data_row = 2
        return Domain(attributes, class_vars, metas)

    def read_header(self, worksheet):
        """Try the header layouts in order; raise when none matches."""
        domain = self.read_header_3(worksheet) or \
            self.read_header_0(worksheet) or \
            self.read_header_1(worksheet)
        if domain is False:
            raise ValueError("Invalid header")
        return domain

    # noinspection PyPep8Naming,PyProtectedMember
    def read_data(self, worksheet, table):
        """Fill the pre-allocated `table` from the worksheet's data
        rows, then shrink it in place if empty rows were skipped."""
        X, Y = table.X, table._Y
        # Weights only when the table carries a non-empty W array.
        W = table.W if table.W.shape[-1] else None
        if self.basket_column >= 0:
            # TODO how many columns?!
            # NOTE(review): sparse.lil_matrix takes the shape as a tuple;
            # lil_matrix(len(X), 100) looks wrong — verify against scipy.
            table._Xsparse = sparse.lil_matrix(len(X), 100)
        table.metas = metas = (
            np.empty((len(X), len(self.meta_columns)), dtype=object))
        sheet_rows = worksheet.rows
        for _ in range(1, self.first_data_row):
            sheet_rows.__next__()  # skip the header rows
        line_count = 0
        Xr = None
        for row in sheet_rows:
            values = [cell.value for cell in row]
            if all(value is None for value in values):
                continue  # skip entirely empty rows
            if self.attribute_columns:
                Xr = X[line_count]
            for i, (col, reader) in enumerate(self.attribute_columns):
                v = values[col]
                Xr[i] = reader(v.strip() if isinstance(v, str) else v)
            for i, (col, reader) in enumerate(self.classvar_columns):
                v = values[col]
                Y[line_count, i] = reader(
                    v.strip() if isinstance(v, str) else v)
            if W is not None:
                W[line_count] = float(values[self.weight_column])
            for i, (col, reader) in enumerate(self.meta_columns):
                v = values[col]
                metas[line_count, i] = reader(
                    v.strip() if isinstance(v, str) else v)
            line_count += 1
        if line_count != len(X):
            # Empty rows were skipped: drop our local views so the
            # arrays have no other referents, then shrink them in place.
            del Xr, X, Y, W, metas
            table.X.resize(line_count, len(table.domain.attributes))
            table._Y.resize(line_count, len(table.domain.class_vars))
            if table.W.ndim == 1:
                table.W.resize(line_count)
            else:
                table.W.resize((line_count, 0))
            table.metas.resize((line_count, len(self.meta_columns)))
        table.n_rows = line_count

    # noinspection PyUnresolvedReferences
    @staticmethod
    def reorder_values_array(arr, variables):
        """Remap codes in `arr` for variables flagged with `fix_order`
        so they follow the variable's canonical value order; mutates the
        variables and the array in place."""
        for col, var in enumerate(variables):
            if getattr(var, "fix_order", False) and len(var.values) < 1000:
                new_order = var.ordered_values(var.values)
                if new_order == var.values:
                    continue
                # Shift codes by 1000 first so the per-value replacement
                # below cannot collide; assumes < 1000 values (checked).
                arr[:, col] += 1000
                for i, val in enumerate(var.values):
                    bn.replace(arr[:, col], 1000 + i, new_order.index(val))
                var.values = new_order
                delattr(var, "fix_order")

    # noinspection PyProtectedMember
    def reorder_values(self, table):
        """Canonicalize discrete value order for all variable roles."""
        self.reorder_values_array(table.X, table.domain.attributes)
        self.reorder_values_array(table._Y, table.domain.class_vars)
        self.reorder_values_array(table.metas, table.domain.metas)

    def read_file(self, file, cls=None):
        """Read an .xlsx file (optionally "path:sheet") and return a
        table of class `cls` (default: Table)."""
        from Orange.data import Table

        if cls is None:
            cls = Table
        worksheet = self.open_workbook(file)
        domain = self.read_header(worksheet)
        table = cls.from_domain(
            domain,
            worksheet.get_highest_row() - self.first_data_row + 1,
            self.weight_column >= 0)
        self.read_data(worksheet, table)
        self.reorder_values(table)
        return table
|
670
|
|
|
|
|
671
|
|
|
|
|
672
|
|
|
@FileFormats.register("Pickled table", ".pickle")
class PickleFormat:
    """Round-trips tables through Python's pickle serialization."""

    @classmethod
    def read_file(cls, file, _=None):
        """Load and return the pickled table stored at path `file`.

        NOTE: pickle.load executes arbitrary code while deserializing;
        only open trusted files.
        """
        with open(file, "rb") as stream:
            return pickle.load(stream)

    @classmethod
    def write_file(cls, filename, table):
        """Serialize `table` into the file at `filename`."""
        with open(filename, "wb") as stream:
            pickle.dump(table, stream)

    def write(self, filename, table):
        """Instance-level convenience wrapper around write_file."""
        self.write_file(filename, table)
|
686
|
|
|
|
|
687
|
|
|
|
|
688
|
|
|
@FileFormats.register("Dot Tree File", ".dot")
class DotFormat:
    """Writer that exports scikit-learn decision trees as Graphviz .dot."""

    @classmethod
    def write_graph(cls, filename, graph):
        """Export `graph` (an sklearn tree estimator) to `filename` in
        Graphviz dot format."""
        from sklearn import tree

        tree.export_graphviz(graph, out_file=filename)

    def write(self, filename, tree):
        """Write a tree; accepts either the estimator itself or a dict
        holding it under the 'tree' key."""
        # isinstance instead of `type(tree) == dict`: idiomatic type
        # check that also accepts dict subclasses.
        if isinstance(tree, dict):
            tree = tree['tree']
        self.write_graph(filename, tree)
|
700
|
|
|
|
|
701
|
|
|
|
|
702
|
|
|
@FileFormats.register("Fixed width textfile", ".fixed") |
|
703
|
|
|
class FixedWidthFormat(TabDelimFormat): |
|
704
|
|
|
""" |
|
705
|
|
|
FixedWidthFormat reads tables from files where the columns have a |
|
706
|
|
|
fixed width. The cells are space-padded to the left. |
|
707
|
|
|
See datasets/glass.fixed and tests/test_fixedwidth_reader.py |
|
708
|
|
|
|
|
|
|
|
|
|
709
|
|
|
It is possible to determine the exact cell location of a specific |
|
710
|
|
|
table cell within the file because of the fixed width columns. |
|
711
|
|
|
This allows the FixedWidthFormat to be used with the LazyFile |
|
712
|
|
|
widget to 'read' extremely large files. |
|
713
|
|
|
|
|
|
|
|
|
|
714
|
|
|
TODO: |
|
715
|
|
|
- Add read_row() without reading entire file. |
|
716
|
|
|
- Allow spaces in column names and cell values. |
|
717
|
|
|
- Ensure compatibility with all tables in the tests directory. |
|
718
|
|
|
- Do metas and class properly. |
|
719
|
|
|
""" |
|
720
|
|
|
def read_ends_columns(self, filename): |
|
|
|
|
|
|
721
|
|
|
""" |
|
722
|
|
|
Returns the location where each column ends in a line in the |
|
723
|
|
|
file. |
|
724
|
|
|
TODO: |
|
725
|
|
|
- Cleanup. |
|
726
|
|
|
""" |
|
727
|
|
|
ColumnInfo = namedtuple( |
|
728
|
|
|
'ColumnInfo', |
|
729
|
|
|
['name', 'tpe', 'flag', 'start', 'end', 'width', 'index'], |
|
730
|
|
|
) |
|
731
|
|
|
with open(filename) as f: |
|
732
|
|
|
f.seek(0) |
|
733
|
|
|
l_names = f.readline() |
|
734
|
|
|
l_types = f.readline() |
|
735
|
|
|
l_flags = f.readline() |
|
736
|
|
|
types = l_types.split() |
|
737
|
|
|
ends = [] |
|
738
|
|
|
for n in types: |
|
739
|
|
|
position_start = ends[-1] if len(ends) else 0 |
|
740
|
|
|
end = (" "+l_types.replace("\n"," ")).find(" "+n+" ", position_start) + len(n) |
|
741
|
|
|
ends.append(end) |
|
742
|
|
|
info_columns = [ |
|
743
|
|
|
ColumnInfo( |
|
744
|
|
|
name=l_names[start:end].strip(), |
|
745
|
|
|
flag=l_flags[start:end].strip(), |
|
746
|
|
|
tpe=tpe, |
|
747
|
|
|
start=start, |
|
748
|
|
|
end=end, |
|
749
|
|
|
width=end-start, |
|
750
|
|
|
index=inde, |
|
751
|
|
|
) for (inde, (start, end, tpe)) in enumerate(zip( |
|
752
|
|
|
[0] + ends[:-1], |
|
753
|
|
|
ends, |
|
754
|
|
|
types, |
|
755
|
|
|
)) |
|
756
|
|
|
] |
|
757
|
|
|
return info_columns |
|
758
|
|
|
|
|
|
|
|
|
|
759
|
|
|
    def read_header(self, filename):
        """
        Read the header of the fixed width file and return the Domain
        of the table.

        The first three lines of the file hold, in order, the column
        names, the column types and the column flags.  As a side effect
        this method populates ``self.n_columns``,
        ``self.attribute_columns``, ``self.classvar_columns``,
        ``self.meta_columns``, ``self.weight_column`` and
        ``self.basket_column``, which the other read_* methods rely on.

        TODO:
        - Use read_ends_columns() to determine the width of the
          columns and use that to parse the lines, because this
          will allow the use of spaces in column names.
        """
        # Column layout (name/type/flag per column) comes from the
        # fixed-width parser, not from splitting the header lines here.
        ends = self.read_ends_columns(filename)
        names = [end.name for end in ends]
        types = [end.tpe for end in ends]
        flags = [end.flag for end in ends]
        with open(filename) as f:
            # Function based on read_header from TabDelimReader.
            f.seek(0)
            #names = f.readline().strip("\n\r").split()
            #types = f.readline().strip("\n\r").split()
            #flags = f.readline().strip("\n\r").split()
            # NOTE(review): these three reads only advance a file object
            # that is closed immediately afterwards; they appear to be a
            # leftover from the TabDelimReader version -- confirm before
            # removing.
            f.readline()
            f.readline()
            f.readline()
        # Changed split on "\t" to split on spaces.
        self.n_columns = len(names)
        if len(types) != self.n_columns:
            raise ValueError("File contains %i variable names and %i types" %
                             (len(names), len(types)))
        if len(flags) > self.n_columns:
            raise ValueError("There are more flags than variables")
        else:
            # Pad flags so every column has one; the zip() below
            # truncates the excess entries.
            flags += [""] * self.n_columns
        attributes = []
        class_vars = []
        metas = []
        # (column index, string->value parser) pairs for each role.
        self.attribute_columns = []
        self.classvar_columns = []
        self.meta_columns = []
        self.weight_column = -1
        self.basket_column = -1
        for col, (name, tpe, flag) in enumerate(zip(names, types, flags)):
            tpe = tpe.strip()
            flag = flag.split()
            if "i" in flag or "ignore" in flag:
                # Ignored columns get no variable at all.
                continue
            if "b" in flag or "basket" in flag:
                self.basket_column = col
                continue
            is_class = "class" in flag
            is_meta = "m" in flag or "meta" in flag or tpe in ["s", "string"]
            is_weight = "w" in flag or "weight" in flag \
                        or tpe in ["w", "weight"]
            if is_weight:
                if is_class:
                    raise ValueError("Variable {} (column {}) is marked as "
                                     "class and weight".format(name, col))
                self.weight_column = col
                continue
            if tpe in ["c", "continuous"]:
                var = ContinuousVariable.make(name)
            elif tpe in ["w", "weight"]:
                # Unreachable in practice: weight columns are handled by
                # the is_weight branch above.
                var = None
            elif tpe in ["d", "discrete"]:
                var = DiscreteVariable.make(name)
            elif tpe in ["s", "string"]:
                var = StringVariable.make(name)
            else:
                # Anything else is a space-separated list of discrete
                # values; "\ " escapes a literal space inside a value.
                values = [v.replace("\\ ", " ")
                          for v in self.non_escaped_spaces.split(tpe)]
                var = DiscreteVariable.make(name, values, True)
            # Discrete variables without predeclared values get their
            # value order fixed later -- presumably by reorder_values();
            # TODO confirm.
            var.fix_order = (isinstance(var, DiscreteVariable)
                             and not var.values)
            if is_class:
                if is_meta:
                    raise ValueError(
                        "Variable {} (column {}) is marked as "
                        "class and meta attribute".format(name, col))
                class_vars.append(var)
                self.classvar_columns.append((col, var.val_from_str_add))
            elif is_meta:
                metas.append(var)
                self.meta_columns.append((col, var.val_from_str_add))
            else:
                attributes.append(var)
                self.attribute_columns.append((col, var.val_from_str_add))
        domain = Domain(attributes, class_vars, metas)
        return domain
|
846
|
|
|
def count_lines(self, filename): |
|
847
|
|
|
""" |
|
848
|
|
|
Counts the number of lines in the file. This can be done |
|
849
|
|
|
without reading the entire file because the file |
|
850
|
|
|
has fixed width columns. |
|
851
|
|
|
""" |
|
852
|
|
|
len_file = os.stat(filename).st_size |
|
853
|
|
|
with open(filename) as f: |
|
854
|
|
|
f.seek(0) |
|
855
|
|
|
line = f.readline() |
|
856
|
|
|
len_line = len(line) |
|
857
|
|
|
|
|
|
|
|
|
|
858
|
|
|
count = int(len_file / len_line) - 3 |
|
859
|
|
|
return count |
|
860
|
|
|
def read_cell(self, filename, index_row, name_attribute): |
|
861
|
|
|
""" |
|
862
|
|
|
Reads one specific cell value without reading the entire file. |
|
863
|
|
|
|
|
|
|
|
|
|
864
|
|
|
TODO: |
|
865
|
|
|
- Cleanup this function. |
|
866
|
|
|
- Test with discrete and class attributes. |
|
867
|
|
|
- Cache the header information. |
|
868
|
|
|
""" |
|
869
|
|
|
info_columns = self.read_ends_columns(filename) |
|
870
|
|
|
header = self.read_header(filename) |
|
|
|
|
|
|
871
|
|
|
with open(filename) as f: |
|
872
|
|
|
f.seek(0) |
|
873
|
|
|
line = f.readline() |
|
874
|
|
|
len_line1 = len(line) |
|
|
|
|
|
|
875
|
|
|
len_line = sum(ic.width for ic in info_columns) + 1 # for \n |
|
876
|
|
|
col = [ic for ic in info_columns if ic.name == name_attribute][0] |
|
877
|
|
|
with open(filename) as f: |
|
878
|
|
|
f.seek( (3+index_row) * len_line + col.start ) |
|
879
|
|
|
value = f.read(col.width) |
|
880
|
|
|
value_n = None |
|
881
|
|
|
# Parse the string in the correct format. This is a kludge |
|
882
|
|
|
# based on code from read_data(). |
|
883
|
|
|
if self.attribute_columns: |
|
884
|
|
|
for i, (coli, reader) in enumerate(self.attribute_columns): |
|
|
|
|
|
|
885
|
|
|
if coli == col.index: |
|
886
|
|
|
value_n = reader(value.strip()) |
|
887
|
|
|
for i, (coli, reader) in enumerate(self.classvar_columns): |
|
888
|
|
|
if coli == col.index: |
|
889
|
|
|
value_n = reader(value.strip()) |
|
890
|
|
|
return value_n |
|
891
|
|
|
|
|
|
|
|
|
|
892
|
|
|
    def read_data(self, filename, table):
        """
        Read the data portion of the file into ``table``.

        Fills ``table.X``, ``table._Y``, ``table.W`` (when present) and
        ``table.metas`` row by row, then shrinks them in place when the
        file held fewer rows than were pre-allocated.

        This function is based on the one in TabDelimFormat.
        TODO:
        - Use the actual known width of the columns instead
          of splitting on space, because that will allow spaces
          to be part of the cell values.
          That is, use read_ends_columns.
        """
        with open(filename) as f:
            #X, Y = table.X, table.Y
            X, Y = table.X, table._Y
            # A zero-width last dimension means the table has no weights.
            W = table.W if table.W.shape[-1] else None
            f.seek(0)
            # Skip the three header lines (names, types, flags).
            f.readline()
            f.readline()
            f.readline()
            padding = [""] * self.n_columns
            if self.basket_column >= 0:
                # TODO how many columns?!
                table._Xsparse = sparse.lil_matrix(len(X), 100)
            table.metas = metas = (
                np.empty((len(X), len(self.meta_columns)), dtype=object))
            line_count = 0
            Xr = None
            for lne in f:
                values = lne.strip()
                if not values:
                    # Skip blank lines.
                    continue
                # Only difference with TabDelimReader
                #values = values.split("\t")
                values = values.split()
                if len(values) > self.n_columns:
                    raise ValueError("Too many columns in line {}".
                                     format(4 + line_count))
                elif len(values) < self.n_columns:
                    # Pad short rows with empty strings so the column
                    # indexing below cannot fail.
                    values += padding
                if self.attribute_columns:
                    Xr = X[line_count]
                    for i, (col, reader) in enumerate(self.attribute_columns):
                        Xr[i] = reader(values[col].strip())
                for i, (col, reader) in enumerate(self.classvar_columns):
                    Y[line_count, i] = reader(values[col].strip())
                if W is not None:
                    W[line_count] = float(values[self.weight_column])
                for i, (col, reader) in enumerate(self.meta_columns):
                    metas[line_count, i] = reader(values[col].strip())
                line_count += 1
            if line_count != len(X):
                # Fewer rows than pre-allocated (blank lines were
                # skipped): shrink the arrays in place.
                # NOTE(review): the local views are deleted first,
                # presumably so ndarray.resize() does not reject arrays
                # with outstanding references -- confirm.
                del Xr, X, Y, W, metas
                table.X.resize(line_count, len(table.domain.attributes))
                table.Y.resize(line_count, len(table.domain.class_vars))
                if table.W.ndim == 1:
                    table.W.resize(line_count)
                else:
                    table.W.resize((line_count, 0))
                table.metas.resize((line_count, len(self.meta_columns)))
                table.n_rows = line_count
|
952
|
|
|
def read_file(self, filename, cls=None): |
|
953
|
|
|
""" |
|
954
|
|
|
Read a file. |
|
955
|
|
|
|
|
|
|
|
|
|
956
|
|
|
The distinction between read_file and _read_file cannot |
|
957
|
|
|
be made because we cannot get the length of a stream etc. |
|
958
|
|
|
""" |
|
959
|
|
|
from ..data import Table |
|
960
|
|
|
if cls is None: |
|
961
|
|
|
cls = Table |
|
962
|
|
|
domain = self.read_header(filename) |
|
963
|
|
|
nExamples = self.count_lines(filename) |
|
964
|
|
|
table = cls.from_domain(domain, nExamples, self.weight_column >= 0) |
|
965
|
|
|
self.read_data(filename, table) |
|
966
|
|
|
self.reorder_values(table) |
|
967
|
|
|
return table |
|
968
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder.