1 | """ |
||
2 | LazyTable is a data structure derived from Table that doesn't |
||
3 | necessarily have all the data it represents directly available. |
||
4 | |||
5 | TODO: |
||
6 | - Harmonize naming to the Orange naming convention. |
||
7 | - Think of how to use unique keys. The row_index should work, but this should |
||
8 | then also be stored somewhere because X, Y and metas do not have to have |
||
9 | the correct order. That is, if a widget first asks for self.data[10] |
||
10 | and subsequently for self.data[5], then the first row in X will be row 10 |
||
11 | and the second row 5 etc. |
||
12 | """ |
||
13 | |||
14 | # pylint: disable=line-too-long, trailing-whitespace, fixme |
||
15 | |||
16 | |||
17 | from numbers import Integral |
||
18 | # TODO: Use Real instead of numpy.float ? |
||
19 | |||
20 | from Orange.data.table import Instance, RowInstance, Table |
||
21 | from Orange.data.value import Value |
||
22 | |||
23 | from Orange.data import filter as orange_filter |
||
24 | |||
25 | import numpy |
||
0 ignored issues
–
show
|
|||
26 | import threading |
||
27 | import copy |
||
28 | |||
29 | import collections.abc |
||
30 | |||
def len_lazyaware(data):
    """
    Return the number of rows that data represents.

    A plain Table simply reports len(data), the number of rows held in
    memory. A LazyTable may represent more rows than it has materialized;
    its __len__() only counts the rows in memory so that existing widgets
    keep working. Lazy-aware callers use this helper instead, which asks a
    LazyTable for len_full_data() and then request the rows they need so
    that these get instantiated.
    """
    if isinstance(data, LazyTable):
        return data.len_full_data()
    return len(data)
||
46 | |||
def eq_lazyaware(data1, data2):
    """
    Lazy-aware equality test between two tables.

    Lazy widgets send LazyTables with only a few materialized rows in the
    X, Y and metas attributes. Lazy-aware widgets ignore those attributes
    (as much as possible) and access instances through __getitem__() and
    __iter__() only, so they use exactly the data they need.

    Widgets that are not lazy-aware read X, Y and metas directly and would
    only ever see the small initial subset. To compensate, 'new' data is
    sent occasionally: a LazyTable identical to the previously sent one but
    with more materialized rows.

    Lazy-aware widgets receive that 'new' table as well and should use this
    function to decide whether it really is new data.

    Two LazyTables are considered equal when they are the same object, or
    when their domains and their full lengths are equal. Any other
    combination of arguments falls back to the regular == comparison.

    TODO: make this less hacky, it will give false positives now.
    """
    both_lazy = isinstance(data1, LazyTable) and isinstance(data2, LazyTable)
    if not both_lazy:
        return data1 == data2

    if data1 is data2:
        return True

    same_domain = data1.domain == data2.domain
    same_length = len_lazyaware(data1) == len_lazyaware(data2)
    return same_domain and same_length
||
79 | |||
80 | |||
class LazyRowInstance(RowInstance):
    """
    LazyRowInstance is a lazy version of RowInstance.

    Values that are not yet available (NaN) are pulled on first access from
    the table's widget_origin or table_origin and then cached.

    This is a very early rudimentary version.
    """

    # There are four different identifiers in use for a LazyRowInstance:
    # - row_index_full:
    #   The number of the row in the full conceptual table.
    #   This is a sequential integer starting at 0 and ending at
    #   self.table.len_full_data().
    #   The instances in identical LazyTables should always have identical
    #   row_index_full values.
    #   This index is (currently) used when widgets ask for a specific row,
    #   because 1) the widgets should not be aware of row_index_materialized
    #   and 2) the widgets cannot yet use row_index_global.
    # - row_index_materialized:
    #   The identifier of the row in table.X, table.Y and table.metas.
    #   This is also a sequential integer starting at 0 and ending at
    #   self.table.len_full_data(). However, the order depends on the order
    #   in which the rows are materialized and therefore does not have to be
    #   the same as that of row_index_full.
    #   The instances of identical LazyTables could have different
    #   row_index_materialized values.
    #   This index should only be used internally, since its value is
    #   essentially meaningless outside self.table.
    # - instance_index_global:
    #   A unique identifier of the instance. Conceptually this is like a
    #   name or label of the instance and therefore does not have to be
    #   numerical.
    #   The same instance in several tables will have the same
    #   instance_index_global.
    #   This identifier cannot yet be used.
    # - row_index:
    #   For external use, that is, in __getitem__ of LazyTable, row_index is
    #   referring to row_index_full. This ensures that the interface between
    #   the widgets and LazyTable is the same as with the normal Table.
    #   For internal use, that is, in __getitem__ of LazyRowInstance,
    #   row_index is referring to row_index_materialized. This ensures that
    #   the Table, being the superclass of LazyTable, works as expected.
    #   Within the class, row_index_full or row_index_materialized should be
    #   used instead of row_index whenever possible for clarity.
    # For example when rows are removed from table.X, Y and metas because of
    # memory constraints, then the row_index_full of each row will stay the
    # same, but the row_index_materialized will change. Such functionality
    # has not yet been implemented.

    row_index_full = None
    row_index_materialized = None
    instance_index_global = None

    def __init__(self, table, row_index, region_of_interest_only=False):
        """
        Construct a data instance representing the given row of the table.

        row_index is the real row of the data set (row_index_full), which
        might not be the materialized row.

        If the row is not yet in table.row_mapping and it passes the table's
        row filters, it is appended to the table and registered in
        row_mapping. When region_of_interest_only is set, that only happens
        if the row is also in the region_of_interest. It should only be
        necessary to set this flag internally.

        TODO:
          - Ensure that rows that are not in the region of interest are
            removed from memory because saving memory is the reason they are
            not appended to the table.
          - Perhaps cache whether an instance is in the region of interest
            so they can be skipped later.
        """
        # The table that this row belongs to, should be a LazyTable instance.
        self.table = table

        # row_index_full is enough to get the attribute values of this row.
        self.row_index_full = row_index

        # TODO: A None for row_index_materialized should not happen anymore
        #   because this is now checked in LazyTable.__getitem__(). However
        #   this does mean that the in_roi code is not functional anymore.
        #   Replace all the RoI code with Filters?
        # row_index_materialized is used to cache the attribute values in
        # memory in self.table.X, Y and metas. It is None if there is no
        # corresponding row in self.table.
        self.row_index_materialized = table.row_mapping.get(self.row_index_full, None)

        if self.row_index_materialized is not None:
            # The row is already available in the table.
            RowInstance.__init__(self, table, self.row_index_materialized)
            return

        # The row has not yet been stored in the table. We instantiate
        # Instance (super of RowInstance) instead of RowInstance because
        # there is no corresponding row in memory yet. From this moment on
        # the instance is usable because values are retrieved on the fly.
        # pylint: disable=non-parent-init-called
        Instance.__init__(self, table.domain)

        materialize = False
        if self.in_filters():
            # The region of interest is evaluated whenever the row passes
            # the filters, also when the flag is off, exactly like before.
            # TODO: Replace the region_of_interest with Filters.
            in_roi = self.in_region_of_interest()
            materialize = (not region_of_interest_only) or in_roi

        if materialize:
            # The new row_index_materialized is the current length of the
            # table in memory, so the row ends up at the end when appending.
            self.row_index_materialized = table.len_instantiated_data()
            self.row_index = self.row_index_materialized
            self.table.append(self)
            self.table.row_mapping[self.row_index_full] = self.row_index_materialized
            # A full RowInstance can now be initialized because the row
            # is indeed available in the table.
            RowInstance.__init__(self, table, self.row_index_materialized)
        else:
            # This new row is not available in the table, and we'd like
            # to keep it this way to conserve memory.
            self.row_index_materialized = None
            self.row_index = self.row_index_materialized

    def __getitem__(self, key, key_id=None, key_var=None):
        """
        Return the Value for one variable of this row.

        The numerical key_id or Variable key_var can be given explicitly,
        which prevents the lookups below. Just 'key' is either a key_id
        or a key_var.

        A NaN in the cached value means that it is not yet available; it is
        then pulled from self.table.widget_origin (pull_cell) or otherwise
        from self.table.table_origin, and cached in this instance and, if
        the row is materialized, in the table arrays.

        TODO:
          - Pull from self.table instead of from self.table.widget_origin?
        """
        # Convert from 'key' to the numerical 'key_id'.
        if key_id is None:
            key_id = key if isinstance(key, Integral) else self._domain.index(key)

        # Get key_var for the Variable itself.
        if key_var is None:
            key_var = self.table.domain[key_id]

        # Get the value cached in memory (._values was removed in Orange 3.2).
        value = RowInstance.__getitem__(self, key=key, key_id=key_id, key_var=key_var)

        # A nan means the value is not yet available.
        if numpy.isnan(value):
            # Pull the value.
            # TODO: Pull from self.table.widget_origin?
            if self.table.widget_origin is not None:
                value = self.table.widget_origin.pull_cell(self.row_index_full, key_var)
            elif self.table.table_origin is not None:
                value = self.table.table_origin[self.row_index_full][key_var]

            # TODO: Is this necessary? Where does the 'int' come from?
            # numpy.float was only an alias of the builtin float and has been
            # removed from NumPy, so the builtin is used directly.
            if isinstance(value, (int, float)):
                value = float(value)

            # Cache the value both in this RowInstance as well as in
            # the original table.
            # TODO: Can we do everything with only self.table.X?
            RowInstance.__setitem__(self, key_var, value)

            # Only cache in self.table if there is a corresponding row there.
            # TODO: Should we do this caching here at all? Probably better
            #   to do this in the LazyTable itself? E.g. preventing this
            #   pylint warning:
            # pylint: disable=protected-access
            if self.row_index_materialized is not None:
                row = self.row_index_materialized
                n_attributes = len(self._domain.attributes)
                if 0 <= key_id < n_attributes:
                    self.table.X[row][key_id] = value
                elif key_id >= n_attributes:
                    self.table._Y[row][key_id - n_attributes] = value
                else:
                    # Negative key_id: a meta. Write the single cell; the
                    # previous code rebound a table attribute to one scalar.
                    # NOTE(review): assumes the table keeps metas in
                    # `.metas`, indexed like X - confirm against Table.
                    self.table.metas[row][-1 - key_id] = value

        return Value(self._domain[key_id], value)

    def __str__(self):
        # TODO: Do something sensible here!
        return "Some LazyRowInstance"

    def in_filters(self, filters=None):
        """
        Return True if this row passes all the filters.

        filters defaults to self.table.row_filters. Evaluation stops at the
        first filter that rejects the row.
        """
        if filters is None:
            filters = self.table.row_filters

        return all(filter_(self) for filter_ in filters)

    def in_region_of_interest(self, region_of_interest=None):
        """
        Returns whether this instance is in a region of interest.

        region_of_interest defaults to self.table.region_of_interest. None
        means 'everything is interesting'. A Filter is called with this
        instance. Otherwise the region is treated as a dictionary like
            region_of_interest = {
               'attribute_name_1': (minimum_value, maximum_value),
               'attribute_name_2': (minimum_value, maximum_value),
            }
        and the instance is inside if every listed attribute lies within
        its (inclusive) bounds.
        This will probably change in the future. E.g. it might be more
        general to use an SQL WHERE clause or perhaps use the Filter class.

        E.g., the region_of_interest can be specified as SQL WHERE clause
        like
            region_of_interest_in_sql = " AND ".join(
                ''' "%s" BETWEEN %f AND %f ''' % (
                    name, values[0], values[1]
                ) for (name, values) in region_of_interest
            )

        TODO:
          - Add support for multiple regions of interest.
            What if there are multiple widgets, each with their own region
            of interest? Track regions_of_interest with some identifier?
            and remove the region of interest when the widget doesn't need
            it anymore?
        """
        # Try to get the region of interest from the data itself.
        if region_of_interest is None:
            region_of_interest = self.table.region_of_interest

        # By default there is no region of interest, which means that
        # 'everything is interesting'.
        if region_of_interest is None:
            return True

        if isinstance(region_of_interest, orange_filter.Filter):
            return region_of_interest(self)

        # Backwards compatibility with a dictionary as ROI.
        # TODO: Remove this at some point when a Filter is always used.
        in_region_parts = [
            minimum <= self[attribute_name] <= maximum
            for (attribute_name, (minimum, maximum)) in region_of_interest.items()
        ]
        return all(in_region_parts)
||
337 | |||
338 | |||
339 | |||
340 | class LazyTable(Table): |
||
0 ignored issues
–
show
|
|||
341 | # pylint: disable=too-many-ancestors |
||
342 | """ |
||
343 | LazyTable is a data structure derived from Table that doesn't |
||
344 | necessarily have all the data it represents directly available. |
||
345 | |||
346 | The LazyTable initially does not contain instantiated data. |
||
347 | However, the data can be accessed as if it were a normal Table. |
||
348 | Any data that is not yet available is retrieved from the widget |
||
349 | that created the LazyTable. |
||
350 | |||
351 | The widget_origin must be set to (and by) the widget that has |
||
352 | created this LazyTable instance. widget_origin is used to pull |
||
353 | data that is not yet available in this table. |
||
354 | |||
355 | TODO: |
||
356 | - Let _compute_basic_stats return sensible values. These usually |
||
357 | do not have to be exact, but this depends on the reason why |
||
358 | the stats are requested. |
||
359 | """ |
||
360 | |||
# The widget_origin has created this LazyTable. It is used to
# pull data that is not yet available (1).
#
||
364 | # Data pulling (1) might better be implemented in another way. At the |
||
365 | # moment, the LazyTable has to ask widget_origin for more data. It |
||
366 | # might be better if widget_origin tells the LazyTable instance how |
||
367 | # it should retrieve more data itself. That has two benefits: |
||
368 | # - the LazyTable instance is more self-contained and |
||
369 | # - it will be easier for widget_origin to have multiple outputs. |
||
370 | # |
||
371 | # TODO: Implement this 'teaching' of the LazyTable |
||
372 | widget_origin = None |
||
373 | |||
374 | # Or this LazyTable can be created from another LazyTable, by some |
||
375 | # widget like SelectingData. |
||
376 | table_origin = None |
||
377 | |||
378 | # row_mapping is a dictionary that maps other identifiers to rows of |
||
379 | # .X, .Y and .metas. This is necessary because the rows might be fetched |
||
380 | # in non-sequential order. That is, if row 10 is requested first (e.g. |
||
381 | # by table[10]), then the first row in X, Y and metas refers to row |
||
382 | # 10 in the table. |
||
383 | row_mapping = None |
||
384 | |||
385 | # region_of_interest specifies what part of the dataset is interesting |
||
386 | # according to widgets further in the scheme. See in_region_of_interest() |
||
387 | # of LazyRowInstance for information about its structure. |
||
388 | region_of_interest = None |
||
389 | |||
390 | # List of ValueFilters that should filter the rows. |
||
391 | # Similar to SQLTable. |
||
392 | row_filters = None |
||
393 | |||
394 | stop_pulling = False |
||
395 | |||
396 | |||
397 | debug_all_lazytables = [] |
||
398 | |||
399 | |||
def __init__(self, *args, **kwargs):
    """
    Initialize this LazyTable.

    Recognized keyword arguments are 'stop_pulling' and 'widget_origin';
    all arguments are also forwarded to the superclass initializer.
    Unless stop_pulling is set, background pulling is started right away.
    """
    # Register the instance in the class-level debugging list.
    self.debug_all_lazytables.append(self)

    # Nothing has been materialized, so there are no rows to map yet.
    self.row_mapping = {}

    # Only override the class default when the caller passed a value.
    if 'stop_pulling' in kwargs:
        self.stop_pulling = kwargs['stop_pulling']

    super().__init__(*args, **kwargs)

    # row_filters is used like in SqlTable.
    self.row_filters = ()

    self.widget_origin = kwargs.get('widget_origin')

    # This name is used for example in the Predictions widget.
    self.name = "A LazyTable"

    if self.stop_pulling:
        return
    self.pull_in_the_background()
||
427 | |||
428 | |||
429 | |||
def _fetch_all_values_for_row(self, row):
    """
    Access every variable of self.domain on the given row.

    This should not be necessary, but in practice it is: widgets that are
    not lazy-aware read the numpy arrays directly, and any NaN left in
    there will cause problems.
    """
    for position, variable in enumerate(self.domain):
        # Hand both the index and the Variable to __getitem__ so that it
        # does not have to look either of them up again. The returned value
        # is deliberately discarded.
        row.__getitem__(key=position, key_id=position, key_var=variable)
||
443 | |||
444 | |||
def __getitem__(self, index_row, region_of_interest_only=False):
    # pylint: disable=too-many-ancestors, too-many-branches, arguments-differ
    """
    Get a row (or a sub-table) of the table.

    An integer index_row (any numbers.Integral, including NumPy integer
    scalars) refers to index_row_full, the row identifier of the full
    dataset, and yields a LazyRowInstance:
      - a row already present in row_mapping is wrapped directly;
      - otherwise, with a widget_origin, the row is created and all its
        values are prefetched;
      - otherwise, with a table_origin, the row is taken from that table
        (the row_index_full-th row passing self.row_filters when filters
        are set), appended to this table and registered in row_mapping.

    A boolean or integer numpy array is passed on to the superclass
    __getitem__ and is assumed to refer to the materialized rows.

    When region_of_interest_only is set, then the row is only stored
    in the table if it's in the region_of_interest. It should only be
    necessary to set this flag internally.

    Raises IndexError if the index is not smaller than len_full_data() or
    if the filtered table_origin has too few matching rows, and
    NotImplementedError for slices and for rows that are not materialized
    while neither widget_origin nor table_origin is set.
    """
    # TODO: Refactor this function further?
    if isinstance(index_row, Integral):
        # Normalize NumPy integer scalars to a plain int.
        row_index_full = int(index_row)

        # TODO: The len_full_data() is not yet implemented for
        #   tables with .table_origin and this check should therefore
        #   not be implemented here!
        # This raise makes it possible to use the LazyTable as an
        # iterator, e.g. in Table.save().
        if row_index_full >= self.len_full_data():
            raise IndexError

        if self.row_mapping.get(row_index_full, None) is not None:
            # Already materialized.
            # TODO: or row_index_materialized here?
            return LazyRowInstance(self, row_index_full, region_of_interest_only=region_of_interest_only)

        if self.widget_origin is not None:
            # The pulling logic is currently implemented in
            # LazyRowInstance. Prefetch all the attributes for simplicity.
            row = LazyRowInstance(self, row_index_full, region_of_interest_only=region_of_interest_only)
            self._fetch_all_values_for_row(row)
            return row

        if self.table_origin is None:
            raise NotImplementedError

        if not self.row_filters:
            # The rows of this table are the same as the table_origin,
            # therefore we don't need to loop through the rows.
            # The columns might be different though, this is handled
            # by RowInstance?
            origin_row = self.table_origin[row_index_full]
        else:
            # The rows of this table might be different from the
            # table_origin: count the rows that pass the filters until
            # the requested one is reached.
            # TODO: Start as far as possible into table_origin instead
            #   of at the beginning. However, this is only possible if
            #   we would have kept the row_index_full of the original
            #   table, because that would tell us where to start..
            #   That is, we need instance_identifier_global !
            matches_seen = 0
            for origin_row in self.table_origin:
                if origin_row.in_filters(self.row_filters):
                    matches_seen += 1
                    # After the increment matches_seen is a count, so the
                    # row with full index i is found when it exceeds i.
                    if matches_seen > row_index_full:
                        break
            else:
                # Went through all the rows in origin_table, no dice..
                raise IndexError

        self._adopt_origin_row(origin_row, row_index_full)
        # A full RowInstance can now be initialized because the row
        # is indeed available in the table.
        return LazyRowInstance(self, row_index_full, region_of_interest_only=region_of_interest_only)

    # TODO: See documentation of tabular data classes to determine
    #   the proper implementation of the next two cases.
    if isinstance(index_row, numpy.ndarray):
        # The argument can either be a mask or a list of row indexes.
        is_mask = index_row.dtype == numpy.dtype('bool')
        # Accept every integer dtype, not just int64 (e.g. int32/intp).
        is_index_array = numpy.issubdtype(index_row.dtype, numpy.integer)
        if is_mask or is_index_array:
            # A mask can only refer to the materialized rows because it is
            # not feasible to create a mask for the full dataset; index
            # arrays are assumed to be based on the materialized rows as
            # well (this seems to be what the Test Learners widget uses).
            # Therefore a normal Table from the superclass should work.
            # TODO: However, technically it should be possible to create
            #   such a mask where every item corresponds to the
            #   row_index_full of an instance. This would cause problems
            #   with e.g. a LazyFile that is materialized completely,
            #   but not in the original order. It should be checked
            #   whether this situation can actually occur anywhere in
            #   the code base.
            # TODO: It might be useful to return a LazyTable even though
            #   it would not be necessary because the LazyTable might
            #   offer other benefits. E.g. the LazyTable might be
            #   useful for linked-views or so.
            # TODO: Answer with certainty whether index arrays hold
            #   materialized or full row indices.
            return super().__getitem__(index_row)
    elif isinstance(index_row, slice):
        # TODO: decide whether these are materialized or full row_indices
        #   and slice the table. Probably need to return a new table?
        raise NotImplementedError("Slicing of LazyTables is not yet supported.")

    # NOTE(review): any other kind of key (and arrays of other dtypes) is
    # not handled and yields None, as before.
    return None


def _adopt_origin_row(self, row, row_index_full):
    """Append a row taken from table_origin to this table and map it."""
    row.table = self
    row.row_index_full = row_index_full
    # The new row goes to the end of the materialized data.
    row.row_index_materialized = self.len_instantiated_data()
    row.row_index = row.row_index_materialized
    self.append(row)
    self.row_mapping[row.row_index_full] = row.row_index_materialized
||
580 | |||
def copy(self, stop_pulling=None):
    """
    Create a copy of this LazyTable.

    The copy is a new LazyTable built from self.domain whose
    .table_origin is set to self. Its .stop_pulling is taken from self
    unless stop_pulling is given explicitly.
    It is advised to set stop_pulling to off.
    """
    # pylint: disable=arguments-differ
    # TODO: Support both these cases in some way?:
    #   duplicate.table_origin = self
    #   duplicate.widget_origin = self.widget_origin
    duplicate = LazyTable.from_domain(self.domain)
    if stop_pulling is None:
        duplicate.stop_pulling = self.stop_pulling
    else:
        duplicate.stop_pulling = stop_pulling
    duplicate.table_origin = self
    return duplicate
||
596 | |||
@classmethod
def from_table(cls, domain, source, row_indices=...):
    """
    Create a new table from selected columns and/or rows of an existing
    one. The columns are chosen using a domain. The domain may also include
    variables that do not appear in the source table; they are computed
    from source variables if possible.

    The resulting data may be a
    - new LazyTable if source is a LazyTable, domain contains only
      variables of the source and row_indices is not specified.
      This should ensure that the SelectAttributes widget works.
    - a normal Table otherwise, which could apparently be a view or a copy
      of the existing data. However, what happens with a view of
      growing data is unknown.

    A row selection is never dropped: when row_indices is given, the
    request is handed to Table.from_table even for a LazyTable source.

    :param domain: the domain for the new table
    :type domain: Orange.data.Domain
    :param source: the source table
    :type source: Orange.data.Table
    :param row_indices: indices of the rows to include
    :type row_indices: a slice or a sequence
    :return: a new table
    :rtype: Orange.data.Table
    """
    # TODO: Improve the lazyness support for other cases?
    # TODO: Investigate this computing of new variables.
    subdomain = all(v in source.domain for v in domain)
    all_rows = row_indices is Ellipsis

    if isinstance(source, LazyTable) and subdomain and all_rows:
        table_new = LazyTable.from_domain(domain)
        table_new.stop_pulling = True  # Should only be done by first LazyTable?
        table_new.table_origin = source
        # Fill the table with the rows that were already materialized.
        # TODO: Do something smarter here?
        # Iterate over a snapshot of the keys to prevent
        # RuntimeError: dictionary changed size during iteration
        for row_index_full in list(source.row_mapping):
            row = table_new[row_index_full]
            for variable in table_new.domain:
                # Accessing the value is what copies it into table_new;
                # the value itself is not needed here.
                row[variable]  # pylint: disable=pointless-statement
    else:
        table_new = Table.from_table(
            domain=domain,
            source=source,
            row_indices=row_indices,
        )

    return table_new
||
646 | |||
647 | |||
def _filter_values(self, f):
    """
    Return a copy of this table with the filter f added to its row_filters.

    The copy is created with stop_pulling=True, and f is also installed as
    the region of interest of this (the original) table.
    """
    # Work on a private copy of f because e.g. SelectData will negate it etc.
    private_filter = copy.deepcopy(f)
    # We need to prevent pulling in the new LazyTable.
    # TODO: Actually, we need to call it 'stop_pushing' or so?
    filtered = self.copy(stop_pulling=True)
    filtered.row_filters = filtered.row_filters + (private_filter,)  # pylint: disable=no-member
    # Apparently there is specific interest for this region, so the
    # region_of_interest is set to this filter.
    # TODO: Support for multiple regions of interest would be nice.
    self.set_region_of_interest(f)
    return filtered
||
661 | |||
662 | # TODO: Implement _filter_random |
||
def _filter_random(self, prob, negate=False):
    """
    Random filtering is not available for a LazyTable yet.

    Always raises NotImplementedError; prob and negate are ignored.
    """
    raise NotImplementedError
||
665 | |||
def pull_region_of_interest(self):
    """
    Request data for the region of interest.

    The request is delegated to widget_origin when that is set, otherwise
    to table_origin when that is set. With neither, nothing happens.
    """
    origin = self.widget_origin
    if origin is None:
        origin = self.table_origin
    if origin is None:
        return
    origin.pull_region_of_interest()
||
674 | |||
def pull_in_the_background(self):
    """
    Keep pulling data in the background.

    As long as stop_pulling is not set and the main thread is alive, the
    region of interest is pulled and this method is scheduled again with a
    threading.Timer of 10 seconds. The condition is evaluated anew before
    scheduling.

    TODO:
      - Stop pulling when running out of memory. Perhaps start deleting rows
        that are not in the region of interest?
      - Continue to pull data outside the region_of_interest when we got
        all of that?
    """
    def should_continue():
        # Re-read the flag and the thread state on every call.
        return (not self.stop_pulling) and threading.main_thread().is_alive()

    if should_continue():
        self.pull_region_of_interest()
    if should_continue():
        timer = threading.Timer(10, self.pull_in_the_background)
        timer.start()
||
690 | |||
691 | |||
692 | |||
def __str__(self):
    """
    Return a multi-line, human-readable summary of this LazyTable.

    Overloaded because table.__str__ performs slicing which is not yet
    supported.

    The summary lists the id of the table, its full and materialized
    length, the stop_pulling flag, the conditions of the region of
    interest (None when no region is set) and the conditions of every
    row filter.
    """
    roi = self.region_of_interest
    roi_conditions = roi.conditions if roi is not None else None
    filter_conditions = [rf.conditions for rf in self.row_filters]
    # Every value is wrapped in a one-element tuple. A bare value on the
    # right-hand side of '%' is unpacked when it happens to be a tuple,
    # which raises a TypeError or formats only a part of it.
    lines = [
        "LazyTable %s" % (id(self),),
        "- full length: %s" % (self.len_full_data(),),
        "- materialized length: %s" % (self.len_instantiated_data(),),
        "- stop_pulling: %s" % (self.stop_pulling,),
        "- roi: %s" % (roi_conditions,),
        "- row_filters: %s" % (filter_conditions,),
    ]
    return "\n".join(lines)

# A __repr__ is needed for the interactive Python Script widget.
__repr__ = __str__
||
711 | |||
712 | |||
def checksum(self, include_metas=True):
    """
    Return a random integer instead of a real checksum.

    Overloaded because widgets might check whether the data they are sent
    has the same checksum as the data they already have. However, the
    lazy widgets keep sending the same data instance, except with more
    data, so those checking widgets would compare the same object with
    itself.

    include_metas is accepted but not used.

    TODO: find a proper solution to this, because the legitimate uses
    of checksum are also disabled.
    """
    exclusive_upper_bound = 10000000
    return numpy.random.randint(exclusive_upper_bound)
||
724 | |||
725 | |||
def row_mapping_full_from_materialized(self):
    # pylint: disable=invalid-name
    """
    Invert the row mapping.

    Returns a new dictionary in which every value of self.row_mapping
    points back to its key. self.row_mapping itself is left untouched.
    """
    inverse = {}
    for key, value in self.row_mapping.items():
        inverse[value] = key
    return inverse
||
733 | |||
def set_region_of_interest(self, region_of_interest):
    """
    Store the region of interest and pass it on to the origins.

    A region of interest has been indicated, probably by the user.
    It is propagated to the widget providing the data (when that widget
    has a set_region_of_interest method) and to the table this table
    originates from (when that is a LazyTable), so more data can be
    fetched for this region.
    """
    # TODO: Add support for multiple concurrent regions of interest,
    # e.g. one for each widget this lazytable is sent to.
    self.region_of_interest = region_of_interest

    # TODO: Perhaps make a LazyWidget base class to isinstance against.
    widget = self.widget_origin
    if widget and hasattr(widget, 'set_region_of_interest'):
        widget.set_region_of_interest(region_of_interest)

    parent_table = self.table_origin
    if isinstance(parent_table, LazyTable):
        parent_table.set_region_of_interest(region_of_interest)
||
748 | |||
def len_full_data(self):
    """
    Return the full length of the dataset.

    Not all of this data might be initialized. This length can be
    unknown, if the full data set has not yet been derived. This length
    can also be infinite, in case an infinite generator is used to
    create this lazy table.

    The length is pulled from widget_origin when that is set, otherwise
    taken from table_origin, and otherwise read from the first
    dimension of X.
    """
    if self.widget_origin is not None:
        return self.widget_origin.pull_length()

    if self.table_origin is None:
        # Need to do something.
        return self.X.shape[0]

    # TODO: The below is incorrect. Either
    # - Iterate through all rows and get the result. This materializes
    #   the entire table.
    # - Calculate the len_full_data().
    # - Iterate through the full table without caching the result.
    #   Cannot be done for very large tables.
    # - Raise exception if not all rows have been instantiated yet.
    # - ??
    return self.table_origin.len_full_data()

approx_len = len_full_data
||
775 | |||
def len_instantiated_data(self):
    """
    Return the length of the instantiated data, that is len(self.X).

    This is the data that is directly available to the widgets. The rest
    of the data can still be requested by accessing it though.
    """
    return len(self.X)
||
784 | |||
# Selects what __len__ reports: the length of the instantiated data when
# True, the length of the full data set when False.
take_len_of_instantiated_data = True

def __len__(self):
    """
    Return the length of the table as selected by
    take_len_of_instantiated_data.

    There are two versions of len(), one to get the size of the dataset
    irrespective of how much of it is already available in python and
    one to get the size of the available data.

    The append() and insert() functions are used to add newly
    instantiated rows to the already instantiated data. These should use
    the instantiated data length and not the full one.
    """
    if self.take_len_of_instantiated_data:
        return self.len_instantiated_data()
    return self.len_full_data()
||
799 | |||
def extend(self, instances):
    """
    Hack to concatenate LazyTables.

    Extends the table like the parent class does and afterwards replaces
    row_mapping by an identity mapping over all instantiated rows.
    """
    # TODO: Properly implement this, think about what it means for
    # LazyTables to be extended.
    # TODO: What about rowmapping?
    # TODO: How to test domains for equality??

    # For now just extend like a normal Table.
    super().extend(instances)

    # Hack for row_mapping so OWTable works with OWSAMP.
    # This destroys all other use of the LazyTable.
    row_count = self.len_instantiated_data()
    self.row_mapping = dict(zip(range(row_count), range(row_count)))
||
816 | |||
817 | |||
def has_weights(self):
    """
    Report whether the data instances are weighted.

    Hacked: this implementation returns False unconditionally.
    """
    return False
||
824 | |||
825 | # TODO: Provide some functionality here. E.g. to use instead of |
||
826 | # get_statistics() in owsgd.py. |
||
827 | #def compute_basic_stats(self, include_metas=None): |
||
828 | # """ |
||
829 | # _compute_basic_stats should return stats based on the full table, |
||
830 | # irrespective of what is currently materialized. It can only do this |
||
831 | # by pulling these statistics. There is no functionality to do that |
||
832 | # at the moment. Therefore this function provides some fake statistics. |
||
833 | # However, since the lazy widgets should never send an entirely empty |
||
834 | # table it should be possible to get decent statistics from the few |
||
835 | # materialized rows, making this overloading superfluous. |
||
836 | # |
||
837 | # _compute_basic_stats is faked. |
||
838 | # |
||
839 | # Returns a 6-element tuple or an array of shape (len(x), 6) |
||
840 | # Computed (min, max, mean, 0, #nans and #non-nans) |
||
841 | # |
||
842 | # TODO: Pull these statistics. |
||
843 | # """ |
||
844 | # stats = [(-9000, 9000, 0.0, 0, 0, len(self))] * len(self.domain) |
||
845 | # return stats |
||
846 | |||
def __del__(self):
    """
    Set stop_pulling when the table is finalized, which is the flag
    pull_in_the_background checks before pulling and rescheduling.
    """
    self.stop_pulling = True
||
849 | |||
850 | # TODO Figure out wether we can do without a separate class. |
||
def __iter__(self):
    """
    Return a new LazyTableIterator over this table.
    """
    iterator = LazyTableIterator(self)
    return iterator
||
853 | |||
854 | # TODO Figure out wether we can do without a separate class. A more |
||
855 | # pythonic way. |
||
856 | # See discussion: https://docs.python.org/3/glossary.html#term-iterator |
||
class LazyTableIterator(collections.abc.Iterator):
    """
    Iterator to iterate over LazyTables. This allows widgets to iterate over
    the LazyTable in chunks without interfering with other widgets.

    Every iterator keeps its own position (current_index), and only
    yields the instances that are in the region of interest.

    See owsgd.py for an example.
    """
    # pylint: disable=too-few-public-methods
    def __init__(self, lazy_table):
        """
        Start iterating over lazy_table at row index 0.
        """
        self.current_index = 0
        self.lazy_table = lazy_table

    def __iter__(self):
        """
        Return the iterator itself.
        """
        return self

    # TODO: Fix ROI. E.g. through Filter so we don't need this loop.
    # Or the loop becomes trivial.
    def __next__(self):
        """
        Return the next instance that is in the region of interest.

        Instances outside the region of interest are skipped. Raises
        StopIteration as soon as the table raises an IndexError for the
        current index.
        """
        while True:
            # TODO: Somehow wait to see if more data will be available later?
            try:
                instance = self.lazy_table.__getitem__(
                    self.current_index, region_of_interest_only=True)
            except IndexError:
                # The IndexError only signals the end of the data, so it
                # is not chained to the StopIteration.
                raise StopIteration from None
            # The index advances for skipped instances as well.
            self.current_index += 1
            if instance.in_region_of_interest():
                return instance
||
893 |
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing __init__.py files in your module folders. Make sure that you place one such file in each sub-folder.