Passed
Push — master ( 523b12...22d804 )
by Matt
56s
created

BTreePage.recover_freeblock_records()   D

Complexity

Conditions 8

Size

Total Lines 81

Duplication

Lines 0
Ratio 0 %

Importance

Changes 4
Bugs 0 Features 0
Metric Value
cc 8
c 4
b 0
f 0
dl 0
loc 81
rs 4.659

How to fix: Long Method

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# MIT License
2
#
3
# Copyright (c) 2017 Matt Boyer
4
#
5
# Permission is hereby granted, free of charge, to any person obtaining a copy
6
# of this software and associated documentation files (the "Software"), to deal
7
# in the Software without restriction, including without limitation the rights
8
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
# copies of the Software, and to permit persons to whom the Software is
10
# furnished to do so, subject to the following conditions:
11
#
12
# The above copyright notice and this permission notice shall be included in
13
# all copies or substantial portions of the Software.
14
#
15
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
# SOFTWARE.
22
23
import pdb
24
import struct
25
26
from . import _LOGGER
27
from .record import (Record, MalformedRecord)
28
from .tuples import SQLite_btree_page_header
29
from .utils import (Varint, IndexDict)
30
31
32
class Page(object):
    """A raw page of an SQLite database file, addressed by its 1-based index."""

    def __init__(self, page_idx, db):
        self._page_idx = page_idx
        self._db = db
        self._bytes = db.page_bytes(self.idx)

    @property
    def idx(self):
        """1-based index of this page within the database file."""
        return self._page_idx

    @property
    def usable_size(self):
        """Page size minus the reserved region at the tail of each page."""
        db_header = self._db.header
        return db_header.page_size - db_header.reserved_length

    def __bytes__(self):
        return self._bytes

    @property
    def parent(self):
        """Parent page per the pointer map, or None when absent or root."""
        try:
            parent_idx = self._db.ptrmap[self.idx].page_ptr
        except KeyError:
            return None
        # A zero back-pointer means this page has no parent
        return self._db.pages[parent_idx] if parent_idx else None

    def __repr__(self):
        return "<SQLite Page {0}>".format(self.idx)
63
64
65
class FreelistTrunkPage(Page):
    """A trunk page of the database freelist, holding pointers to leaf pages."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?

    def __init__(self, page_idx, db, leaves):
        super().__init__(page_idx, db)
        self._leaves = leaves

    def __repr__(self):
        leaf_count = len(self._leaves)
        return "<SQLite Freelist Trunk Page {0}: {1} leaves>".format(
            self.idx, leaf_count
        )
76
77
78
class FreelistLeafPage(Page):
    """A leaf page of the database freelist, linked back to its trunk page."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?

    def __init__(self, page_idx, db, trunk_idx):
        super().__init__(page_idx, db)
        # Resolve the trunk index to its Page object up front
        self._trunk = self._db.pages[trunk_idx]

    def __repr__(self):
        return "<SQLite Freelist Leaf Page {0}. Trunk: {1}>".format(
            self.idx, self._trunk.idx
        )
89
90
91
class PtrmapPage(Page):
    """A pointer-map page, carrying back-pointer entries for other pages."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?

    def __init__(self, page_idx, db, ptr_array):
        super().__init__(page_idx, db)
        self._pointers = ptr_array

    @property
    def pointers(self):
        """The array of pointer-map entries stored on this page."""
        return self._pointers

    def __repr__(self):
        pointer_count = len(self.pointers)
        return "<SQLite Ptrmap Page {0}. {1} pointers>".format(
            self.idx, pointer_count
        )
106
107
108
class OverflowPage(Page):
    """An overflow page continuing a payload that did not fit on its B-Tree page."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?

    def __init__(self, page_idx, db):
        super().__init__(page_idx, db)
        self._parse()

    def _parse(self):
        # TODO We should have parsing here for the next page index in the
        # overflow chain
        pass

    def __repr__(self):
        return "<SQLite Overflow Page {0}. Continuation of {1}>".format(
            self.idx, self.parent.idx
        )
124
125
126
class BTreePage(Page):
    """A B-Tree page (table/index, interior/leaf).

    Parses the B-Tree page header and cell pointer array on construction,
    and exposes methods to parse cells and freeblocks and to attempt
    best-effort recovery of deleted records from freeblock bytes.
    """

    # Page-type byte values from the B-Tree page header
    btree_page_types = {
        0x02:   "Index Interior",
        0x05:   "Table Interior",
        0x0A:   "Index Leaf",
        0x0D:   "Table Leaf",
    }

    def __init__(self, page_idx, db, heuristics):
        # XXX We don't know a page's type until we've had a look at the header.
        # Or do we?
        super().__init__(page_idx, db)
        self._heuristics = heuristics
        # Leaf pages have an 8-byte header; interior pages have 12 bytes.
        # Start with 8 and re-read below once the page type is known.
        self._header_size = 8
        page_header_bytes = self._get_btree_page_header()
        self._btree_header = SQLite_btree_page_header(
            # Set the right-most page index to None in the 1st pass
            *struct.unpack(r'>BHHHB', page_header_bytes), None
        )
        self._cell_ptr_array = []
        self._freeblocks = IndexDict()
        self._cells = IndexDict()
        self._recovered_records = set()
        # Payloads larger than usable_size - 35 spill into overflow pages
        self._overflow_threshold = self.usable_size - 35

        if self._btree_header.page_type not in BTreePage.btree_page_types:
            raise ValueError(
                "Unexpected B-Tree page type: {0!r}".format(
                    self._btree_header.page_type
                )
            )

        # Interior pages have a twelve-byte header (the extra four bytes
        # hold the right-most child pointer); we need to read it again.
        if self._btree_header.page_type in (0x02, 0x05):
            self._header_size = 12
            page_header_bytes = self._get_btree_page_header()
            self._btree_header = SQLite_btree_page_header(*struct.unpack(
                r'>BHHHBI', page_header_bytes
            ))

        # Page 1 (and page 2, but that's the 1st ptrmap page) does not have a
        # ptrmap entry.
        # The first ptrmap page will contain back pointer information for pages
        # 3 through J+2, inclusive.
        if self._db.ptrmap:
            if self.idx >= 3 and self.idx not in self._db.ptrmap:
                _LOGGER.warning(
                    "BTree page %d doesn't have ptrmap entry!", self.idx
                )

        if self._btree_header.num_cells > 0:
            cell_ptr_bytes = self._get_btree_ptr_array(
                self._btree_header.num_cells
            )
            self._cell_ptr_array = struct.unpack(
                r'>{count}H'.format(count=self._btree_header.num_cells),
                cell_ptr_bytes
            )
            smallest_cell_offset = min(self._cell_ptr_array)
            if self._btree_header.cell_content_offset != smallest_cell_offset:
                _LOGGER.warning(
                    (
                        "Inconsistent cell ptr array in page %d! Cell content "
                        "starts at offset %d, but min cell pointer is %d"
                    ),
                    self.idx,
                    self._btree_header.cell_content_offset,
                    smallest_cell_offset
                )

    @property
    def btree_header(self):
        """The parsed B-Tree page header tuple."""
        return self._btree_header

    @property
    def page_type(self):
        """Human-readable page type. Raises KeyError for unknown type bytes."""
        try:
            return self.btree_page_types[self._btree_header.page_type]
        except KeyError:
            # Bug fix: removed a stray pdb.set_trace() debugger breakpoint
            # that would hang non-interactive runs.
            _LOGGER.warning(
                "Unknown B-Tree page type: %d", self._btree_header.page_type
            )
            raise

    @property
    def freeblocks(self):
        """Mapping of freeblock offset -> freeblock content bytes."""
        return self._freeblocks

    @property
    def cells(self):
        """Mapping of cell index -> parsed cell tuple."""
        return self._cells

    def __repr__(self):
        # TODO Include table in repr, where available
        return "<SQLite B-Tree Page {0} ({1}) {2} cells>".format(
            self.idx, self.page_type, len(self._cell_ptr_array)
        )

    @property
    def table(self):
        """The table this page belongs to, as resolved by the database."""
        return self._db.get_page_table(self.idx)

    def _get_btree_page_header(self):
        """Return the raw B-Tree header bytes for this page."""
        header_offset = 0
        # Page 1 begins with the 100-byte database file header
        if self.idx == 1:
            header_offset += 100
        return bytes(self)[header_offset:self._header_size + header_offset]

    def _get_btree_ptr_array(self, num_cells):
        """Return the raw bytes of the 2-byte-per-cell pointer array."""
        array_offset = self._header_size
        if self.idx == 1:
            array_offset += 100
        return bytes(self)[array_offset:2 * num_cells + array_offset]

    def parse_cells(self):
        """Parse this page's cells (table pages only), then its freeblocks."""
        if self.btree_header.page_type == 0x05:
            self.parse_table_interior_cells()
        elif self.btree_header.page_type == 0x0D:
            self.parse_table_leaf_cells()
        self.parse_freeblocks()

    def parse_table_interior_cells(self):
        """Parse (left child pointer, rowid) cells on a table interior page."""
        assert self.btree_header.page_type == 0x05

        _LOGGER.debug("Parsing cells in table interior cell %d", self.idx)
        for cell_idx, offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, offset)
            left_ptr_bytes = bytes(self)[offset:offset + 4]
            left_ptr, = struct.unpack(r'>I', left_ptr_bytes)

            offset += 4
            # Rowid is a varint of at most 9 bytes
            integer_key = Varint(bytes(self)[offset:offset+9])
            self._cells[cell_idx] = (left_ptr, int(integer_key))

    def _read_overflow_chain(self, first_oflow_idx, bytes_needed):
        """Follow an overflow-page chain and return up to bytes_needed bytes.

        Each overflow page begins with a 4-byte big-endian pointer to the
        next page in the chain; a zero pointer terminates the chain.
        """
        overflow_bytes = bytes()
        next_oflow_idx = first_oflow_idx
        while next_oflow_idx != 0:
            oflow_page_bytes = self._db.page_bytes(next_oflow_idx)

            # Bug fix: the remaining byte count must shrink as bytes are
            # collected (minus len(overflow_bytes), not plus), otherwise the
            # final page of the chain over-collects trailing garbage.
            len_overflow = min(
                len(oflow_page_bytes) - 4,
                bytes_needed - len(overflow_bytes)
            )
            overflow_bytes += oflow_page_bytes[4:4 + len_overflow]

            first_four_bytes = oflow_page_bytes[:4]
            next_oflow_idx, = struct.unpack(r'>I', first_four_bytes)
        return overflow_bytes

    def parse_table_leaf_cells(self):
        """Parse (rowid, Record) cells on a table leaf page.

        Handles payload overflow by following the overflow-page chain.
        Raises TypeError if a record cannot be instantiated.
        """
        assert self.btree_header.page_type == 0x0D

        _LOGGER.debug("Parsing cells in table leaf cell %d", self.idx)
        for cell_idx, cell_offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, cell_offset)

            # This is the total size of the payload, which may include overflow
            offset = cell_offset
            payload_length_varint = Varint(bytes(self)[offset:offset+9])
            total_payload_size = int(payload_length_varint)

            overflow = False
            # Let X be U-35. If the payload size P is less than or equal to X
            # then the entire payload is stored on the b-tree leaf page. Let M
            # be ((U-12)*32/255)-23 and let K be M+((P-M)%(U-4)). If P is
            # greater than X then the number of bytes stored on the table
            # b-tree leaf page is K if K is less or equal to X or M otherwise.
            # The number of bytes stored on the leaf page is never less than M.
            cell_payload_size = 0
            if total_payload_size > self._overflow_threshold:
                m = int(((self.usable_size - 12) * 32/255)-23)
                k = m + ((total_payload_size - m) % (self.usable_size - 4))
                if k <= self._overflow_threshold:
                    cell_payload_size = k
                else:
                    cell_payload_size = m
                overflow = True
            else:
                cell_payload_size = total_payload_size

            offset += len(payload_length_varint)

            integer_key = Varint(bytes(self)[offset:offset+9])
            offset += len(integer_key)

            overflow_bytes = bytes()
            if overflow:
                # The 4-byte pointer to the first overflow page follows the
                # on-page portion of the payload
                first_oflow_page_bytes = bytes(self)[
                    offset + cell_payload_size:offset + cell_payload_size + 4
                ]
                if not first_oflow_page_bytes:
                    continue

                first_oflow_idx, = struct.unpack(
                    r'>I', first_oflow_page_bytes
                )
                overflow_bytes = self._read_overflow_chain(
                    first_oflow_idx,
                    total_payload_size - cell_payload_size
                )

            try:
                cell_data = bytes(self)[offset:offset + cell_payload_size]
                if overflow_bytes:
                    cell_data += overflow_bytes

                # All payload bytes should be accounted for
                assert len(cell_data) == total_payload_size

                record_obj = Record(cell_data)
                _LOGGER.debug("Created record: %r", record_obj)

            except TypeError as ex:
                # Bug fix: removed a stray pdb.set_trace() debugger breakpoint
                _LOGGER.warning(
                    "Caught %r while instantiating record %d",
                    ex, int(integer_key)
                )
                raise

            self._cells[cell_idx] = (int(integer_key), record_obj)

    def parse_freeblocks(self):
        """Walk the freeblock chain and store each freeblock's content bytes.

        The first 2 bytes of a freeblock are a big-endian integer which is
        the offset in the b-tree page of the next freeblock in the chain, or
        zero if the freeblock is the last on the chain. The third and fourth
        bytes of each freeblock form a big-endian integer which is the size
        of the freeblock in bytes, including the 4-byte header. Freeblocks
        are always connected in order of increasing offset. The second field
        of the b-tree page header is the offset of the first freeblock, or
        zero if there are no freeblocks on the page. In a well-formed b-tree
        page, there will always be at least one cell before the first
        freeblock.

        TODO But what about deleted records that exceeded the overflow
        threshold in the past?
        """
        block_offset = self.btree_header.first_freeblock_offset
        while block_offset != 0:
            freeblock_header = bytes(self)[block_offset:block_offset + 4]
            # Freeblock_size includes the 4-byte header
            next_freeblock_offset, freeblock_size = struct.unpack(
                r'>HH',
                freeblock_header
            )
            # Bug fix: the content runs from just past the 4-byte header to
            # block_offset + freeblock_size. The previous end bound of
            # block_offset + freeblock_size - 4 dropped the last 4 bytes of
            # every freeblock.
            freeblock_bytes = bytes(self)[
                block_offset + 4:block_offset + freeblock_size
            ]
            self._freeblocks[block_offset] = freeblock_bytes
            block_offset = next_freeblock_offset

    def print_cells(self):
        """Log every parsed cell and its record fields."""
        for cell_idx in self.cells.keys():
            rowid, record = self.cells[cell_idx]
            _LOGGER.info(
                "Cell %d, rowid: %d, record: %r",
                cell_idx, rowid, record
            )
            record.print_fields(table=self.table)

    def recover_freeblock_records(self, grouping):
        """Attempt to recover deleted records from this page's freeblocks.

        If we're lucky (i.e. if no overwriting has taken place), we should be
        able to find whole record headers in freeblocks.
        We need to start from the end of the freeblock and work our way back
        to the start. That means we don't know where a cell header will
        start, but I suppose we can take a guess.
        """
        if not self.table:
            return

        try:
            table_heuristic = self._heuristics.get_heuristic(
                self.table, grouping
            )
        except ValueError as ex:
            _LOGGER.error(str(ex))
            return

        _LOGGER.info(
            "Using heuristic %r on table \"%s\"",
            table_heuristic, self.table,
        )

        _LOGGER.info("Attempting to recover records from freeblocks")
        for freeblock_idx, freeblock_offset in enumerate(self._freeblocks):
            freeblock_bytes = self._freeblocks[freeblock_offset]
            if 0 == len(freeblock_bytes):
                continue
            _LOGGER.debug(
                "Freeblock %d/%d in page, offset %d, %d bytes",
                1 + freeblock_idx,
                len(self._freeblocks),
                freeblock_offset,
                len(freeblock_bytes)
            )

            # Extracted helper keeps this method short (was flagged as a
            # Long Method) and makes the per-freeblock work testable
            recovered_in_freeblock, recovered_bytes = (
                self._recover_records_in_freeblock(
                    freeblock_bytes, table_heuristic
                )
            )

            _LOGGER.info(
                (
                    "Recovered %d record(s): %d bytes out of %d "
                    "freeblock bytes @ offset %d"
                ),
                recovered_in_freeblock,
                recovered_bytes,
                len(freeblock_bytes),
                freeblock_offset,
            )

    def _recover_records_in_freeblock(self, freeblock_bytes, table_heuristic):
        """Try every candidate header offset yielded by the heuristic.

        Well-formed candidates are truncated to their actual length and added
        to the recovered-records set. Returns a (record count, byte count)
        tuple for what was recovered from this freeblock.
        """
        recovered_bytes = 0
        recovered_in_freeblock = 0

        # TODO Maybe we need to guess the record header lengths rather than
        # try and read them from the freeblocks
        for header_start in table_heuristic(freeblock_bytes):
            _LOGGER.debug(
                (
                    "Trying potential record header start at "
                    "freeblock offset %d/%d"
                ),
                header_start, len(freeblock_bytes)
            )
            _LOGGER.debug("%r", freeblock_bytes)
            try:
                # We don't know how to handle overflow in deleted records,
                # so we'll have to truncate the bytes object used to
                # instantiate the Record object
                record_bytes = freeblock_bytes[
                    header_start:header_start+self._overflow_threshold
                ]
                record_obj = Record(record_bytes)
            except MalformedRecord:
                # This isn't a well-formed record, let's move to the next
                # candidate
                continue

            field_lengths = sum(
                len(field_obj) for field_obj in record_obj.fields.values()
            )
            record_obj.truncate(field_lengths + len(record_obj.header))
            self._recovered_records.add(record_obj)

            recovered_bytes += len(bytes(record_obj))
            recovered_in_freeblock += 1

        return recovered_in_freeblock, recovered_bytes

    @property
    def recovered_records(self):
        """The set of Record objects recovered from freeblocks so far."""
        return self._recovered_records

    def print_recovered_records(self):
        """Log every recovered record, its header, and its fields."""
        if not self._recovered_records:
            return

        for record_obj in self._recovered_records:
            _LOGGER.info("Recovered record: %r", record_obj)
            _LOGGER.info("Recovered record header: %s", record_obj.header)
            record_obj.print_fields(table=self.table)
478