# MIT License
#
# Copyright (c) 2017 Matt Boyer
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
23
import struct
24
25
from . import _LOGGER
26
from .record import (Record, MalformedRecord)
27
from .tuples import SQLite_btree_page_header
28
from .utils import (Varint, IndexDict)
29
30
class Page(object):
    """Base wrapper for a single raw page of an SQLite database file.

    Holds the page index, a reference to the owning database object and
    the page's raw bytes as fetched from the database.
    """

    def __init__(self, page_idx, db):
        """Fetch and cache the raw bytes for page *page_idx* of *db*."""
        self._page_idx = page_idx
        self._db = db
        self._bytes = db.page_bytes(self.idx)

    @property
    def idx(self):
        """1-based index of this page within the database file."""
        return self._page_idx

    @property
    def usable_size(self):
        """Usable bytes per page: the page size minus the reserved region."""
        file_header = self._db.header
        return file_header.page_size - file_header.reserved_length

    def __bytes__(self):
        return self._bytes

    @property
    def parent(self):
        """Parent page per the pointer map, or None when the page has no
        ptrmap entry or its recorded parent pointer is zero."""
        try:
            parent_idx = self._db.ptrmap[self.idx].page_ptr
        except KeyError:
            return None

        return self._db.pages[parent_idx] if parent_idx != 0 else None

    def __repr__(self):
        return "<SQLite Page {0}>".format(self.idx)
class FreelistTrunkPage(Page):
    """A freelist trunk page, referencing a number of freelist leaf pages."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, leaves):
        """Remember the indices of the leaf pages hanging off this trunk."""
        super().__init__(page_idx, db)
        self._leaves = leaves

    def __repr__(self):
        leaf_count = len(self._leaves)
        return "<SQLite Freelist Trunk Page {0}: {1} leaves>".format(
            self.idx, leaf_count
        )
class FreelistLeafPage(Page):
    """A freelist leaf page, linked back to the trunk that references it."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, trunk_idx):
        """Resolve and keep the trunk page this leaf belongs to."""
        super().__init__(page_idx, db)
        self._trunk = self._db.pages[trunk_idx]

    def __repr__(self):
        return "<SQLite Freelist Leaf Page {0}. Trunk: {1}>".format(
            self.idx, self._trunk.idx
        )
class PtrmapPage(Page):
    """A pointer-map page, holding back-pointer records for other pages."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, ptr_array):
        """Keep the decoded pointer array for this ptrmap page."""
        super().__init__(page_idx, db)
        self._pointers = ptr_array

    @property
    def pointers(self):
        """The pointer records decoded from this page."""
        return self._pointers

    def __repr__(self):
        pointer_count = len(self.pointers)
        return "<SQLite Ptrmap Page {0}. {1} pointers>".format(
            self.idx, pointer_count
        )
class OverflowPage(Page):
    """A page holding the overflow portion of an oversized B-Tree payload."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db):
        super().__init__(page_idx, db)
        self._parse()

    def _parse(self):
        """Parse the overflow page body (not implemented yet)."""
        # TODO We should have parsing here for the next page index in the
        # overflow chain
        pass

    def __repr__(self):
        return "<SQLite Overflow Page {0}. Continuation of {1}>".format(
            self.idx, self.parent.idx
        )
class BTreePage(Page):
    """A B-Tree page of an SQLite database, with cell and freeblock parsing.

    Parsing is two-stage: the constructor decodes the B-Tree page header
    and the cell pointer array; parse_cells() then decodes the cells
    themselves, and recover_freeblock_records() attempts to recover
    deleted records from the page's freeblocks.
    """

    # Maps the page-type byte of the B-Tree header to a description
    btree_page_types = {
        0x02:   "Index Interior",
        0x05:   "Table Interior",
        0x0A:   "Index Leaf",
        0x0D:   "Table Leaf",
    }

    def __init__(self, page_idx, db, heuristics):
        """Decode the B-Tree page header and the cell pointer array.

        Raises ValueError if the page-type byte is not one of the four
        known B-Tree page types.
        """
        # XXX We don't know a page's type until we've had a look at the header.
        # Or do we?
        super().__init__(page_idx, db)
        self._heuristics = heuristics
        # Leaf pages have an 8-byte header; interior pages use 12 bytes
        self._header_size = 8
        page_header_bytes = self._get_btree_page_header()
        self._btree_header = SQLite_btree_page_header(
            # Set the right-most page index to None in the 1st pass
            *struct.unpack(r'>BHHHB', page_header_bytes), None
        )
        self._cell_ptr_array = []
        self._freeblocks = IndexDict()
        self._cells = IndexDict()
        self._recovered_records = set()
        # Payloads larger than usable_size - 35 spill into overflow pages
        self._overflow_threshold = self.usable_size - 35

        if self._btree_header.page_type not in BTreePage.btree_page_types:
            # pdb.set_trace()
            raise ValueError

        # We have a twelve-byte header, need to read it again
        if self._btree_header.page_type in (0x02, 0x05):
            self._header_size = 12
            page_header_bytes = self._get_btree_page_header()
            self._btree_header = SQLite_btree_page_header(*struct.unpack(
                r'>BHHHBI', page_header_bytes
            ))

        # Page 1 (and page 2, but that's the 1st ptrmap page) does not have a
        # ptrmap entry.
        # The first ptrmap page will contain back pointer information for pages
        # 3 through J+2, inclusive.
        if self._db.ptrmap:
            if self.idx >= 3 and self.idx not in self._db.ptrmap:
                _LOGGER.warning(
                    "BTree page %d doesn't have ptrmap entry!", self.idx
                )

        if self._btree_header.num_cells > 0:
            cell_ptr_bytes = self._get_btree_ptr_array(
                self._btree_header.num_cells
            )
            self._cell_ptr_array = struct.unpack(
                r'>{count}H'.format(count=self._btree_header.num_cells),
                cell_ptr_bytes
            )
            smallest_cell_offset = min(self._cell_ptr_array)
            if self._btree_header.cell_content_offset != smallest_cell_offset:
                _LOGGER.warning(
                    (
                        "Inconsistent cell ptr array in page %d! Cell content "
                        "starts at offset %d, but min cell pointer is %d"
                    ),
                    self.idx,
                    self._btree_header.cell_content_offset,
                    smallest_cell_offset
                )

    @property
    def btree_header(self):
        """The decoded B-Tree page header tuple."""
        return self._btree_header

    @property
    def page_type(self):
        """Human-readable page type; logs and re-raises KeyError when the
        page-type byte is unknown."""
        try:
            return self.btree_page_types[self._btree_header.page_type]
        except KeyError:
            # pdb.set_trace()
            _LOGGER.warning(
                "Unknown B-Tree page type: %d", self._btree_header.page_type
            )
            raise

    @property
    def freeblocks(self):
        """Mapping of freeblock offset -> freeblock content bytes, populated
        by parse_freeblocks()."""
        return self._freeblocks

    @property
    def cells(self):
        """Mapping of cell index -> parsed cell tuple, populated by
        parse_cells()."""
        return self._cells

    def __repr__(self):
        # TODO Include table in repr, where available
        return "<SQLite B-Tree Page {0} ({1}) {2} cells>".format(
            self.idx, self.page_type, len(self._cell_ptr_array)
        )

    @property
    def table(self):
        """The table this page belongs to, as resolved by the database."""
        return self._db.get_page_table(self.idx)

    def _get_btree_page_header(self):
        """Return the raw B-Tree header bytes for this page."""
        header_offset = 0
        # Page 1 starts with the 100-byte database file header
        if self.idx == 1:
            header_offset += 100
        return bytes(self)[header_offset:self._header_size + header_offset]

    def _get_btree_ptr_array(self, num_cells):
        """Return the raw cell pointer array (2 bytes per cell)."""
        array_offset = self._header_size
        if self.idx == 1:
            array_offset += 100
        return bytes(self)[array_offset:2 * num_cells + array_offset]

    def parse_cells(self):
        """Parse this page's cells (table pages only), then its freeblocks."""
        if self.btree_header.page_type == 0x05:
            self.parse_table_interior_cells()
        elif self.btree_header.page_type == 0x0D:
            self.parse_table_leaf_cells()
        self.parse_freeblocks()

    def parse_table_interior_cells(self):
        """Decode (left child pointer, rowid) pairs from an interior page."""
        if self.btree_header.page_type != 0x05:
            assert False

        _LOGGER.debug("Parsing cells in table interior cell %d", self.idx)
        for cell_idx, offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, offset)
            left_ptr_bytes = bytes(self)[offset:offset + 4]
            left_ptr, = struct.unpack(r'>I', left_ptr_bytes)

            offset += 4
            # A varint is at most 9 bytes long
            integer_key = Varint(bytes(self)[offset:offset+9])
            self._cells[cell_idx] = (left_ptr, int(integer_key))

    def parse_table_leaf_cells(self):
        """Decode (rowid, Record) pairs from a leaf page, following the
        overflow chain when the payload spills into overflow pages."""
        if self.btree_header.page_type != 0x0d:
            assert False

        _LOGGER.debug("Parsing cells in table leaf cell %d", self.idx)
        for cell_idx, cell_offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, cell_offset)

            # This is the total size of the payload, which may include overflow
            offset = cell_offset
            payload_length_varint = Varint(bytes(self)[offset:offset+9])
            total_payload_size = int(payload_length_varint)

            overflow = False
            # Let X be U-35. If the payload size P is less than or equal to X
            # then the entire payload is stored on the b-tree leaf page. Let M
            # be ((U-12)*32/255)-23 and let K be M+((P-M)%(U-4)). If P is
            # greater than X then the number of bytes stored on the table
            # b-tree leaf page is K if K is less or equal to X or M otherwise.
            # The number of bytes stored on the leaf page is never less than M.
            cell_payload_size = 0
            if total_payload_size > self._overflow_threshold:
                m = int(((self.usable_size - 12) * 32/255)-23)
                k = m + ((total_payload_size - m) % (self.usable_size - 4))
                if k <= self._overflow_threshold:
                    cell_payload_size = k
                else:
                    cell_payload_size = m
                overflow = True
            else:
                cell_payload_size = total_payload_size

            offset += len(payload_length_varint)

            integer_key = Varint(bytes(self)[offset:offset+9])
            offset += len(integer_key)

            overflow_bytes = bytes()
            if overflow:
                # The 4 bytes following the in-page payload hold the index of
                # the first overflow page in the chain
                first_oflow_page_bytes = bytes(self)[
                    offset + cell_payload_size:offset + cell_payload_size + 4
                ]
                if not first_oflow_page_bytes:
                    continue

                first_oflow_idx, = struct.unpack(
                    r'>I', first_oflow_page_bytes
                )
                next_oflow_idx = first_oflow_idx
                while next_oflow_idx != 0:
                    oflow_page_bytes = self._db.page_bytes(next_oflow_idx)

                    # Each overflow page starts with a 4-byte pointer to the
                    # next page in the chain; 0 terminates the chain
                    len_overflow = min(
                        len(oflow_page_bytes) - 4,
                        (
                            total_payload_size - cell_payload_size -
                            len(overflow_bytes)
                        )
                    )
                    overflow_bytes += oflow_page_bytes[4:4 + len_overflow]

                    first_four_bytes = oflow_page_bytes[:4]
                    next_oflow_idx, = struct.unpack(
                        r'>I', first_four_bytes
                    )

            try:
                cell_data = bytes(self)[offset:offset + cell_payload_size]
                if overflow_bytes:
                    cell_data += overflow_bytes

                # All payload bytes should be accounted for
                assert len(cell_data) == total_payload_size

                record_obj = Record(cell_data)
                _LOGGER.debug("Created record: %r", record_obj)

            except TypeError as ex:
                _LOGGER.warning(
                    "Caught %r while instantiating record %d",
                    ex, int(integer_key)
                )
                # pdb.set_trace()
                raise

            self._cells[cell_idx] = (int(integer_key), record_obj)

    def parse_freeblocks(self):
        """Walk the freeblock chain, storing each freeblock's content bytes
        keyed by the freeblock's offset within the page."""
        # The first 2 bytes of a freeblock are a big-endian integer which is
        # the offset in the b-tree page of the next freeblock in the chain, or
        # zero if the freeblock is the last on the chain. The third and fourth
        # bytes of each freeblock form a big-endian integer which is the size
        # of the freeblock in bytes, including the 4-byte header. Freeblocks
        # are always connected in order of increasing offset. The second field
        # of the b-tree page header is the offset of the first freeblock, or
        # zero if there are no freeblocks on the page. In a well-formed b-tree
        # page, there will always be at least one cell before the first
        # freeblock.
        #
        # TODO But what about deleted records that exceeded the overflow
        # threshold in the past?
        block_offset = self.btree_header.first_freeblock_offset
        while block_offset != 0:
            freeblock_header = bytes(self)[block_offset:block_offset + 4]
            # Freeblock_size includes the 4-byte header
            next_freeblock_offset, freeblock_size = struct.unpack(
                r'>HH',
                freeblock_header
            )
            # Since freeblock_size counts the 4-byte header, the content runs
            # from block_offset + 4 up to block_offset + freeblock_size.
            # (The previous end bound of block_offset + freeblock_size - 4
            # subtracted the header twice and dropped the last 4 bytes of
            # every freeblock.)
            freeblock_bytes = bytes(self)[
                block_offset + 4:block_offset + freeblock_size
            ]
            self._freeblocks[block_offset] = freeblock_bytes
            block_offset = next_freeblock_offset

    def print_cells(self):
        """Log every parsed cell along with its record's fields."""
        for cell_idx in self.cells.keys():
            rowid, record = self.cells[cell_idx]
            _LOGGER.info(
                "Cell %d, rowid: %d, record: %r",
                cell_idx, rowid, record
            )
            record.print_fields(table=self.table)

    def recover_freeblock_records(self, grouping):
        """Attempt to recover deleted records from this page's freeblocks,
        using the table's heuristic to locate candidate record headers."""
        # If we're lucky (i.e. if no overwriting has taken place), we should be
        # able to find whole record headers in freeblocks.
        # We need to start from the end of the freeblock and work our way back
        # to the start. That means we don't know where a cell header will
        # start, but I suppose we can take a guess

        if not self.table:
            return

        try:
            table_heuristic = self._heuristics.get_heuristic(
                self.table, grouping
            )
        except ValueError as ex:
            _LOGGER.error(str(ex))
            return

        _LOGGER.info(
            "Using heuristic %r on table \"%s\"",
            table_heuristic, self.table,
        )

        _LOGGER.info("Attempting to recover records from freeblocks")
        for freeblock_idx, freeblock_offset in enumerate(self._freeblocks):
            freeblock_bytes = self._freeblocks[freeblock_offset]
            if 0 == len(freeblock_bytes):
                continue
            _LOGGER.debug(
                "Freeblock %d/%d in page, offset %d, %d bytes",
                1 + freeblock_idx,
                len(self._freeblocks),
                freeblock_offset,
                len(freeblock_bytes)
            )

            recovered_bytes = 0
            recovered_in_freeblock = 0

            # TODO Maybe we need to guess the record header lengths rather than
            # try and read them from the freeblocks
            for header_start in table_heuristic(freeblock_bytes):
                _LOGGER.debug(
                    (
                        "Trying potential record header start at "
                        "freeblock offset %d/%d"
                    ),
                    header_start, len(freeblock_bytes)
                )
                _LOGGER.debug("%r", freeblock_bytes)
                try:
                    # We don't know how to handle overflow in deleted records,
                    # so we'll have to truncate the bytes object used to
                    # instantiate the Record object
                    record_bytes = freeblock_bytes[
                        header_start:header_start+self._overflow_threshold
                    ]
                    record_obj = Record(record_bytes)
                except MalformedRecord:
                    # This isn't a well-formed record, let's move to the next
                    # candidate
                    continue

                field_lengths = sum(
                    len(field_obj) for field_obj in record_obj.fields.values()
                )
                record_obj.truncate(field_lengths + len(record_obj.header))
                self._recovered_records.add(record_obj)

                recovered_bytes += len(bytes(record_obj))
                recovered_in_freeblock += 1

            _LOGGER.info(
                (
                    "Recovered %d record(s): %d bytes out of %d "
                    "freeblock bytes @ offset %d"
                ),
                recovered_in_freeblock,
                recovered_bytes,
                len(freeblock_bytes),
                freeblock_offset,
            )

    @property
    def recovered_records(self):
        """Set of Record objects recovered from freeblocks."""
        return self._recovered_records

    def print_recovered_records(self):
        """Log every recovered record and its fields."""
        if not self._recovered_records:
            return

        for record_obj in self._recovered_records:
            _LOGGER.info("Recovered record: %r", record_obj)
            _LOGGER.info("Recovered record header: %s", record_obj.header)
            record_obj.print_fields(table=self.table)
477