# MIT License
#
# Copyright (c) 2017 Matt Boyer
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
23
import struct
24
25
from . import _LOGGER
26
from .record import (Record, MalformedRecord)
27
from .tuples import SQLite_btree_page_header
28
from .utils import (Varint, IndexDict)
29
30
class Page(object):
    """Base wrapper for a single raw page of an SQLite database file.

    Holds the page index, a reference to the owning database object and
    the page's raw bytes as fetched from the database.
    """

    def __init__(self, page_idx, db):
        """Fetch and cache the raw bytes for page *page_idx* of *db*."""
        self._page_idx = page_idx
        self._db = db
        self._bytes = db.page_bytes(self.idx)

    @property
    def idx(self):
        """1-based index of this page within the database file."""
        return self._page_idx

    @property
    def usable_size(self):
        """Usable bytes per page: the page size minus the reserved region."""
        file_header = self._db.header
        return file_header.page_size - file_header.reserved_length

    def __bytes__(self):
        return self._bytes

    @property
    def parent(self):
        """Parent page per the pointer map, or None when the page has no
        ptrmap entry or its recorded parent pointer is zero."""
        try:
            parent_idx = self._db.ptrmap[self.idx].page_ptr
        except KeyError:
            return None

        return self._db.pages[parent_idx] if parent_idx != 0 else None

    def __repr__(self):
        return "<SQLite Page {0}>".format(self.idx)
class FreelistTrunkPage(Page):
    """A freelist trunk page, referencing a number of freelist leaf pages."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, leaves):
        """Remember the indices of the leaf pages hanging off this trunk."""
        super().__init__(page_idx, db)
        self._leaves = leaves

    def __repr__(self):
        leaf_count = len(self._leaves)
        return "<SQLite Freelist Trunk Page {0}: {1} leaves>".format(
            self.idx, leaf_count
        )
class FreelistLeafPage(Page):
    """A freelist leaf page, linked back to the trunk that references it."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, trunk_idx):
        """Resolve and keep the trunk page this leaf belongs to."""
        super().__init__(page_idx, db)
        self._trunk = self._db.pages[trunk_idx]

    def __repr__(self):
        return "<SQLite Freelist Leaf Page {0}. Trunk: {1}>".format(
            self.idx, self._trunk.idx
        )
class PtrmapPage(Page):
    """A pointer-map page, holding back-pointer records for other pages."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, ptr_array):
        """Keep the decoded pointer array for this ptrmap page."""
        super().__init__(page_idx, db)
        self._pointers = ptr_array

    @property
    def pointers(self):
        """The pointer records decoded from this page."""
        return self._pointers

    def __repr__(self):
        pointer_count = len(self.pointers)
        return "<SQLite Ptrmap Page {0}. {1} pointers>".format(
            self.idx, pointer_count
        )
class OverflowPage(Page):
    """A page holding the overflow portion of an oversized B-Tree payload."""
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db):
        super().__init__(page_idx, db)
        self._parse()

    def _parse(self):
        """Parse the overflow page body (not implemented yet)."""
        # TODO We should have parsing here for the next page index in the
        # overflow chain
        pass

    def __repr__(self):
        return "<SQLite Overflow Page {0}. Continuation of {1}>".format(
            self.idx, self.parent.idx
        )
class BTreePage(Page):
    """A B-Tree page of an SQLite database, with cell and freeblock parsing.

    Parsing is two-stage: the constructor decodes the B-Tree page header
    and the cell pointer array; parse_cells() then decodes the cells
    themselves, and recover_freeblock_records() attempts to recover
    deleted records from the page's freeblocks.
    """

    # Maps the page-type byte of the B-Tree header to a description
    btree_page_types = {
        0x02:   "Index Interior",
        0x05:   "Table Interior",
        0x0A:   "Index Leaf",
        0x0D:   "Table Leaf",
    }

    def __init__(self, page_idx, db, heuristics):
        """Decode the B-Tree page header and the cell pointer array.

        Raises ValueError if the page-type byte is not one of the four
        known B-Tree page types.
        """
        # XXX We don't know a page's type until we've had a look at the header.
        # Or do we?
        super().__init__(page_idx, db)
        self._heuristics = heuristics
        # Leaf pages have an 8-byte header; interior pages use 12 bytes
        self._header_size = 8
        page_header_bytes = self._get_btree_page_header()
        self._btree_header = SQLite_btree_page_header(
            # Set the right-most page index to None in the 1st pass
            *struct.unpack(r'>BHHHB', page_header_bytes), None
        )
        self._cell_ptr_array = []
        self._freeblocks = IndexDict()
        self._cells = IndexDict()
        self._recovered_records = set()
        # Payloads larger than usable_size - 35 spill into overflow pages
        self._overflow_threshold = self.usable_size - 35

        if self._btree_header.page_type not in BTreePage.btree_page_types:
            # pdb.set_trace()
            raise ValueError

        # We have a twelve-byte header, need to read it again
        if self._btree_header.page_type in (0x02, 0x05):
            self._header_size = 12
            page_header_bytes = self._get_btree_page_header()
            self._btree_header = SQLite_btree_page_header(*struct.unpack(
                r'>BHHHBI', page_header_bytes
            ))

        # Page 1 (and page 2, but that's the 1st ptrmap page) does not have a
        # ptrmap entry.
        # The first ptrmap page will contain back pointer information for pages
        # 3 through J+2, inclusive.
        if self._db.ptrmap:
            if self.idx >= 3 and self.idx not in self._db.ptrmap:
                _LOGGER.warning(
                    "BTree page %d doesn't have ptrmap entry!", self.idx
                )

        if self._btree_header.num_cells > 0:
            cell_ptr_bytes = self._get_btree_ptr_array(
                self._btree_header.num_cells
            )
            self._cell_ptr_array = struct.unpack(
                r'>{count}H'.format(count=self._btree_header.num_cells),
                cell_ptr_bytes
            )
            smallest_cell_offset = min(self._cell_ptr_array)
            if self._btree_header.cell_content_offset != smallest_cell_offset:
                _LOGGER.warning(
                    (
                        "Inconsistent cell ptr array in page %d! Cell content "
                        "starts at offset %d, but min cell pointer is %d"
                    ),
                    self.idx,
                    self._btree_header.cell_content_offset,
                    smallest_cell_offset
                )

    @property
    def btree_header(self):
        """The decoded B-Tree page header tuple."""
        return self._btree_header

    @property
    def page_type(self):
        """Human-readable page type; logs and re-raises KeyError when the
        page-type byte is unknown."""
        try:
            return self.btree_page_types[self._btree_header.page_type]
        except KeyError:
            # pdb.set_trace()
            _LOGGER.warning(
                "Unknown B-Tree page type: %d", self._btree_header.page_type
            )
            raise

    @property
    def freeblocks(self):
        """Mapping of freeblock offset -> freeblock content bytes, populated
        by parse_freeblocks()."""
        return self._freeblocks

    @property
    def cells(self):
        """Mapping of cell index -> parsed cell tuple, populated by
        parse_cells()."""
        return self._cells

    def __repr__(self):
        # TODO Include table in repr, where available
        return "<SQLite B-Tree Page {0} ({1}) {2} cells>".format(
            self.idx, self.page_type, len(self._cell_ptr_array)
        )

    @property
    def table(self):
        """The table this page belongs to, as resolved by the database."""
        return self._db.get_page_table(self.idx)

    def _get_btree_page_header(self):
        """Return the raw B-Tree header bytes for this page."""
        header_offset = 0
        # Page 1 starts with the 100-byte database file header
        if self.idx == 1:
            header_offset += 100
        return bytes(self)[header_offset:self._header_size + header_offset]

    def _get_btree_ptr_array(self, num_cells):
        """Return the raw cell pointer array (2 bytes per cell)."""
        array_offset = self._header_size
        if self.idx == 1:
            array_offset += 100
        return bytes(self)[array_offset:2 * num_cells + array_offset]

    def parse_cells(self):
        """Parse this page's cells (table pages only), then its freeblocks."""
        if self.btree_header.page_type == 0x05:
            self.parse_table_interior_cells()
        elif self.btree_header.page_type == 0x0D:
            self.parse_table_leaf_cells()
        self.parse_freeblocks()

    def parse_table_interior_cells(self):
        """Decode (left child pointer, rowid) pairs from an interior page."""
        if self.btree_header.page_type != 0x05:
            assert False

        _LOGGER.debug("Parsing cells in table interior cell %d", self.idx)
        for cell_idx, offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, offset)
            left_ptr_bytes = bytes(self)[offset:offset + 4]
            left_ptr, = struct.unpack(r'>I', left_ptr_bytes)

            offset += 4
            # A varint is at most 9 bytes long
            integer_key = Varint(bytes(self)[offset:offset+9])
            self._cells[cell_idx] = (left_ptr, int(integer_key))

    def parse_table_leaf_cells(self):
        """Decode (rowid, Record) pairs from a leaf page, following the
        overflow chain when the payload spills into overflow pages."""
        if self.btree_header.page_type != 0x0d:
            assert False

        _LOGGER.debug("Parsing cells in table leaf cell %d", self.idx)
        for cell_idx, cell_offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, cell_offset)

            # This is the total size of the payload, which may include overflow
            offset = cell_offset
            payload_length_varint = Varint(bytes(self)[offset:offset+9])
            total_payload_size = int(payload_length_varint)

            overflow = False
            # Let X be U-35. If the payload size P is less than or equal to X
            # then the entire payload is stored on the b-tree leaf page. Let M
            # be ((U-12)*32/255)-23 and let K be M+((P-M)%(U-4)). If P is
            # greater than X then the number of bytes stored on the table
            # b-tree leaf page is K if K is less or equal to X or M otherwise.
            # The number of bytes stored on the leaf page is never less than M.
            cell_payload_size = 0
            if total_payload_size > self._overflow_threshold:
                m = int(((self.usable_size - 12) * 32/255)-23)
                k = m + ((total_payload_size - m) % (self.usable_size - 4))
                if k <= self._overflow_threshold:
                    cell_payload_size = k
                else:
                    cell_payload_size = m
                overflow = True
            else:
                cell_payload_size = total_payload_size

            offset += len(payload_length_varint)

            integer_key = Varint(bytes(self)[offset:offset+9])
            offset += len(integer_key)

            overflow_bytes = bytes()
            if overflow:
                # The 4 bytes following the in-page payload hold the index of
                # the first overflow page in the chain
                first_oflow_page_bytes = bytes(self)[
                    offset + cell_payload_size:offset + cell_payload_size + 4
                ]
                if not first_oflow_page_bytes:
                    continue

                first_oflow_idx, = struct.unpack(
                    r'>I', first_oflow_page_bytes
                )
                next_oflow_idx = first_oflow_idx
                while next_oflow_idx != 0:
                    oflow_page_bytes = self._db.page_bytes(next_oflow_idx)

                    # Each overflow page starts with a 4-byte pointer to the
                    # next page in the chain; 0 terminates the chain
                    len_overflow = min(
                        len(oflow_page_bytes) - 4,
                        (
                            total_payload_size - cell_payload_size -
                            len(overflow_bytes)
                        )
                    )
                    overflow_bytes += oflow_page_bytes[4:4 + len_overflow]

                    first_four_bytes = oflow_page_bytes[:4]
                    next_oflow_idx, = struct.unpack(
                        r'>I', first_four_bytes
                    )

            try:
                cell_data = bytes(self)[offset:offset + cell_payload_size]
                if overflow_bytes:
                    cell_data += overflow_bytes

                # All payload bytes should be accounted for
                assert len(cell_data) == total_payload_size

                record_obj = Record(cell_data)
                _LOGGER.debug("Created record: %r", record_obj)

            except TypeError as ex:
                _LOGGER.warning(
                    "Caught %r while instantiating record %d",
                    ex, int(integer_key)
                )
                # pdb.set_trace()
                raise

            self._cells[cell_idx] = (int(integer_key), record_obj)

    def parse_freeblocks(self):
        """Walk the freeblock chain, storing each freeblock's content bytes
        keyed by the freeblock's offset within the page."""
        # The first 2 bytes of a freeblock are a big-endian integer which is
        # the offset in the b-tree page of the next freeblock in the chain, or
        # zero if the freeblock is the last on the chain. The third and fourth
        # bytes of each freeblock form a big-endian integer which is the size
        # of the freeblock in bytes, including the 4-byte header. Freeblocks
        # are always connected in order of increasing offset. The second field
        # of the b-tree page header is the offset of the first freeblock, or
        # zero if there are no freeblocks on the page. In a well-formed b-tree
        # page, there will always be at least one cell before the first
        # freeblock.
        #
        # TODO But what about deleted records that exceeded the overflow
        # threshold in the past?
        block_offset = self.btree_header.first_freeblock_offset
        while block_offset != 0:
            freeblock_header = bytes(self)[block_offset:block_offset + 4]
            # Freeblock_size includes the 4-byte header
            next_freeblock_offset, freeblock_size = struct.unpack(
                r'>HH',
                freeblock_header
            )
            # Since freeblock_size counts the 4-byte header, the content runs
            # from block_offset + 4 up to block_offset + freeblock_size.
            # (The previous end bound of block_offset + freeblock_size - 4
            # subtracted the header twice and dropped the last 4 bytes of
            # every freeblock.)
            freeblock_bytes = bytes(self)[
                block_offset + 4:block_offset + freeblock_size
            ]
            self._freeblocks[block_offset] = freeblock_bytes
            block_offset = next_freeblock_offset

    def print_cells(self):
        """Log every parsed cell along with its record's fields."""
        for cell_idx in self.cells.keys():
            rowid, record = self.cells[cell_idx]
            _LOGGER.info(
                "Cell %d, rowid: %d, record: %r",
                cell_idx, rowid, record
            )
            record.print_fields(table=self.table)

    def recover_freeblock_records(self, grouping):
        """Attempt to recover deleted records from this page's freeblocks,
        using the table's heuristic to locate candidate record headers."""
        # If we're lucky (i.e. if no overwriting has taken place), we should be
        # able to find whole record headers in freeblocks.
        # We need to start from the end of the freeblock and work our way back
        # to the start. That means we don't know where a cell header will
        # start, but I suppose we can take a guess

        if not self.table:
            return

        try:
            table_heuristic = self._heuristics.get_heuristic(
                self.table, grouping
            )
        except ValueError as ex:
            _LOGGER.error(str(ex))
            return

        _LOGGER.info(
            "Using heuristic %r on table \"%s\"",
            table_heuristic, self.table,
        )

        _LOGGER.info("Attempting to recover records from freeblocks")
        for freeblock_idx, freeblock_offset in enumerate(self._freeblocks):
            freeblock_bytes = self._freeblocks[freeblock_offset]
            if 0 == len(freeblock_bytes):
                continue
            _LOGGER.debug(
                "Freeblock %d/%d in page, offset %d, %d bytes",
                1 + freeblock_idx,
                len(self._freeblocks),
                freeblock_offset,
                len(freeblock_bytes)
            )

            recovered_bytes = 0
            recovered_in_freeblock = 0

            # TODO Maybe we need to guess the record header lengths rather than
            # try and read them from the freeblocks
            for header_start in table_heuristic(freeblock_bytes):
                _LOGGER.debug(
                    (
                        "Trying potential record header start at "
                        "freeblock offset %d/%d"
                    ),
                    header_start, len(freeblock_bytes)
                )
                _LOGGER.debug("%r", freeblock_bytes)
                try:
                    # We don't know how to handle overflow in deleted records,
                    # so we'll have to truncate the bytes object used to
                    # instantiate the Record object
                    record_bytes = freeblock_bytes[
                        header_start:header_start+self._overflow_threshold
                    ]
                    record_obj = Record(record_bytes)
                except MalformedRecord:
                    # This isn't a well-formed record, let's move to the next
                    # candidate
                    continue

                field_lengths = sum(
                    len(field_obj) for field_obj in record_obj.fields.values()
                )
                record_obj.truncate(field_lengths + len(record_obj.header))
                self._recovered_records.add(record_obj)

                recovered_bytes += len(bytes(record_obj))
                recovered_in_freeblock += 1

            _LOGGER.info(
                (
                    "Recovered %d record(s): %d bytes out of %d "
                    "freeblock bytes @ offset %d"
                ),
                recovered_in_freeblock,
                recovered_bytes,
                len(freeblock_bytes),
                freeblock_offset,
            )

    @property
    def recovered_records(self):
        """Set of Record objects recovered from freeblocks."""
        return self._recovered_records

    def print_recovered_records(self):
        """Log every recovered record and its fields."""
        if not self._recovered_records:
            return

        for record_obj in self._recovered_records:
            _LOGGER.info("Recovered record: %r", record_obj)
            _LOGGER.info("Recovered record header: %s", record_obj.header)
            record_obj.print_fields(table=self.table)
477