# MIT License
#
# Copyright (c) 2017 Matt Boyer
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import struct

from . import _LOGGER
from .record import (Record, MalformedRecord)
from .tuples import SQLite_btree_page_header
from .utils import (Varint, IndexDict)


class Page(object):
    def __init__(self, page_idx, db):
        self._page_idx = page_idx
        self._db = db
        self._bytes = db.page_bytes(self.idx)

    @property
    def idx(self):
        return self._page_idx

    @property
    def usable_size(self):
        # The usable size of a page is the page size minus the reserved
        # space at the end of each page.
        return self._db.header.page_size - self._db.header.reserved_length

    def __bytes__(self):
        return self._bytes

    @property
    def parent(self):
        try:
            parent_idx = self._db.ptrmap[self.idx].page_ptr
        except KeyError:
            return None

        if 0 == parent_idx:
            return None
        else:
            return self._db.pages[parent_idx]

    def __repr__(self):
        return "<SQLite Page {0}>".format(self.idx)


class FreelistTrunkPage(Page):
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, leaves):
        super().__init__(page_idx, db)
        self._leaves = leaves

    def __repr__(self):
        return "<SQLite Freelist Trunk Page {0}: {1} leaves>".format(
            self.idx, len(self._leaves)
        )


class FreelistLeafPage(Page):
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, trunk_idx):
        super().__init__(page_idx, db)
        self._trunk = self._db.pages[trunk_idx]

    def __repr__(self):
        return "<SQLite Freelist Leaf Page {0}. Trunk: {1}>".format(
            self.idx, self._trunk.idx
        )


class PtrmapPage(Page):
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db, ptr_array):
        super().__init__(page_idx, db)
        self._pointers = ptr_array

    @property
    def pointers(self):
        return self._pointers

    def __repr__(self):
        return "<SQLite Ptrmap Page {0}. {1} pointers>".format(
            self.idx, len(self.pointers)
        )


class OverflowPage(Page):
    # XXX Maybe it would make sense to expect a Page instance as constructor
    # argument?
    def __init__(self, page_idx, db):
        super().__init__(page_idx, db)
        self._parse()

    def _parse(self):
        # TODO Parse the index of the next page in the overflow chain, which
        # is stored in the first four bytes of an overflow page.
        pass

    def __repr__(self):
        return "<SQLite Overflow Page {0}. Continuation of {1}>".format(
            self.idx, self.parent.idx
        )


class BTreePage(Page):
    # Legal values of the page type byte in the b-tree page header, per the
    # SQLite file format.
    btree_page_types = {
        0x02: "Index Interior",
        0x05: "Table Interior",
        0x0A: "Index Leaf",
        0x0D: "Table Leaf",
    }

    def __init__(self, page_idx, db, heuristics):
        # XXX We don't know a page's type until we've had a look at the
        # header. Or do we?
        super().__init__(page_idx, db)
        self._heuristics = heuristics
        self._header_size = 8
        page_header_bytes = self._get_btree_page_header()
        self._btree_header = SQLite_btree_page_header(
            # Set the right-most page index to None in the first pass
            *struct.unpack(r'>BHHHB', page_header_bytes), None
        )
        self._cell_ptr_array = []
        self._freeblocks = IndexDict()
        self._cells = IndexDict()
        self._recovered_records = set()
        self._overflow_threshold = self.usable_size - 35

        if self._btree_header.page_type not in BTreePage.btree_page_types:
            raise ValueError(
                "Unexpected B-Tree page type: {0:#x}".format(
                    self._btree_header.page_type
                )
            )

        # Interior pages have a twelve-byte header; we need to read it again,
        # this time including the right-most page index.
        if self._btree_header.page_type in (0x02, 0x05):
            self._header_size = 12
            page_header_bytes = self._get_btree_page_header()
            self._btree_header = SQLite_btree_page_header(*struct.unpack(
                r'>BHHHBI', page_header_bytes
            ))

        # Page 1 (and page 2, but that's the 1st ptrmap page) does not have a
        # ptrmap entry.
        # The first ptrmap page will contain back pointer information for
        # pages 3 through J+2, inclusive.
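        # Illustrative sizing (not from the original source): each ptrmap
        # entry occupies five bytes, so J = usable_size // 5. For a 4096-byte
        # page with no reserved space, J = 819 and the first ptrmap page
        # (page 2) covers pages 3 through 821.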
        if self._db.ptrmap:
            if self.idx >= 3 and self.idx not in self._db.ptrmap:
                _LOGGER.warning(
                    "BTree page %d doesn't have a ptrmap entry!", self.idx
                )

        if self._btree_header.num_cells > 0:
            cell_ptr_bytes = self._get_btree_ptr_array(
                self._btree_header.num_cells
            )
            self._cell_ptr_array = struct.unpack(
                r'>{count}H'.format(count=self._btree_header.num_cells),
                cell_ptr_bytes
            )
            smallest_cell_offset = min(self._cell_ptr_array)
            if self._btree_header.cell_content_offset != smallest_cell_offset:
                _LOGGER.warning(
                    (
                        "Inconsistent cell ptr array in page %d! Cell content "
                        "starts at offset %d, but min cell pointer is %d"
                    ),
                    self.idx,
                    self._btree_header.cell_content_offset,
                    smallest_cell_offset
                )

    @property
    def btree_header(self):
        return self._btree_header

    @property
    def page_type(self):
        try:
            return self.btree_page_types[self._btree_header.page_type]
        except KeyError:
            _LOGGER.warning(
                "Unknown B-Tree page type: %d", self._btree_header.page_type
            )
            raise

    @property
    def freeblocks(self):
        return self._freeblocks

    @property
    def cells(self):
        return self._cells

    def __repr__(self):
        # TODO Include the table in the repr, where available
        return "<SQLite B-Tree Page {0} ({1}) {2} cells>".format(
            self.idx, self.page_type, len(self._cell_ptr_array)
        )

    @property
    def table(self):
        return self._db.get_page_table(self.idx)

    def _get_btree_page_header(self):
        # On page 1, the b-tree page header follows the 100-byte database
        # header.
        header_offset = 0
        if self.idx == 1:
            header_offset += 100
        return bytes(self)[header_offset:self._header_size + header_offset]

    def _get_btree_ptr_array(self, num_cells):
        # The cell pointer array is an array of 2-byte big-endian offsets
        # that immediately follows the b-tree page header.
        array_offset = self._header_size
        if self.idx == 1:
            array_offset += 100
        return bytes(self)[array_offset:2 * num_cells + array_offset]

    def parse_cells(self):
        if self.btree_header.page_type == 0x05:
            self.parse_table_interior_cells()
        elif self.btree_header.page_type == 0x0D:
            self.parse_table_leaf_cells()
        self.parse_freeblocks()

    def parse_table_interior_cells(self):
        # A table interior cell consists of a 4-byte big-endian page number
        # for the left child, followed by a varint-encoded rowid.
        assert self.btree_header.page_type == 0x05

        _LOGGER.debug("Parsing cells in table interior page %d", self.idx)
        for cell_idx, offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, offset)
            left_ptr_bytes = bytes(self)[offset:offset + 4]
            left_ptr, = struct.unpack(r'>I', left_ptr_bytes)

            offset += 4
            integer_key = Varint(bytes(self)[offset:offset + 9])
            self._cells[cell_idx] = (left_ptr, int(integer_key))
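    # Illustrative example (not from the original source): an interior cell
    # of b'\x00\x00\x00\x05\x2a' parses as left child page 5, followed by
    # the single-byte varint 0x2a, i.e. rowid 42.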

    def parse_table_leaf_cells(self):
        assert self.btree_header.page_type == 0x0D

        _LOGGER.debug("Parsing cells in table leaf page %d", self.idx)
        for cell_idx, cell_offset in enumerate(self._cell_ptr_array):
            _LOGGER.debug("Parsing cell %d @ offset %d", cell_idx, cell_offset)

            # This is the total size of the payload, which may include
            # overflow
            offset = cell_offset
            payload_length_varint = Varint(bytes(self)[offset:offset + 9])
            total_payload_size = int(payload_length_varint)

            overflow = False
            # Let X be U-35. If the payload size P is less than or equal to
            # X, then the entire payload is stored on the b-tree leaf page.
            # Let M be ((U-12)*32/255)-23 and let K be M+((P-M)%(U-4)). If P
            # is greater than X, then the number of bytes stored on the table
            # b-tree leaf page is K if K is less than or equal to X, or M
            # otherwise. The number of bytes stored on the leaf page is never
            # less than M.
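            # Worked example (illustrative, not from the original source):
            # with U = 1024, X = 989 and M = 103. For a payload of P = 1200,
            # K = 103 + ((1200 - 103) % 1020) = 180; since 180 <= 989, 180
            # bytes stay on the leaf page and the remaining 1020 bytes spill
            # to overflow pages (each holds up to U-4 = 1020 payload bytes).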
            cell_payload_size = 0
            if total_payload_size > self._overflow_threshold:
                m = int(((self.usable_size - 12) * 32 / 255) - 23)
                k = m + ((total_payload_size - m) % (self.usable_size - 4))
                if k <= self._overflow_threshold:
                    cell_payload_size = k
                else:
                    cell_payload_size = m
                # Either way, the remainder of the payload spills into an
                # overflow chain
                overflow = True
            else:
                cell_payload_size = total_payload_size

            offset += len(payload_length_varint)

            integer_key = Varint(bytes(self)[offset:offset + 9])
            offset += len(integer_key)

            overflow_bytes = bytes()
            if overflow:
                first_oflow_page_bytes = bytes(self)[
                    offset + cell_payload_size:offset + cell_payload_size + 4
                ]
                if not first_oflow_page_bytes:
                    continue

                first_oflow_idx, = struct.unpack(
                    r'>I', first_oflow_page_bytes
                )
                next_oflow_idx = first_oflow_idx
                while next_oflow_idx != 0:
                    oflow_page_bytes = self._db.page_bytes(next_oflow_idx)

                    # Read no more than the payload bytes still missing, and
                    # no more than this overflow page can hold
                    len_overflow = min(
                        len(oflow_page_bytes) - 4,
                        (
                            total_payload_size - cell_payload_size -
                            len(overflow_bytes)
                        )
                    )
                    overflow_bytes += oflow_page_bytes[4:4 + len_overflow]

                    # The first four bytes of an overflow page hold the index
                    # of the next page in the chain, or zero on the last page
                    first_four_bytes = oflow_page_bytes[:4]
                    next_oflow_idx, = struct.unpack(
                        r'>I', first_four_bytes
                    )

            try:
                cell_data = bytes(self)[offset:offset + cell_payload_size]
                if overflow_bytes:
                    cell_data += overflow_bytes

                # All payload bytes should be accounted for
                assert len(cell_data) == total_payload_size

                record_obj = Record(cell_data)
                _LOGGER.debug("Created record: %r", record_obj)

            except TypeError as ex:
                _LOGGER.warning(
                    "Caught %r while instantiating record %d",
                    ex, int(integer_key)
                )
                raise

            self._cells[cell_idx] = (int(integer_key), record_obj)

    def parse_freeblocks(self):
        # The first 2 bytes of a freeblock are a big-endian integer which is
        # the offset in the b-tree page of the next freeblock in the chain,
        # or zero if the freeblock is the last on the chain. The third and
        # fourth bytes of each freeblock form a big-endian integer which is
        # the size of the freeblock in bytes, including the 4-byte header.
        # Freeblocks are always connected in order of increasing offset. The
        # second field of the b-tree page header is the offset of the first
        # freeblock, or zero if there are no freeblocks on the page. In a
        # well-formed b-tree page, there will always be at least one cell
        # before the first freeblock.
        #
        # TODO But what about deleted records that exceeded the overflow
        # threshold in the past?
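        # Illustrative example (not from the original source): a freeblock
        # header of b'\x00\x00\x00\x12' decodes as next-freeblock offset 0
        # (last in the chain) and total size 18 bytes, i.e. 14 bytes of
        # reclaimable data after the 4-byte header.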
        block_offset = self.btree_header.first_freeblock_offset
        while block_offset != 0:
            freeblock_header = bytes(self)[block_offset:block_offset + 4]
            # The freeblock size includes the 4-byte header
            next_freeblock_offset, freeblock_size = struct.unpack(
                r'>HH',
                freeblock_header
            )
            freeblock_bytes = bytes(self)[
                block_offset + 4:block_offset + freeblock_size
            ]
            self._freeblocks[block_offset] = freeblock_bytes
            block_offset = next_freeblock_offset

    def print_cells(self):
        for cell_idx in self.cells.keys():
            rowid, record = self.cells[cell_idx]
            _LOGGER.info(
                "Cell %d, rowid: %d, record: %r",
                cell_idx, rowid, record
            )
            record.print_fields(table=self.table)

    def recover_freeblock_records(self, grouping):
        # If we're lucky (i.e. if no overwriting has taken place), we should
        # be able to find whole record headers in freeblocks.
        # We need to start from the end of the freeblock and work our way
        # back to the start. That means we don't know where a cell header
        # will start, but I suppose we can take a guess.

        if not self.table:
            return

        try:
            table_heuristic = self._heuristics.get_heuristic(
                self.table, grouping
            )
        except ValueError as ex:
            _LOGGER.error(str(ex))
            return

        _LOGGER.info(
            "Using heuristic %r on table \"%s\"",
            table_heuristic, self.table,
        )

        _LOGGER.info("Attempting to recover records from freeblocks")
        for freeblock_idx, freeblock_offset in enumerate(self._freeblocks):
            freeblock_bytes = self._freeblocks[freeblock_offset]
            if 0 == len(freeblock_bytes):
                continue
            _LOGGER.debug(
                "Freeblock %d/%d in page, offset %d, %d bytes",
                1 + freeblock_idx,
                len(self._freeblocks),
                freeblock_offset,
                len(freeblock_bytes)
            )

            recovered_bytes = 0
            recovered_in_freeblock = 0

            # TODO Maybe we need to guess the record header lengths rather
            # than try and read them from the freeblocks
            for header_start in table_heuristic(freeblock_bytes):
                _LOGGER.debug(
                    (
                        "Trying potential record header start at "
                        "freeblock offset %d/%d"
                    ),
                    header_start, len(freeblock_bytes)
                )
                _LOGGER.debug("%r", freeblock_bytes)
                try:
                    # We don't know how to handle overflow in deleted
                    # records, so we'll have to truncate the bytes object
                    # used to instantiate the Record object
                    record_bytes = freeblock_bytes[
                        header_start:header_start + self._overflow_threshold
                    ]
                    record_obj = Record(record_bytes)
                except MalformedRecord:
                    # This isn't a well-formed record, let's move on to the
                    # next candidate
                    continue

                field_lengths = sum(
                    len(field_obj) for field_obj in record_obj.fields.values()
                )
                record_obj.truncate(field_lengths + len(record_obj.header))
                self._recovered_records.add(record_obj)

                recovered_bytes += len(bytes(record_obj))
                recovered_in_freeblock += 1

            _LOGGER.info(
                (
                    "Recovered %d record(s): %d bytes out of %d "
                    "freeblock bytes @ offset %d"
                ),
                recovered_in_freeblock,
                recovered_bytes,
                len(freeblock_bytes),
                freeblock_offset,
            )

    @property
    def recovered_records(self):
        return self._recovered_records

    def print_recovered_records(self):
        if not self._recovered_records:
            return

        for record_obj in self._recovered_records:
            _LOGGER.info("Recovered record: %r", record_obj)
            _LOGGER.info("Recovered record header: %s", record_obj.header)
            record_obj.print_fields(table=self.table)
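
# Minimal usage sketch (illustrative only; `db` and `heuristics` are
# hypothetical objects implementing the interfaces consumed above, i.e.
# page_bytes()/header/ptrmap/pages/get_page_table and get_heuristic):
#
#     page = BTreePage(2, db, heuristics)
#     page.parse_cells()
#     page.print_cells()
#     page.recover_freeblock_records(grouping)
#     page.print_recovered_records()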