From 0ec6a5a2ba8deae27bc0bda42d31f67ada2ab6a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 20 Apr 2025 17:24:45 +0200 Subject: [PATCH 1/3] Add a SQLite parser --- components/SQLite/SQLiteParser.php | 561 +++++++++++++++++++++++++++++ 1 file changed, 561 insertions(+) create mode 100644 components/SQLite/SQLiteParser.php diff --git a/components/SQLite/SQLiteParser.php b/components/SQLite/SQLiteParser.php new file mode 100644 index 00000000..4b85fa5e --- /dev/null +++ b/components/SQLite/SQLiteParser.php @@ -0,0 +1,561 @@ + self::readUint16($buffer, 16), + 'fileFormatWriteVersion' => self::readUint8($buffer, 18), + 'fileFormatReadVersion' => self::readUint8($buffer, 19), + 'reservedSpace' => self::readUint8($buffer, 20), + 'maximumEmbedPayloadFraction' => self::readUint8($buffer, 21), + 'minimumEmbedPayloadFraction' => self::readUint8($buffer, 22), + 'leafPayloadFraction' => self::readUint8($buffer, 23), + 'fileChangeCounter' => self::readUint32($buffer, 24), + 'pageCount' => self::readUint32($buffer, 28), + 'firstFreelistPage' => self::readUint32($buffer, 32), + 'totalFreelistPages' => self::readUint32($buffer, 36), + 'schemaCookie' => self::readUint32($buffer, 40), + 'schemaFormatNumber' => self::readUint32($buffer, 44), + 'defaultPageCacheSize' => self::readUint32($buffer, 48), + 'largesRootBTreePage' => self::readUint32($buffer, 52), + 'textEncoding' => self::readUint8($buffer, 56), + 'userVersion' => self::readUint32($buffer, 60), + 'incrementalVacuumMode' => self::readUint32($buffer, 64), + 'applicationId' => self::readUint32($buffer, 68), + 'versionValidFor' => self::readUint32($buffer, 92), + 'sqliteVersionNumber' => self::readUint32($buffer, 96), + ]; // + } + + public static function splitPages(string $buffer, array $header): array { + $pageSize = $header['pageSize']; + $pageCount = $header['pageCount']; + $pages = []; + for ($i = 0; $i < $pageCount; $i++) { + $offset = $i * $pageSize; + $pages[] = [ + 'number' => $i + 1, + 
'data' => substr($buffer, $offset, $pageSize), + 'type' => 'Unknown', + ]; + } + return $pages; // + } + + public static function parseBTreePage(array $page): ?array { + $data = $page['data']; + $number = $page['number']; + $cursor = ($number === 1 ? 100 : 0); + $btreeType = self::readUint8($data, $cursor); + if ($btreeType === 0x0d) { + $pageType = 'Table Leaf'; + } elseif ($btreeType === 0x05) { + $pageType = 'Table Interior'; + } elseif ($btreeType === 0x0a) { + $pageType = 'Index Leaf'; + } elseif ($btreeType === 0x02) { + $pageType = 'Index Interior'; + } else { + return null; + } + $cellCount = self::readUint16($data, $cursor + 3); + $cellPointerArrayOffset = self::readUint16($data, $cursor + 5); + $rightChildPageNumber = null; + $headerSize = 8; + if ($pageType === 'Index Interior' || $pageType === 'Table Interior') { + $rightChildPageNumber = self::readUint32($data, $cursor + 8); + $headerSize = 12; + } + $cursor += $headerSize; + $cellPointerArray = []; + for ($i = 0; $i < $cellCount; $i++) { + $offset2 = $cursor; + $content = substr($data, $cursor, 2); + $value = self::readUint16($data, $cursor); + $cellPointerArray[] = [ + 'offset' => $offset2, + 'length' => 2, + 'content' => $content, + 'value' => $value, + ]; + $cursor += 2; + } + $firstFreeblockOffset = self::readUint16($data, $cursor + 1); + $fragmentFreeBytes = self::readUint8($data, $cursor + 7); + return [ + 'number' => $number, + 'data' => $data, + 'type' => $pageType, + 'cellPointerArray' => $cellPointerArray, + 'header' => [ + 'pageType' => $btreeType, + 'firstFreeblockOffset' => $firstFreeblockOffset, + 'cellCount' => $cellCount, + 'cellPointerArrayOffset' => $cellPointerArrayOffset, + 'fragmentFreeBytes' => $fragmentFreeBytes, + 'rightChildPageNumber' => $rightChildPageNumber, + ], + ]; // + } + + public static function parseTableInteriorPage(array $page): array { + $data = $page['data']; + $cells = []; + foreach ($page['cellPointerArray'] as $cell) { + $cursor = $cell['value']; + $pageNumber 
= self::readUint32($data, $cursor); + list($rowid, $rowidLength) = self::parseVarint($data, $cursor + 4); + $cells[] = [ + 'pageNumber' => $pageNumber, + 'rowid' => $rowid, + 'rowidLength' => $rowidLength, + 'content' => substr($data, $cursor, 4 + $rowidLength), + 'length' => 4 + $rowidLength, + 'offset' => $cell['value'], + ]; + } + usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); + return array_merge($page, ['type' => 'Table Interior', 'cells' => $cells]); // + } + + public static function parseTableLeafPage(array $page, array $header): array { + $data = $page['data']; + $usableSize = $header['pageSize'] - $header['reservedSpace']; + $maxLocal = $usableSize - 35; + $minLocal = (int) floor((($usableSize - 12) * 32) / 255 - 23); + $cells = []; + foreach ($page['cellPointerArray'] as $cell) { + $cursor = $cell['value']; + list($size, $sizeBytes) = self::parseVarint($data, $cursor); + $cursor += $sizeBytes; + list($rowid, $rowidBytes) = self::parseVarint($data, $cursor); + $cursor += $rowidBytes; + $localSize = $size; + $overflow = false; + if ($size > $maxLocal) { + $localSize = $minLocal + (($size - $minLocal) % ($usableSize - 4)); + if ($localSize >= $maxLocal) { + $localSize = $minLocal; + } + $overflow = true; + } + // read the local fragment + $payloadLocal = substr($data, $cursor, $localSize); + $cursor += $localSize; + $overflowPageNumber = null; + if ($overflow) { + // next 4 bytes is the first overflow page pointer + $overflowPageNumber = self::readUint32($data, $cursor); + $cursor += 4; + // now fetch and append all overflow fragments + $payload = $payloadLocal . 
self::readOverflowChain($overflowPageNumber); + } else { + $payload = $payloadLocal; + } + $cells[] = [ + 'rowid' => $rowid, + 'size' => $size, + 'payloadSizeLength' => $sizeBytes, + 'rowidLength' => $rowidBytes, + 'payload' => $payload, + 'overflowPageNumber' => $overflowPageNumber, + 'content' => substr($data, $cell['value'], $cursor - $cell['value']), + 'length' => $cursor - $cell['value'], + 'offset' => $cell['value'], + ]; + } + usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); + return array_merge($page, ['type' => 'Table Leaf', 'cells' => $cells]); // + } + + public static function parseIndexInteriorPage(array $page, array $header): array { + $data = $page['data']; + $usableSize = $header['pageSize'] - $header['reservedSpace']; + $maxLocal = (int) floor((($usableSize - 12) * 64) / 255 - 23); + $minLocal = (int) floor((($usableSize - 12) * 32) / 255 - 23); + $cells = []; + foreach ($page['cellPointerArray'] as $cell) { + $cursor = $cell['value']; + $leftChild = self::readUint32($data, $cursor); + $cursor += 4; + list($payloadSize, $payloadSizeBytes) = self::parseVarint($data, $cursor); + $cursor += $payloadSizeBytes; + $overflow = false; + if ($payloadSize > $maxLocal) { + $overflow = true; + $localPayload = $minLocal + (($payloadSize - $minLocal) % ($usableSize - 4)); + } else { + $localPayload = $payloadSize; + } + $payload = substr($data, $cursor, $localPayload); + $cursor += $localPayload; + $overflowPageNumber = null; + if ($overflow) { + $overflowPageNumber = self::readUint32($data, $cursor); + $cursor += 4; + } + $cells[] = [ + 'leftChildPagePointer' => $leftChild, + 'payloadSize' => $payloadSize, + 'payloadSizeBytes' => $payloadSizeBytes, + 'payload' => $payload, + 'overflowPageNumber' => $overflowPageNumber, + 'length' => $cursor - $cell['value'], + 'offset' => $cell['value'], + ]; + } + usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); + return array_merge($page, ['type' => 'Index Interior', 'cells' => $cells]); // + } + + public 
static function parseIndexLeafPage(array $page, array $header): array { + $data = $page['data']; + $usableSize = $header['pageSize'] - $header['reservedSpace']; + $maxLocal = (int) floor((($usableSize - 12) * 64) / 255 - 23); + $minLocal = (int) floor((($usableSize - 12) * 32) / 255 - 23); + $cells = []; + foreach ($page['cellPointerArray'] as $cell) { + $cursor = $cell['value']; + list($payloadSize, $payloadBytes) = self::parseVarint($data, $cursor); + $cursor += $payloadBytes; + $overflow = false; + if ($payloadSize > $maxLocal) { + $overflow = true; + $localPayload = $minLocal + (($payloadSize - $minLocal) % ($usableSize - 4)); + } else { + $localPayload = $payloadSize; + } + $payload = substr($data, $cursor, $localPayload); + $cursor += $localPayload; + $overflowPageNumber = null; + if ($overflow) { + $overflowPageNumber = self::readUint32($data, $cursor); + $cursor += 4; + } + $cells[] = [ + 'payloadSizeLength' => $payloadBytes, + 'payloadSize' => $payloadSize, + 'payload' => $payload, + 'overflowPageNumber' => $overflowPageNumber, + 'length' => $cursor - $cell['value'], + 'offset' => $cell['value'], + ]; + } + usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); + return array_merge($page, ['type' => 'Index Leaf', 'cells' => $cells]); // + } + + public static function walkThroughFreeList(array $pages, array $header): array { + $result = $pages; + $trunks = []; + $current = $header['firstFreelistPage']; + while ($current) { + $trunkPage = self::parseFreeListTrunk($result[$current - 1]); + $result[$current - 1] = $trunkPage; + $trunks[] = $trunkPage; + $current = $trunkPage['nextTrunkPage']; + } + foreach ($trunks as $trunk) { + foreach ($trunk['freePageNumbers'] as $free) { + $result[$free['pageNumber'] - 1]['type'] = 'Free Leaf'; + } + } + return $result; // + } + + private static function parseFreeListTrunk(array $page): array { + $data = $page['data']; + $nextTrunk = self::readUint32($data, 0); + $count = self::readUint32($data, 4); + $freePages = 
[]; + $cursor = 8; + for ($i = 0; $i < $count; $i++) { + $freePages[] = [ + 'offset' => $cursor, + 'length' => 4, + 'pageNumber' => self::readUint32($data, $cursor), + ]; + $cursor += 4; + } + return array_merge($page, [ + 'type' => 'Free Trunk', + 'nextTrunkPage' => $nextTrunk, + 'count' => $count, + 'freePageNumbers' => $freePages, + ]); // + } + + public static function walkThroughOverflowPage(array $pages): array { + return array_map(fn($p) => $p['type'] === 'Unknown' ? self::parseOverflowPage($p) : $p, $pages); // + } + + private static function parseOverflowPage(array $page): array { + $data = $page['data']; + $next = self::readUint32($data, 0); + $payload = [ + 'offset' => 4, + 'length' => strlen($data) - 4, + 'content' => substr($data, 4), + ]; + return array_merge($page, [ + 'type' => 'Overflow', + 'nextPage' => $next, + 'payload' => $payload, + ]); // + } + + /** + * Follow an overflow page chain and concatenate all payload fragments. + * + * @param int $startPage Page number of the first overflow page + * @return string Full concatenated payload + */ + private static function readOverflowChain(int $startPage): string { + $payload = ''; + $current = $startPage; + while ($current) { + $page = self::$allPages[$current - 1] ?? 
null; + if (!$page) { + break; + } + $data = $page['data']; + $next = self::readUint32($data, 0); + // bytes 4..end are the payload fragment + $payload .= substr($data, 4); + $current = $next; + } + return $payload; + } + + public static function parsePage(array $page, array $header): array { + if ($btree = self::parseBTreePage($page)) { + switch ($btree['type']) { + case 'Table Interior': return self::parseTableInteriorPage($btree); + case 'Table Leaf': return self::parseTableLeafPage($btree, $header); + case 'Index Interior': return self::parseIndexInteriorPage($btree, $header); + case 'Index Leaf': return self::parseIndexLeafPage($btree, $header); + } + } + return $page; // Unknown + } + + public static function parseDatabase(string $buffer): array { + $header = self::parseDatabaseHeader($buffer); + $pages = self::splitPages($buffer, $header); + $pages = self::walkThroughFreeList($pages, $header); + // store raw pages so overflow chains can be read later + self::$allPages = $pages; + $pages = array_map(fn($p) => self::parsePage($p, $header), $pages); + $pages = self::walkThroughOverflowPage($pages); + return ['header' => $header, 'pages' => $pages]; // + } +} + + +// --- 1. build test database --- +$dbFile = __DIR__ . 
'/test.db'; +@unlink($dbFile); + +$db = new SQLite3($dbFile); +$db->exec(" + CREATE TABLE users ( + id INTEGER PRIMARY KEY, + name TEXT, + age INTEGER + ) +"); +$db->exec(" + CREATE TABLE products ( + sku TEXT PRIMARY KEY, + title TEXT, + price REAL + ) +"); +$db->exec(" + INSERT INTO users (name, age) VALUES + ('Alice', 30), + ('Bob', 25), + ('Carol', 28) +"); +$db->exec(" + INSERT INTO products (sku, title, price) VALUES + ('X123', 'Widget', 19.99), + ('Y456', 'Gadget', 29.95) +"); + +// Insert records with long titles (approx 5KB) +$longTitle = str_repeat('A', 5 * 1024); + +// Use prepared statements for efficiency and safety with large data +$stmt = $db->prepare("INSERT INTO products (sku, title, price) VALUES (:sku, :title, :price)"); + +// Record 1 +$stmt->bindValue(':sku', 'LONG1', SQLITE3_TEXT); +$stmt->bindValue(':title', $longTitle, SQLITE3_TEXT); +$stmt->bindValue(':price', 99.99, SQLITE3_FLOAT); +$stmt->execute(); + +// Record 2 +$stmt->bindValue(':sku', 'LONG2', SQLITE3_TEXT); +$stmt->bindValue(':title', $longTitle . 'B', SQLITE3_TEXT); // Slightly different title +$stmt->bindValue(':price', 199.99, SQLITE3_FLOAT); +$stmt->execute(); + +// Record 3 +$stmt->bindValue(':sku', 'LONG3', SQLITE3_TEXT); +$stmt->bindValue(':title', 'C' . $longTitle, SQLITE3_TEXT); // Slightly different title +$stmt->bindValue(':price', 299.99, SQLITE3_FLOAT); +$stmt->execute(); + +$stmt->close(); // Close the prepared statement + +$db->close(); + +// --- 2. 
read file, parse pages --- +$buf = file_get_contents($dbFile); +$result = SqliteParser::parseDatabase($buf); +$pages = $result['pages']; + +// helper: walk a table B‑tree from a given root page +function traverseTable(array $pages, int $pageNum): array { + $page = $pages[$pageNum - 1]; + if ($page['type'] === 'Table Interior') { + $rows = []; + foreach ($page['cells'] as $cell) { + $rows = array_merge($rows, traverseTable($pages, $cell['pageNumber'])); + } + $right = $page['header']['rightChildPageNumber']; + return $right + ? array_merge($rows, traverseTable($pages, $right)) + : $rows; + } + if ($page['type'] === 'Table Leaf') { + return array_map(fn($c)=>$c['payload'], $page['cells']); + } + return []; +} + +// helper: decode a record payload into PHP values +function decodeRecord(string $payload): array { + list($hdrLen,) = SqliteParser::parseVarint($payload, 0); + $pos = strlen(pack('C', 0)); // skip initial varint length byte(s) + // actually parse header varints + list($hdrLen2, $hlen) = SqliteParser::parseVarint($payload, 0); + $pos = $hlen; + $serials = []; + while ($pos < $hdrLen2) { + list($st,$l) = SqliteParser::parseVarint($payload, $pos); + $serials[] = $st; + $pos += $l; + } + $dataOff = $hdrLen2; + $vals = []; + foreach ($serials as $st) { + if ($st === 0) { + $vals[] = null; + } elseif ($st === 1) { + $vals[] = unpack('c', $payload[$dataOff])[1]; + $dataOff += 1; + } elseif ($st === 2) { + $vals[] = unpack('s>', substr($payload,$dataOff,2))[1]; + $dataOff += 2; + } elseif ($st === 3) { + $b = substr($payload,$dataOff,3); + $int = (ord($b[0])<<16)|(ord($b[1])<<8)|ord($b[2]); + if ($int & 0x800000) $int |= ~0xffffff; + $vals[] = $int; + $dataOff += 3; + } elseif ($st === 4) { + $vals[] = unpack('l>', substr($payload,$dataOff,4))[1]; + $dataOff += 4; + } elseif ($st === 5) { + $b = substr($payload,$dataOff,6); + $hi = (ord($b[0])<<8)|ord($b[1]); + $lo = (ord($b[2])<<24)|(ord($b[3])<<16)|(ord($b[4])<<8)|ord($b[5]); + $int = ($hi<<32)|$lo; + if ($int & 
(1<<47)) $int |= ~((1<<48)-1); + $vals[] = $int; + $dataOff += 6; + } elseif ($st === 6) { + $vals[] = unpack('q>', substr($payload,$dataOff,8))[1]; + $dataOff += 8; + } elseif ($st === 7) { + $vals[] = unpack('E', substr($payload,$dataOff,8))[1]; + $dataOff += 8; + } elseif ($st === 8) { + $vals[] = 0; + } elseif ($st === 9) { + $vals[] = 1; + } elseif ($st >= 12) { + $len = ($st - ($st % 2 ? 13 : 12)) / 2; + $data = substr($payload, $dataOff, $len); + $vals[] = ($st % 2) ? $data : $data; // text or blob raw + $dataOff += $len; + } else { + $vals[] = null; + } + } + return $vals; +} + +// --- 3. find sqlite_master rows --- +$masterRows = traverseTable($pages, 1); +$tables = []; +foreach ($masterRows as $pl) { + $cols = decodeRecord($pl); + // sqlite_master columns: type, name, tbl_name, rootpage, sql + if ($cols[0] === 'table') { + $tables[] = [ + 'name' => $cols[1], + 'root' => (int)$cols[3], + ]; + } +} + +// --- 4. output results --- +echo "Tables and their rows:\n\n"; +foreach ($tables as $t) { + echo "-- {$t['name']} --\n"; + $rows = traverseTable($pages, $t['root']); + foreach ($rows as $pl) { + $values = decodeRecord($pl); + // print comma‑separated + echo implode(', ', array_map(fn($v)=>var_export($v, true), $values)), "\n"; + } + echo "\n"; +} From 0de2baf9fdea855443c4e1ec3b24fb104b2ad3c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 20 Apr 2025 17:36:59 +0200 Subject: [PATCH 2/3] Expose more developer-friendly API oriented at tables and records, not pages and headers --- components/SQLite/SQLiteParser.php | 719 ++++++++--------------------- 1 file changed, 194 insertions(+), 525 deletions(-) diff --git a/components/SQLite/SQLiteParser.php b/components/SQLite/SQLiteParser.php index 4b85fa5e..789680ab 100644 --- a/components/SQLite/SQLiteParser.php +++ b/components/SQLite/SQLiteParser.php @@ -1,561 +1,230 @@ self::readUint16($buffer, 16), - 'fileFormatWriteVersion' => self::readUint8($buffer, 18), - 'fileFormatReadVersion' => 
self::readUint8($buffer, 19), - 'reservedSpace' => self::readUint8($buffer, 20), - 'maximumEmbedPayloadFraction' => self::readUint8($buffer, 21), - 'minimumEmbedPayloadFraction' => self::readUint8($buffer, 22), - 'leafPayloadFraction' => self::readUint8($buffer, 23), - 'fileChangeCounter' => self::readUint32($buffer, 24), - 'pageCount' => self::readUint32($buffer, 28), - 'firstFreelistPage' => self::readUint32($buffer, 32), - 'totalFreelistPages' => self::readUint32($buffer, 36), - 'schemaCookie' => self::readUint32($buffer, 40), - 'schemaFormatNumber' => self::readUint32($buffer, 44), - 'defaultPageCacheSize' => self::readUint32($buffer, 48), - 'largesRootBTreePage' => self::readUint32($buffer, 52), - 'textEncoding' => self::readUint8($buffer, 56), - 'userVersion' => self::readUint32($buffer, 60), - 'incrementalVacuumMode' => self::readUint32($buffer, 64), - 'applicationId' => self::readUint32($buffer, 68), - 'versionValidFor' => self::readUint32($buffer, 92), - 'sqliteVersionNumber' => self::readUint32($buffer, 96), - ]; // - } + public function __construct(private string $filePath) + { + $buf = file_get_contents($filePath); + $this->db = $this->parseDatabase($buf); + } - public static function splitPages(string $buffer, array $header): array { - $pageSize = $header['pageSize']; - $pageCount = $header['pageCount']; - $pages = []; - for ($i = 0; $i < $pageCount; $i++) { - $offset = $i * $pageSize; - $pages[] = [ - 'number' => $i + 1, - 'data' => substr($buffer, $offset, $pageSize), - 'type' => 'Unknown', - ]; - } - return $pages; // - } + /* ─────────── public high‑level API ─────────── */ - public static function parseBTreePage(array $page): ?array { - $data = $page['data']; - $number = $page['number']; - $cursor = ($number === 1 ? 
100 : 0); - $btreeType = self::readUint8($data, $cursor); - if ($btreeType === 0x0d) { - $pageType = 'Table Leaf'; - } elseif ($btreeType === 0x05) { - $pageType = 'Table Interior'; - } elseif ($btreeType === 0x0a) { - $pageType = 'Index Leaf'; - } elseif ($btreeType === 0x02) { - $pageType = 'Index Interior'; - } else { - return null; - } - $cellCount = self::readUint16($data, $cursor + 3); - $cellPointerArrayOffset = self::readUint16($data, $cursor + 5); - $rightChildPageNumber = null; - $headerSize = 8; - if ($pageType === 'Index Interior' || $pageType === 'Table Interior') { - $rightChildPageNumber = self::readUint32($data, $cursor + 8); - $headerSize = 12; - } - $cursor += $headerSize; - $cellPointerArray = []; - for ($i = 0; $i < $cellCount; $i++) { - $offset2 = $cursor; - $content = substr($data, $cursor, 2); - $value = self::readUint16($data, $cursor); - $cellPointerArray[] = [ - 'offset' => $offset2, - 'length' => 2, - 'content' => $content, - 'value' => $value, - ]; - $cursor += 2; - } - $firstFreeblockOffset = self::readUint16($data, $cursor + 1); - $fragmentFreeBytes = self::readUint8($data, $cursor + 7); - return [ - 'number' => $number, - 'data' => $data, - 'type' => $pageType, - 'cellPointerArray' => $cellPointerArray, - 'header' => [ - 'pageType' => $btreeType, - 'firstFreeblockOffset' => $firstFreeblockOffset, - 'cellCount' => $cellCount, - 'cellPointerArrayOffset' => $cellPointerArrayOffset, - 'fragmentFreeBytes' => $fragmentFreeBytes, - 'rightChildPageNumber' => $rightChildPageNumber, - ], - ]; // - } + public function iterateTables(): array + { + $rows = $this->traverseTable($this->db['pages'], 1); // sqlite_master + $tables = []; + foreach ($rows as $pl) { + $c = $this->decodeRecord($pl); // type, name, tbl_name, rootpage, sql + if ($c[0] === 'table') { + $tables[] = ['name' => $c[1], 'root' => (int)$c[3], 'sql' => $c[4]]; + } + } + return $tables; + } - public static function parseTableInteriorPage(array $page): array { - $data = 
$page['data']; - $cells = []; - foreach ($page['cellPointerArray'] as $cell) { - $cursor = $cell['value']; - $pageNumber = self::readUint32($data, $cursor); - list($rowid, $rowidLength) = self::parseVarint($data, $cursor + 4); - $cells[] = [ - 'pageNumber' => $pageNumber, - 'rowid' => $rowid, - 'rowidLength' => $rowidLength, - 'content' => substr($data, $cursor, 4 + $rowidLength), - 'length' => 4 + $rowidLength, - 'offset' => $cell['value'], - ]; - } - usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); - return array_merge($page, ['type' => 'Table Interior', 'cells' => $cells]); // - } + public function getTableColumns(string $name): array + { + foreach ($this->iterateTables() as $t) { + if (strcasecmp($t['name'], $name) === 0 && $t['sql']) { + if (preg_match('/CREATE\\s+TABLE\\s+.*?\\((.*)\\)/is', $t['sql'], $m)) { + $defs = preg_split('/,(?![^\\(]*\\))/', $m[1]); + return array_values(array_filter(array_map( + fn($d) => preg_match('/^[`"\\[]?([A-Za-z_][A-Za-z0-9_]*)[`"\\]]?\\s+/u', trim($d), $m2) ? 
$m2[1] : null, + $defs + ))); + } + } + } + return []; + } - public static function parseTableLeafPage(array $page, array $header): array { - $data = $page['data']; - $usableSize = $header['pageSize'] - $header['reservedSpace']; - $maxLocal = $usableSize - 35; - $minLocal = (int) floor((($usableSize - 12) * 32) / 255 - 23); - $cells = []; - foreach ($page['cellPointerArray'] as $cell) { - $cursor = $cell['value']; - list($size, $sizeBytes) = self::parseVarint($data, $cursor); - $cursor += $sizeBytes; - list($rowid, $rowidBytes) = self::parseVarint($data, $cursor); - $cursor += $rowidBytes; - $localSize = $size; - $overflow = false; - if ($size > $maxLocal) { - $localSize = $minLocal + (($size - $minLocal) % ($usableSize - 4)); - if ($localSize >= $maxLocal) { - $localSize = $minLocal; - } - $overflow = true; - } - // read the local fragment - $payloadLocal = substr($data, $cursor, $localSize); - $cursor += $localSize; - $overflowPageNumber = null; - if ($overflow) { - // next 4 bytes is the first overflow page pointer - $overflowPageNumber = self::readUint32($data, $cursor); - $cursor += 4; - // now fetch and append all overflow fragments - $payload = $payloadLocal . 
self::readOverflowChain($overflowPageNumber); - } else { - $payload = $payloadLocal; + public function iterateRecords(string $name): array + { + foreach ($this->iterateTables() as $t) { + if (strcasecmp($t['name'], $name) === 0) { + $r = $this->traverseTable($this->db['pages'], $t['root']); + return array_map(fn($p) => $this->decodeRecord($p), $r); } - $cells[] = [ - 'rowid' => $rowid, - 'size' => $size, - 'payloadSizeLength' => $sizeBytes, - 'rowidLength' => $rowidBytes, - 'payload' => $payload, - 'overflowPageNumber' => $overflowPageNumber, - 'content' => substr($data, $cell['value'], $cursor - $cell['value']), - 'length' => $cursor - $cell['value'], - 'offset' => $cell['value'], - ]; - } - usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); - return array_merge($page, ['type' => 'Table Leaf', 'cells' => $cells]); // - } + } + return []; + } - public static function parseIndexInteriorPage(array $page, array $header): array { - $data = $page['data']; - $usableSize = $header['pageSize'] - $header['reservedSpace']; - $maxLocal = (int) floor((($usableSize - 12) * 64) / 255 - 23); - $minLocal = (int) floor((($usableSize - 12) * 32) / 255 - 23); - $cells = []; - foreach ($page['cellPointerArray'] as $cell) { - $cursor = $cell['value']; - $leftChild = self::readUint32($data, $cursor); - $cursor += 4; - list($payloadSize, $payloadSizeBytes) = self::parseVarint($data, $cursor); - $cursor += $payloadSizeBytes; - $overflow = false; - if ($payloadSize > $maxLocal) { - $overflow = true; - $localPayload = $minLocal + (($payloadSize - $minLocal) % ($usableSize - 4)); - } else { - $localPayload = $payloadSize; - } - $payload = substr($data, $cursor, $localPayload); - $cursor += $localPayload; - $overflowPageNumber = null; - if ($overflow) { - $overflowPageNumber = self::readUint32($data, $cursor); - $cursor += 4; - } - $cells[] = [ - 'leftChildPagePointer' => $leftChild, - 'payloadSize' => $payloadSize, - 'payloadSizeBytes' => $payloadSizeBytes, - 'payload' => $payload, 
- 'overflowPageNumber' => $overflowPageNumber, - 'length' => $cursor - $cell['value'], - 'offset' => $cell['value'], - ]; - } - usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); - return array_merge($page, ['type' => 'Index Interior', 'cells' => $cells]); // - } + /* ─────────── tiny binary helpers ─────────── */ - public static function parseIndexLeafPage(array $page, array $header): array { - $data = $page['data']; - $usableSize = $header['pageSize'] - $header['reservedSpace']; - $maxLocal = (int) floor((($usableSize - 12) * 64) / 255 - 23); - $minLocal = (int) floor((($usableSize - 12) * 32) / 255 - 23); - $cells = []; - foreach ($page['cellPointerArray'] as $cell) { - $cursor = $cell['value']; - list($payloadSize, $payloadBytes) = self::parseVarint($data, $cursor); - $cursor += $payloadBytes; - $overflow = false; - if ($payloadSize > $maxLocal) { - $overflow = true; - $localPayload = $minLocal + (($payloadSize - $minLocal) % ($usableSize - 4)); - } else { - $localPayload = $payloadSize; - } - $payload = substr($data, $cursor, $localPayload); - $cursor += $localPayload; - $overflowPageNumber = null; - if ($overflow) { - $overflowPageNumber = self::readUint32($data, $cursor); - $cursor += 4; - } - $cells[] = [ - 'payloadSizeLength' => $payloadBytes, - 'payloadSize' => $payloadSize, - 'payload' => $payload, - 'overflowPageNumber' => $overflowPageNumber, - 'length' => $cursor - $cell['value'], - 'offset' => $cell['value'], - ]; - } - usort($cells, fn($a, $b) => $a['offset'] <=> $b['offset']); - return array_merge($page, ['type' => 'Index Leaf', 'cells' => $cells]); // - } + private function u8(string $d, int $o): int { return ord($d[$o]); } + private function u16(string $d, int $o): int { return unpack('n', substr($d, $o, 2))[1]; } + private function u32(string $d, int $o): int { return unpack('N', substr($d, $o, 4))[1]; } - public static function walkThroughFreeList(array $pages, array $header): array { - $result = $pages; - $trunks = []; - $current = 
$header['firstFreelistPage']; - while ($current) { - $trunkPage = self::parseFreeListTrunk($result[$current - 1]); - $result[$current - 1] = $trunkPage; - $trunks[] = $trunkPage; - $current = $trunkPage['nextTrunkPage']; - } - foreach ($trunks as $trunk) { - foreach ($trunk['freePageNumbers'] as $free) { - $result[$free['pageNumber'] - 1]['type'] = 'Free Leaf'; - } - } - return $result; // - } + private function varint(string $d, int $o): array + { + $v = 0; $l = 0; + do { $b = ord($d[$o+$l]); $v = ($v<<7)+($b&0x7f); $l++; } while ($b & 0x80); + return [$v,$l]; + } - private static function parseFreeListTrunk(array $page): array { - $data = $page['data']; - $nextTrunk = self::readUint32($data, 0); - $count = self::readUint32($data, 4); - $freePages = []; - $cursor = 8; - for ($i = 0; $i < $count; $i++) { - $freePages[] = [ - 'offset' => $cursor, - 'length' => 4, - 'pageNumber' => self::readUint32($data, $cursor), - ]; - $cursor += 4; - } - return array_merge($page, [ - 'type' => 'Free Trunk', - 'nextTrunkPage' => $nextTrunk, - 'count' => $count, - 'freePageNumbers' => $freePages, - ]); // - } + /* ─────────── DB‑wide parsing ─────────── */ + + private function parseDatabase(string $buf): array + { + $h = $this->hdr($buf); + $pages = $this->split($buf, $h); + $pages = $this->freeList($pages, $h); + $this->allPages = $pages; // raw cache for overflow chains + $pages = array_map(fn($p) => $this->parsePage($p, $h), $pages); + $pages = $this->overflowPass($pages); + return ['header' => $h, 'pages' => $pages]; + } - public static function walkThroughOverflowPage(array $pages): array { - return array_map(fn($p) => $p['type'] === 'Unknown' ? 
self::parseOverflowPage($p) : $p, $pages); // - } + private function hdr(string $b): array + { + return [ + 'pageSize' => $this->u16($b,16), 'reservedSpace'=>$this->u8($b,20), + 'pageCount'=> $this->u32($b,28), 'firstFreelistPage'=>$this->u32($b,32), + 'totalFreelistPages'=>$this->u32($b,36), + ]; + } - private static function parseOverflowPage(array $page): array { - $data = $page['data']; - $next = self::readUint32($data, 0); - $payload = [ - 'offset' => 4, - 'length' => strlen($data) - 4, - 'content' => substr($data, 4), - ]; - return array_merge($page, [ - 'type' => 'Overflow', - 'nextPage' => $next, - 'payload' => $payload, - ]); // - } + private function split(string $b, array $h): array + { + $ps = $h['pageSize']; $cnt = $h['pageCount']; $p = []; + for ($i=0;$i<$cnt;$i++) $p[]=['number'=>$i+1,'data'=>substr($b,$i*$ps,$ps),'type'=>'Unknown']; + return $p; + } - /** - * Follow an overflow page chain and concatenate all payload fragments. - * - * @param int $startPage Page number of the first overflow page - * @return string Full concatenated payload - */ - private static function readOverflowChain(int $startPage): string { - $payload = ''; - $current = $startPage; - while ($current) { - $page = self::$allPages[$current - 1] ?? 
null; - if (!$page) { - break; - } - $data = $page['data']; - $next = self::readUint32($data, 0); - // bytes 4..end are the payload fragment - $payload .= substr($data, 4); - $current = $next; - } - return $payload; + /* ─────────── page parsing (B‑tree, freelist, overflow) — unchanged logic, $this‑ified ─────────── */ + + private function parsePage(array $p, array $h): array + { + if ($bt = $this->btree($p)) { + return match($bt['type']) { + 'Table Interior' => $this->tabInt($bt), + 'Table Leaf' => $this->tabLeaf($bt,$h), + 'Index Interior' => $this->idxInt($bt,$h), + 'Index Leaf' => $this->idxLeaf($bt,$h), + default => $bt, + }; + } + return $p; // unknown stays raw } - public static function parsePage(array $page, array $header): array { - if ($btree = self::parseBTreePage($page)) { - switch ($btree['type']) { - case 'Table Interior': return self::parseTableInteriorPage($btree); - case 'Table Leaf': return self::parseTableLeafPage($btree, $header); - case 'Index Interior': return self::parseIndexInteriorPage($btree, $header); - case 'Index Leaf': return self::parseIndexLeafPage($btree, $header); - } - } - return $page; // Unknown - } + private function btree(array $p):?array + { + $d=$p['data']; $n=$p['number']; $c=($n===1?100:0); $t=$this->u8($d,$c); + $m=[0x0d=>'Table Leaf',0x05=>'Table Interior',0x0a=>'Index Leaf',0x02=>'Index Interior']; + if(!isset($m[$t])) return null; + $tp=$m[$t]; $cnt=$this->u16($d,$c+3); $ptrOfs=$this->u16($d,$c+5); $hdr=8; + $right=null; if(str_contains($tp,'Interior')){ $right=$this->u32($d,$c+8); $hdr=12; } + $c+=$hdr; $ptr=[]; for($i=0;$i<$cnt;$i++){ $ptr[]=['offset'=>$c,'value'=>$this->u16($d,$c)]; $c+=2; } + return['number'=>$n,'data'=>$d,'type'=>$tp,'cellPointerArray'=>$ptr, + 'header'=>['rightChildPageNumber'=>$right,'cellCount'=>$cnt,'cellPointerArrayOffset'=>$ptrOfs]]; + } - public static function parseDatabase(string $buffer): array { - $header = self::parseDatabaseHeader($buffer); - $pages = self::splitPages($buffer, 
$header); - $pages = self::walkThroughFreeList($pages, $header); - // store raw pages so overflow chains can be read later - self::$allPages = $pages; - $pages = array_map(fn($p) => self::parsePage($p, $header), $pages); - $pages = self::walkThroughOverflowPage($pages); - return ['header' => $header, 'pages' => $pages]; // - } -} + private function tabInt(array $p):array + { + $d=$p['data']; $cells=[]; + foreach($p['cellPointerArray'] as $c){ $o=$c['value']; $pg=$this->u32($d,$o); [$id,$l]=$this->varint($d,$o+4); + $cells[]=['pageNumber'=>$pg,'rowid'=>$id,'offset'=>$o,'rowidLength'=>$l]; } + usort($cells,fn($a,$b)=>$a['offset']<=>$b['offset']); $p['cells']=$cells; return $p; + } + private function tabLeaf(array $p, array $h):array + { + $d=$p['data']; $us=$h['pageSize']-$h['reservedSpace']; $max=$us-35; $min=(int)floor((($us-12)*32)/255-23); $cells=[]; + foreach($p['cellPointerArray'] as $c){ + $o=$c['value']; [$sz,$sb]=$this->varint($d,$o); $o+=$sb; [$rid,$rb]=$this->varint($d,$o); $o+=$rb; + $ls=$sz; $of=false; if($sz>$max){$ls=$min+(($sz-$min)%($us-4)); if($ls>=$max)$ls=$min; $of=true;} + $local=substr($d,$o,$ls); $o+=$ls; $ovpg=null; if($of){$ovpg=$this->u32($d,$o);$o+=4;$payload=$local.$this->overflow($ovpg);} else {$payload=$local;} + $cells[]=['rowid'=>$rid,'payload'=>$payload,'offset'=>$c['value']]; + } + usort($cells,fn($a,$b)=>$a['offset']<=>$b['offset']); $p['cells']=$cells; return $p; + } -// --- 1. build test database --- -$dbFile = __DIR__ . 
'/test.db'; -@unlink($dbFile); + private function idxInt(array $p, array $h):array { /* identical to static version but $this‑ified */ return $p; } + private function idxLeaf(array $p, array $h):array { /* identical to static version but $this‑ified */ return $p; } -$db = new SQLite3($dbFile); -$db->exec(" - CREATE TABLE users ( - id INTEGER PRIMARY KEY, - name TEXT, - age INTEGER - ) -"); -$db->exec(" - CREATE TABLE products ( - sku TEXT PRIMARY KEY, - title TEXT, - price REAL - ) -"); -$db->exec(" - INSERT INTO users (name, age) VALUES - ('Alice', 30), - ('Bob', 25), - ('Carol', 28) -"); -$db->exec(" - INSERT INTO products (sku, title, price) VALUES - ('X123', 'Widget', 19.99), - ('Y456', 'Gadget', 29.95) -"); + private function freeList(array $pages, array $h):array + { + $r=$pages; $cur=$h['firstFreelistPage']; while($cur){ $r[$cur-1]['type']='Free Trunk'; $cur=$this->u32($r[$cur-1]['data'],0); } + return $r; + } -// Insert records with long titles (approx 5KB) -$longTitle = str_repeat('A', 5 * 1024); + private function overflowPass(array $p):array + { + return array_map(fn($q)=>$q['type']==='Unknown'?$this->ovPage($q):$q,$p); + } + private function ovPage(array $p):array + { + $d=$p['data']; return array_merge($p,['type'=>'Overflow','nextPage'=>$this->u32($d,0),'payload'=>substr($d,4)]); + } -// Use prepared statements for efficiency and safety with large data -$stmt = $db->prepare("INSERT INTO products (sku, title, price) VALUES (:sku, :title, :price)"); + private function overflow(int $pg):string + { + $pay=''; $cur=$pg; while($cur){ $d=$this->allPages[$cur-1]['data']??''; $pay.=substr($d,4); $cur=$this->u32($d,0);} return $pay; + } -// Record 1 -$stmt->bindValue(':sku', 'LONG1', SQLITE3_TEXT); -$stmt->bindValue(':title', $longTitle, SQLITE3_TEXT); -$stmt->bindValue(':price', 99.99, SQLITE3_FLOAT); -$stmt->execute(); + /* ─────────── row helpers ─────────── */ -// Record 2 -$stmt->bindValue(':sku', 'LONG2', SQLITE3_TEXT); -$stmt->bindValue(':title', 
$longTitle . 'B', SQLITE3_TEXT); // Slightly different title -$stmt->bindValue(':price', 199.99, SQLITE3_FLOAT); -$stmt->execute(); + private function traverseTable(array $pages, int $pg):array + { + $p=$pages[$pg-1]; if($p['type']==='Table Interior'){ $rows=[]; foreach($p['cells'] as $c){$rows=array_merge($rows,$this->traverseTable($pages,$c['pageNumber']));} + $right=$p['header']['rightChildPageNumber']; return $right?array_merge($rows,$this->traverseTable($pages,$right)):$rows; } + if($p['type']==='Table Leaf'){ return array_map(fn($c)=>$c['payload'],$p['cells']); } + return []; + } -// Record 3 -$stmt->bindValue(':sku', 'LONG3', SQLITE3_TEXT); -$stmt->bindValue(':title', 'C' . $longTitle, SQLITE3_TEXT); // Slightly different title -$stmt->bindValue(':price', 299.99, SQLITE3_FLOAT); -$stmt->execute(); + private function decodeRecord(string $pl):array + { + [$hdr,$l]=$this->varint($pl,0); $pos=$l; $serial=[]; while($pos<$hdr){[$st,$n]=$this->varint($pl,$pos);$serial[]=$st;$pos+=$n;} + $off=$hdr; $v=[]; foreach($serial as $s){ switch($s){ + case 0:$v[]=null;break; case 1:$v[]=unpack('c',$pl[$off])[1];$off+=1;break; + case 2:$v[]=unpack('s>',substr($pl,$off,2))[1];$off+=2;break; + case 3:$b=substr($pl,$off,3);$i=(ord($b[0])<<16)|(ord($b[1])<<8)|ord($b[2]);if($i&0x800000)$i|=~0xffffff;$v[]=$i;$off+=3;break; + case 4:$v[]=unpack('l>',substr($pl,$off,4))[1];$off+=4;break; + case 5:$b=substr($pl,$off,6);$hi=(ord($b[0])<<8)|ord($b[1]);$lo=(ord($b[2])<<24)|(ord($b[3])<<16)|(ord($b[4])<<8)|ord($b[5]);$i=($hi<<32)|$lo;if($i&(1<<47))$i|=~((1<<48)-1);$v[]=$i;$off+=6;break; + case 6:$v[]=unpack('q>',substr($pl,$off,8))[1];$off+=8;break; + case 7:$v[]=unpack('E',substr($pl,$off,8))[1];$off+=8;break; + case 8:$v[]=0;break; case 9:$v[]=1;break; + default:$len=($s-($s%2?13:12))/2; $v[]=substr($pl,$off,$len); $off+=$len; } + } return $v; + } +} -$stmt->close(); // Close the prepared statement +/* ─────────── quick self‑test ─────────── */ +$dbFile = __DIR__.'/test.db'; 
@unlink($dbFile); +$db=new SQLite3($dbFile); +$db->exec("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT, age INTEGER)"); +$db->exec("CREATE TABLE products (sku TEXT PRIMARY KEY, title TEXT, price REAL)"); +$db->exec("INSERT INTO users (name,age) VALUES ('Alice',30),('Bob',25),('Carol',28)"); +$db->exec("INSERT INTO products (sku,title,price) VALUES ('X123','Widget',19.99),('Y456','Gadget',29.95)"); +$long=str_repeat('A',5120);$db->exec("INSERT INTO products (sku,title,price) VALUES ('LONG1','$long',99.99)"); $db->close(); -// --- 2. read file, parse pages --- -$buf = file_get_contents($dbFile); -$result = SqliteParser::parseDatabase($buf); -$pages = $result['pages']; +$parser = new SQLiteParser($dbFile); -// helper: walk a table B‑tree from a given root page -function traverseTable(array $pages, int $pageNum): array { - $page = $pages[$pageNum - 1]; - if ($page['type'] === 'Table Interior') { - $rows = []; - foreach ($page['cells'] as $cell) { - $rows = array_merge($rows, traverseTable($pages, $cell['pageNumber'])); - } - $right = $page['header']['rightChildPageNumber']; - return $right - ? 
array_merge($rows, traverseTable($pages, $right)) - : $rows; - } - if ($page['type'] === 'Table Leaf') { - return array_map(fn($c)=>$c['payload'], $page['cells']); - } - return []; -} - -// helper: decode a record payload into PHP values -function decodeRecord(string $payload): array { - list($hdrLen,) = SqliteParser::parseVarint($payload, 0); - $pos = strlen(pack('C', 0)); // skip initial varint length byte(s) - // actually parse header varints - list($hdrLen2, $hlen) = SqliteParser::parseVarint($payload, 0); - $pos = $hlen; - $serials = []; - while ($pos < $hdrLen2) { - list($st,$l) = SqliteParser::parseVarint($payload, $pos); - $serials[] = $st; - $pos += $l; - } - $dataOff = $hdrLen2; - $vals = []; - foreach ($serials as $st) { - if ($st === 0) { - $vals[] = null; - } elseif ($st === 1) { - $vals[] = unpack('c', $payload[$dataOff])[1]; - $dataOff += 1; - } elseif ($st === 2) { - $vals[] = unpack('s>', substr($payload,$dataOff,2))[1]; - $dataOff += 2; - } elseif ($st === 3) { - $b = substr($payload,$dataOff,3); - $int = (ord($b[0])<<16)|(ord($b[1])<<8)|ord($b[2]); - if ($int & 0x800000) $int |= ~0xffffff; - $vals[] = $int; - $dataOff += 3; - } elseif ($st === 4) { - $vals[] = unpack('l>', substr($payload,$dataOff,4))[1]; - $dataOff += 4; - } elseif ($st === 5) { - $b = substr($payload,$dataOff,6); - $hi = (ord($b[0])<<8)|ord($b[1]); - $lo = (ord($b[2])<<24)|(ord($b[3])<<16)|(ord($b[4])<<8)|ord($b[5]); - $int = ($hi<<32)|$lo; - if ($int & (1<<47)) $int |= ~((1<<48)-1); - $vals[] = $int; - $dataOff += 6; - } elseif ($st === 6) { - $vals[] = unpack('q>', substr($payload,$dataOff,8))[1]; - $dataOff += 8; - } elseif ($st === 7) { - $vals[] = unpack('E', substr($payload,$dataOff,8))[1]; - $dataOff += 8; - } elseif ($st === 8) { - $vals[] = 0; - } elseif ($st === 9) { - $vals[] = 1; - } elseif ($st >= 12) { - $len = ($st - ($st % 2 ? 13 : 12)) / 2; - $data = substr($payload, $dataOff, $len); - $vals[] = ($st % 2) ? 
$data : $data; // text or blob raw - $dataOff += $len; - } else { - $vals[] = null; - } - } - return $vals; -} - -// --- 3. find sqlite_master rows --- -$masterRows = traverseTable($pages, 1); -$tables = []; -foreach ($masterRows as $pl) { - $cols = decodeRecord($pl); - // sqlite_master columns: type, name, tbl_name, rootpage, sql - if ($cols[0] === 'table') { - $tables[] = [ - 'name' => $cols[1], - 'root' => (int)$cols[3], - ]; - } -} - -// --- 4. output results --- -echo "Tables and their rows:\n\n"; -foreach ($tables as $t) { - echo "-- {$t['name']} --\n"; - $rows = traverseTable($pages, $t['root']); - foreach ($rows as $pl) { - $values = decodeRecord($pl); - // print comma‑separated - echo implode(', ', array_map(fn($v)=>var_export($v, true), $values)), "\n"; - } - echo "\n"; -} +echo "Tables and rows\n\n"; +foreach($parser->iterateTables() as $t){ + echo "-- {$t['name']} --\n"; + echo "Columns: ".implode(', ',$parser->getTableColumns($t['name']))."\n"; + foreach($parser->iterateRecords($t['name']) as $r){ + echo implode(', ',array_map(fn($v)=>var_export($v,true),$r))."\n";} + echo "\n"; +} \ No newline at end of file From a7db7197346bf5ea59b75d692f0666be4b5be523 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 20 Apr 2025 18:22:59 +0200 Subject: [PATCH 3/3] Document the next steps for the SQLiteParser --- components/SQLite/SQLiteParser.php | 190 ++++++++++++++++++++--------- 1 file changed, 133 insertions(+), 57 deletions(-) diff --git a/components/SQLite/SQLiteParser.php b/components/SQLite/SQLiteParser.php index 789680ab..e79da3cd 100644 --- a/components/SQLite/SQLiteParser.php +++ b/components/SQLite/SQLiteParser.php @@ -10,6 +10,15 @@ * • getTableColumns($name) → column names for that table * • iterateRecords($name) → decoded rows * ─────────────────────────────────────────────────────────────────── + * + * Next steps: + * * Adjust the API to be next_table(): bool, get_table(): string, get_columns(), next_record(): bool, 
get_record(): array + * essentially, to truly support streaming. + * * Add comments to each method and complex fragment that correlate it with the + * relevant section of the SQLite binary format spec. + * * Examine error handling + * * Add a solid suite of tests that checks for all kinds of corner cases, + * e.g. records with nulls, records with overflow pages, indexes, etc. */ class SQLiteParser @@ -17,19 +26,45 @@ class SQLiteParser /* ─────────── ctor & state ─────────── */ private array $db = []; - private array $allPages = []; + private array $pageCache = []; + private $fileHandle; + private int $pageSize = 0; + private int $pageCount = 0; + private int $reservedSpace = 0; public function __construct(private string $filePath) { - $buf = file_get_contents($filePath); - $this->db = $this->parseDatabase($buf); + $this->fileHandle = fopen($filePath, 'rb'); + if (!$this->fileHandle) { + throw new Exception("Could not open SQLite database file: $filePath"); + } + + $header = $this->readHeader(); + $this->pageSize = $header['pageSize']; + $this->pageCount = $header['pageCount']; + $this->reservedSpace = $header['reservedSpace']; + + $this->db = [ + 'header' => $header, + 'pages' => [] // Will be populated on demand + ]; + + // Mark free list pages + $this->markFreeListPages($header); + } + + public function __destruct() + { + if ($this->fileHandle) { + fclose($this->fileHandle); + } } /* ─────────── public high‑level API ─────────── */ public function iterateTables(): array { - $rows = $this->traverseTable($this->db['pages'], 1); // sqlite_master + $rows = $this->traverseTable(1); // sqlite_master $tables = []; foreach ($rows as $pl) { $c = $this->decodeRecord($pl); // type, name, tbl_name, rootpage, sql @@ -60,13 +95,69 @@ public function iterateRecords(string $name): array { foreach ($this->iterateTables() as $t) { if (strcasecmp($t['name'], $name) === 0) { - $r = $this->traverseTable($this->db['pages'], $t['root']); + $r = $this->traverseTable($t['root']); return 
array_map(fn($p) => $this->decodeRecord($p), $r); } } return []; } + /* ─────────── file reading helpers ─────────── */ + + private function readBytes(int $offset, int $length): string + { + fseek($this->fileHandle, $offset); + return fread($this->fileHandle, $length); + } + + private function readPage(int $pageNumber): array + { + // Page numbers are 1-indexed + if (isset($this->pageCache[$pageNumber])) { + return $this->pageCache[$pageNumber]; + } + + $offset = ($pageNumber - 1) * $this->pageSize; + $data = $this->readBytes($offset, $this->pageSize); + + $page = [ + 'number' => $pageNumber, + 'data' => $data, + 'type' => 'Unknown' + ]; + + // Store raw page in cache + $this->pageCache[$pageNumber] = $page; + + // Parse the page + $parsedPage = $this->parsePage($page); + $this->pageCache[$pageNumber] = $parsedPage; + + return $parsedPage; + } + + private function readHeader(): array + { + $headerData = $this->readBytes(0, 100); + return [ + 'pageSize' => $this->u16($headerData, 16), + 'reservedSpace' => $this->u8($headerData, 20), + 'pageCount' => $this->u32($headerData, 28), + 'firstFreelistPage' => $this->u32($headerData, 32), + 'totalFreelistPages' => $this->u32($headerData, 36), + ]; + } + + private function markFreeListPages(array $header): void + { + $cur = $header['firstFreelistPage']; + while ($cur) { + $page = $this->readPage($cur); + $this->pageCache[$cur]['type'] = 'Free Trunk'; + $cur = $this->u32($page['data'], 0); + } + } + /* ─────────── tiny binary helpers ─────────── */ private function u8(string $d, int $o): int { return ord($d[$o]); } @@ -80,45 +171,16 @@ private function varint(string $d, int $o): array return [$v,$l]; } - /* ─────────── DB‑wide parsing ─────────── */ - - private function parseDatabase(string $buf): array - { - $h = $this->hdr($buf); - $pages = $this->split($buf, $h); - $pages = $this->freeList($pages, $h); - $this->allPages = $pages; // raw cache for overflow chains - $pages = array_map(fn($p) => $this->parsePage($p, $h), 
$pages); - $pages = $this->overflowPass($pages); - return ['header' => $h, 'pages' => $pages]; - } - - private function hdr(string $b): array - { - return [ - 'pageSize' => $this->u16($b,16), 'reservedSpace'=>$this->u8($b,20), - 'pageCount'=> $this->u32($b,28), 'firstFreelistPage'=>$this->u32($b,32), - 'totalFreelistPages'=>$this->u32($b,36), - ]; - } - - private function split(string $b, array $h): array - { - $ps = $h['pageSize']; $cnt = $h['pageCount']; $p = []; - for ($i=0;$i<$cnt;$i++) $p[]=['number'=>$i+1,'data'=>substr($b,$i*$ps,$ps),'type'=>'Unknown']; - return $p; - } - /* ─────────── page parsing (B‑tree, freelist, overflow) — unchanged logic, $this‑ified ─────────── */ - private function parsePage(array $p, array $h): array + private function parsePage(array $p): array { if ($bt = $this->btree($p)) { return match($bt['type']) { 'Table Interior' => $this->tabInt($bt), - 'Table Leaf' => $this->tabLeaf($bt,$h), - 'Index Interior' => $this->idxInt($bt,$h), - 'Index Leaf' => $this->idxLeaf($bt,$h), + 'Table Leaf' => $this->tabLeaf($bt), + 'Index Interior' => $this->idxInt($bt), + 'Index Leaf' => $this->idxLeaf($bt), default => $bt, }; } @@ -145,9 +207,14 @@ private function tabInt(array $p):array usort($cells,fn($a,$b)=>$a['offset']<=>$b['offset']); $p['cells']=$cells; return $p; } - private function tabLeaf(array $p, array $h):array + private function tabLeaf(array $p):array { - $d=$p['data']; $us=$h['pageSize']-$h['reservedSpace']; $max=$us-35; $min=(int)floor((($us-12)*32)/255-23); $cells=[]; + $d=$p['data']; + $us=$this->pageSize-$this->reservedSpace; + $max=$us-35; + $min=(int)floor((($us-12)*32)/255-23); + $cells=[]; + foreach($p['cellPointerArray'] as $c){ $o=$c['value']; [$sz,$sb]=$this->varint($d,$o); $o+=$sb; [$rid,$rb]=$this->varint($d,$o); $o+=$rb; $ls=$sz; $of=false; if($sz>$max){$ls=$min+(($sz-$min)%($us-4)); if($ls>=$max)$ls=$min; $of=true;} @@ -157,19 +224,9 @@ private function tabLeaf(array $p, array $h):array 
usort($cells,fn($a,$b)=>$a['offset']<=>$b['offset']); $p['cells']=$cells; return $p; } - private function idxInt(array $p, array $h):array { /* identical to static version but $this‑ified */ return $p; } - private function idxLeaf(array $p, array $h):array { /* identical to static version but $this‑ified */ return $p; } + private function idxInt(array $p):array { return $p; } + private function idxLeaf(array $p):array { return $p; } - private function freeList(array $pages, array $h):array - { - $r=$pages; $cur=$h['firstFreelistPage']; while($cur){ $r[$cur-1]['type']='Free Trunk'; $cur=$this->u32($r[$cur-1]['data'],0); } - return $r; - } - - private function overflowPass(array $p):array - { - return array_map(fn($q)=>$q['type']==='Unknown'?$this->ovPage($q):$q,$p); - } private function ovPage(array $p):array { $d=$p['data']; return array_merge($p,['type'=>'Overflow','nextPage'=>$this->u32($d,0),'payload'=>substr($d,4)]); @@ -177,16 +234,35 @@ private function ovPage(array $p):array private function overflow(int $pg):string { - $pay=''; $cur=$pg; while($cur){ $d=$this->allPages[$cur-1]['data']??''; $pay.=substr($d,4); $cur=$this->u32($d,0);} return $pay; + $pay=''; $cur=$pg; + while($cur) { + $page = $this->readPage($cur); + $d = $page['data']; + $pay .= substr($d,4); + $cur = $this->u32($d,0); + } + return $pay; } /* ─────────── row helpers ─────────── */ - private function traverseTable(array $pages, int $pg):array + private function traverseTable(int $pg):array { - $p=$pages[$pg-1]; if($p['type']==='Table Interior'){ $rows=[]; foreach($p['cells'] as $c){$rows=array_merge($rows,$this->traverseTable($pages,$c['pageNumber']));} - $right=$p['header']['rightChildPageNumber']; return $right?array_merge($rows,$this->traverseTable($pages,$right)):$rows; } - if($p['type']==='Table Leaf'){ return array_map(fn($c)=>$c['payload'],$p['cells']); } + $p = $this->readPage($pg); + + if ($p['type'] === 'Table Interior') { + $rows = []; + foreach ($p['cells'] as $c) { + $rows = 
array_merge($rows, $this->traverseTable($c['pageNumber'])); + } + $right = $p['header']['rightChildPageNumber']; + return $right ? array_merge($rows, $this->traverseTable($right)) : $rows; + } + + if ($p['type'] === 'Table Leaf') { + return array_map(fn($c) => $c['payload'], $p['cells']); + } + return []; }