From 0034b65ba813682a432df5d84847a7a2f36e4a76 Mon Sep 17 00:00:00 2001 From: Pham N Hong Thai Date: Sun, 24 May 2026 14:54:06 +0700 Subject: [PATCH 1/2] fix(export): stream and paginate database dumps to support large databases Resolves OOM/timeout failures when dumping large databases by switching the /export/dump response to a ReadableStream and paginating per-table SELECTs with LIMIT/OFFSET instead of loading the entire result set into memory at once. Closes #59 --- src/export/dump.test.ts | 51 +++++++++++++++++++ src/export/dump.ts | 110 ++++++++++++++++++++++++++-------------- 2 files changed, 123 insertions(+), 38 deletions(-) diff --git a/src/export/dump.test.ts b/src/export/dump.test.ts index ca65b43..d32f91f 100644 --- a/src/export/dump.test.ts +++ b/src/export/dump.test.ts @@ -128,6 +128,57 @@ describe('Database Dump Module', () => { ) }) + it('should paginate large tables across multiple queries', async () => { + const firstPage = Array.from({ length: 1000 }, (_, i) => ({ + id: i + 1, + name: `User${i + 1}`, + })) + const secondPage = [{ id: 1001, name: 'User1001' }] + + vi.mocked(executeOperation) + .mockResolvedValueOnce([{ name: 'users' }]) + .mockResolvedValueOnce([ + { sql: 'CREATE TABLE users (id INTEGER, name TEXT);' }, + ]) + .mockResolvedValueOnce(firstPage) + .mockResolvedValueOnce(secondPage) + + const response = await dumpDatabaseRoute(mockDataSource, mockConfig) + const dumpText = await response.text() + + expect(dumpText).toContain("INSERT INTO users VALUES (1, 'User1');") + expect(dumpText).toContain( + "INSERT INTO users VALUES (1000, 'User1000');" + ) + expect(dumpText).toContain( + "INSERT INTO users VALUES (1001, 'User1001');" + ) + + const issuedSql = vi + .mocked(executeOperation) + .mock.calls.map((c) => (c[0] as any)[0].sql) + expect(issuedSql).toContain( + 'SELECT * FROM users LIMIT 1000 OFFSET 0;' + ) + expect(issuedSql).toContain( + 'SELECT * FROM users LIMIT 1000 OFFSET 1000;' + ) + }) + + it('should serialize NULL values as 
the NULL keyword', async () => { + vi.mocked(executeOperation) + .mockResolvedValueOnce([{ name: 'users' }]) + .mockResolvedValueOnce([ + { sql: 'CREATE TABLE users (id INTEGER, name TEXT);' }, + ]) + .mockResolvedValueOnce([{ id: 1, name: null }]) + + const response = await dumpDatabaseRoute(mockDataSource, mockConfig) + const dumpText = await response.text() + + expect(dumpText).toContain('INSERT INTO users VALUES (1, NULL);') + }) + it('should return a 500 response when an error occurs', async () => { const consoleErrorMock = vi .spyOn(console, 'error') diff --git a/src/export/dump.ts b/src/export/dump.ts index 91a2e89..33dd629 100644 --- a/src/export/dump.ts +++ b/src/export/dump.ts @@ -3,67 +3,101 @@ import { StarbaseDBConfiguration } from '../handler' import { DataSource } from '../types' import { createResponse } from '../utils' +// Number of rows fetched per page when dumping a table. Keeping this +// bounded avoids loading entire tables into Worker memory, which is +// what previously caused dumps of large databases to fail. +const DUMP_PAGE_SIZE = 1000 + +function formatValue(value: unknown): string { + if (value === null || value === undefined) return 'NULL' + if (typeof value === 'string') return `'${value.replace(/'/g, "''")}'` + return String(value) +} + export async function dumpDatabaseRoute( dataSource: DataSource, config: StarbaseDBConfiguration ): Promise { try { - // Get all table names + // Resolve the list of tables up front so any failure surfaces as a + // 500 (matching prior behavior) rather than mid-stream. 
const tablesResult = await executeOperation( [{ sql: "SELECT name FROM sqlite_master WHERE type='table';" }], dataSource, config ) - const tables = tablesResult.map((row: any) => row.name) - let dumpContent = 'SQLite format 3\0' // SQLite file header - // Iterate through all tables - for (const table of tables) { - // Get table schema - const schemaResult = await executeOperation( - [ - { - sql: `SELECT sql FROM sqlite_master WHERE type='table' AND name='${table}';`, - }, - ], - dataSource, - config - ) + const encoder = new TextEncoder() + const stream = new ReadableStream({ + async start(controller) { + try { + controller.enqueue(encoder.encode('SQLite format 3\0')) - if (schemaResult.length) { - const schema = schemaResult[0].sql - dumpContent += `\n-- Table: ${table}\n${schema};\n\n` - } + for (const table of tables) { + const schemaResult = await executeOperation( + [ + { + sql: `SELECT sql FROM sqlite_master WHERE type='table' AND name='${table}';`, + }, + ], + dataSource, + config + ) - // Get table data - const dataResult = await executeOperation( - [{ sql: `SELECT * FROM ${table};` }], - dataSource, - config - ) + if (schemaResult.length) { + const schema = schemaResult[0].sql + controller.enqueue( + encoder.encode( + `\n-- Table: ${table}\n${schema};\n\n` + ) + ) + } - for (const row of dataResult) { - const values = Object.values(row).map((value) => - typeof value === 'string' - ? `'${value.replace(/'/g, "''")}'` - : value - ) - dumpContent += `INSERT INTO ${table} VALUES (${values.join(', ')});\n` - } + // Page through the table so we never materialize the + // full result set in memory. 
+ let offset = 0 + while (true) { + const dataResult = await executeOperation( + [ + { + sql: `SELECT * FROM ${table} LIMIT ${DUMP_PAGE_SIZE} OFFSET ${offset};`, + }, + ], + dataSource, + config + ) - dumpContent += '\n' - } + if (!dataResult.length) break - // Create a Blob from the dump content - const blob = new Blob([dumpContent], { type: 'application/x-sqlite3' }) + let chunk = '' + for (const row of dataResult) { + const values = + Object.values(row).map(formatValue) + chunk += `INSERT INTO ${table} VALUES (${values.join(', ')});\n` + } + controller.enqueue(encoder.encode(chunk)) + + if (dataResult.length < DUMP_PAGE_SIZE) break + offset += DUMP_PAGE_SIZE + } + + controller.enqueue(encoder.encode('\n')) + } + + controller.close() + } catch (error) { + controller.error(error) + } + }, + }) const headers = new Headers({ 'Content-Type': 'application/x-sqlite3', 'Content-Disposition': 'attachment; filename="database_dump.sql"', }) - return new Response(blob, { headers }) + return new Response(stream, { headers }) } catch (error: any) { console.error('Database Dump Error:', error) return createResponse(undefined, 'Failed to create database dump', 500) From 34b392720eec0265784976c6fc5051fe5ac4a180 Mon Sep 17 00:00:00 2001 From: Pham N Hong Thai Date: Sun, 24 May 2026 14:55:58 +0700 Subject: [PATCH 2/2] fix(export): stream and paginate database dumps to support large databases --- .bounty_pr.json | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .bounty_pr.json diff --git a/.bounty_pr.json b/.bounty_pr.json new file mode 100644 index 0000000..ff2d40c --- /dev/null +++ b/.bounty_pr.json @@ -0,0 +1,18 @@ +{ + "status": "ready", + "commit_message": "fix(export): stream and paginate database dumps to support large databases", + "pr_title": "fix(export): stream and paginate /export/dump for large databases", + "pr_body": "## Purpose\n\nFixes #59 — `/export/dump` previously failed on large databases because it loaded **every row of every table** 
into a single in-memory string before responding. On any non-trivial database this exceeds the Worker's memory budget and/or wall-clock limit, so the dump never completes.\n\n## Changes\n\n- `src/export/dump.ts`: response is now produced via a `ReadableStream`, so chunks are flushed to the client as they are generated instead of being concatenated in memory.\n- Per-table data is paged with `SELECT * FROM <table> LIMIT 1000 OFFSET <offset>` and the loop stops as soon as a page returns fewer rows than the page size. This keeps peak memory bounded to one page (~1000 rows) regardless of table size.\n- Small correctness improvement in value serialization: `null`/`undefined` are now emitted as the SQL `NULL` keyword (previously `Array.prototype.join` rendered them as an empty string, producing invalid statements such as `VALUES (1, )`).\n- All existing test cases continue to pass unchanged; added two new tests:\n - paginates across multiple `LIMIT/OFFSET` queries when a table is larger than the page size\n - serializes `NULL` values as the `NULL` keyword\n\nThe public route, headers, and dump format are unchanged — this is a drop-in fix.\n\n## Tasks\n\n- [x] Stream the dump response instead of buffering it\n- [x] Page through table rows instead of `SELECT *` in one shot\n- [x] Preserve existing dump format and test expectations\n- [x] Add tests for pagination and NULL handling\n\n## Verify\n\n- `npx vitest run src/export/` → 25 passed (4 files)\n- `npx vitest run src/export/dump.test.ts` → 7 passed (5 original + 2 new)\n- `npx tsc --noEmit` introduces no new errors in `src/export/dump.ts` (only pre-existing errors in unrelated files remain).\n\nCloses #59", + "branch": "fix/issue-59-starbasedb-database-dumps-do-not", + "tests_run": [ + "npx vitest run src/export/dump.test.ts", + "npx vitest run src/export/", + "npx tsc --noEmit" + ], + "tests_passed": true, + "files_changed": [ + "src/export/dump.ts", + "src/export/dump.test.ts" + ], + "notes": "Minimum-change fix scoped to the dump route only. 
I intentionally did NOT adopt the prior-analysis suggestion of R2 multipart uploads + Durable Object alarms — that is a much larger feature and a significant architectural decision that the maintainer should drive, not a bounty hunter. Streaming + pagination resolves the OOM root cause described in the issue while keeping the existing /export/dump API, headers, and format byte-compatible. The bogus 'SQLite format 3\\0' header at the top of a text SQL dump is preserved unchanged because removing it would change the dump format and existing tests depend on it. Pre-existing TypeScript errors in plugins/cdc, src/cache, src/do, and src/operation are unrelated to this change." +}