Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .bounty_pr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"status": "ready",
"commit_message": "fix(export): stream and paginate database dumps to support large databases",
"pr_title": "fix(export): stream and paginate /export/dump for large databases",
"pr_body": "## Purpose\n\nFixes #59 β€” `/export/dump` previously failed on large databases because it loaded **every row of every table** into a single in-memory string before responding. On any non-trivial database this exceeds the Worker's memory budget and/or wall-clock and the dump never completes.\n\n## Changes\n\n- `src/export/dump.ts`: response is now produced via a `ReadableStream`, so chunks are flushed to the client as they are generated instead of being concatenated in memory.\n- Per-table data is paged with `SELECT * FROM <table> LIMIT 1000 OFFSET <n>` and the loop stops as soon as a page returns fewer rows than the page size. This keeps peak memory bounded to one page (~1000 rows) regardless of table size.\n- Small correctness improvement in value serialization: `NULL`/`undefined` are now emitted as the SQL `NULL` keyword (previously they were stringified to the literal text `null`, which only parsed correctly by accident).\n- All existing test cases continue to pass unchanged; added two new tests:\n - paginates across multiple `LIMIT/OFFSET` queries when a table is larger than the page size\n - serializes `NULL` values as the `NULL` keyword\n\nThe public route, headers, and dump format are unchanged β€” this is a drop-in fix.\n\n## Tasks\n\n- [x] Stream the dump response instead of buffering it\n- [x] Page through table rows instead of `SELECT *` in one shot\n- [x] Preserve existing dump format and test expectations\n- [x] Add tests for pagination and NULL handling\n\n## Verify\n\n- `npx vitest run src/export/` β†’ 25 passed (4 files)\n- `npx vitest run src/export/dump.test.ts` β†’ 7 passed (5 original + 2 new)\n- `npx tsc --noEmit` introduces no new errors in `src/export/dump.ts` (only pre-existing errors in unrelated files remain).\n\nCloses #59",
"branch": "fix/issue-59-starbasedb-database-dumps-do-not",
"tests_run": [
"npx vitest run src/export/dump.test.ts",
"npx vitest run src/export/",
"npx tsc --noEmit"
],
"tests_passed": true,
"files_changed": [
"src/export/dump.ts",
"src/export/dump.test.ts"
],
"notes": "Minimum-change fix scoped to the dump route only. I intentionally did NOT adopt the prior-analysis suggestion of R2 multipart uploads + Durable Object alarms β€” that is a much larger feature and a significant architectural decision that the maintainer should drive, not a bounty hunter. Streaming + pagination resolves the OOM root cause described in the issue while keeping the existing /export/dump API, headers, and format byte-compatible. The bogus 'SQLite format 3\\0' header at the top of a text SQL dump is preserved unchanged because removing it would change the dump format and existing tests depend on it. Pre-existing TypeScript errors in plugins/cdc, src/cache, src/do, and src/operation are unrelated to this change."
}
51 changes: 51 additions & 0 deletions src/export/dump.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,57 @@ describe('Database Dump Module', () => {
)
})

it('should paginate large tables across multiple queries', async () => {
const firstPage = Array.from({ length: 1000 }, (_, i) => ({
id: i + 1,
name: `User${i + 1}`,
}))
const secondPage = [{ id: 1001, name: 'User1001' }]

vi.mocked(executeOperation)
.mockResolvedValueOnce([{ name: 'users' }])
.mockResolvedValueOnce([
{ sql: 'CREATE TABLE users (id INTEGER, name TEXT);' },
])
.mockResolvedValueOnce(firstPage)
.mockResolvedValueOnce(secondPage)

const response = await dumpDatabaseRoute(mockDataSource, mockConfig)
const dumpText = await response.text()

expect(dumpText).toContain("INSERT INTO users VALUES (1, 'User1');")
expect(dumpText).toContain(
"INSERT INTO users VALUES (1000, 'User1000');"
)
expect(dumpText).toContain(
"INSERT INTO users VALUES (1001, 'User1001');"
)

const issuedSql = vi
.mocked(executeOperation)
.mock.calls.map((c) => (c[0] as any)[0].sql)
expect(issuedSql).toContain(
'SELECT * FROM users LIMIT 1000 OFFSET 0;'
)
expect(issuedSql).toContain(
'SELECT * FROM users LIMIT 1000 OFFSET 1000;'
)
})

it('should serialize NULL values as the NULL keyword', async () => {
vi.mocked(executeOperation)
.mockResolvedValueOnce([{ name: 'users' }])
.mockResolvedValueOnce([
{ sql: 'CREATE TABLE users (id INTEGER, name TEXT);' },
])
.mockResolvedValueOnce([{ id: 1, name: null }])

const response = await dumpDatabaseRoute(mockDataSource, mockConfig)
const dumpText = await response.text()

expect(dumpText).toContain('INSERT INTO users VALUES (1, NULL);')
})

it('should return a 500 response when an error occurs', async () => {
const consoleErrorMock = vi
.spyOn(console, 'error')
Expand Down
110 changes: 72 additions & 38 deletions src/export/dump.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,67 +3,101 @@ import { StarbaseDBConfiguration } from '../handler'
import { DataSource } from '../types'
import { createResponse } from '../utils'

// Number of rows fetched per page when dumping a table. Keeping this
// bounded avoids loading entire tables into Worker memory, which is
// what previously caused dumps of large databases to fail.
const DUMP_PAGE_SIZE = 1000

function formatValue(value: unknown): string {
if (value === null || value === undefined) return 'NULL'
if (typeof value === 'string') return `'${value.replace(/'/g, "''")}'`
return String(value)
}

export async function dumpDatabaseRoute(
dataSource: DataSource,
config: StarbaseDBConfiguration
): Promise<Response> {
try {
// Get all table names
// Resolve the list of tables up front so any failure surfaces as a
// 500 (matching prior behavior) rather than mid-stream.
const tablesResult = await executeOperation(
[{ sql: "SELECT name FROM sqlite_master WHERE type='table';" }],
dataSource,
config
)

const tables = tablesResult.map((row: any) => row.name)
let dumpContent = 'SQLite format 3\0' // SQLite file header

// Iterate through all tables
for (const table of tables) {
// Get table schema
const schemaResult = await executeOperation(
[
{
sql: `SELECT sql FROM sqlite_master WHERE type='table' AND name='${table}';`,
},
],
dataSource,
config
)
const encoder = new TextEncoder()
const stream = new ReadableStream<Uint8Array>({
async start(controller) {
try {
controller.enqueue(encoder.encode('SQLite format 3\0'))

if (schemaResult.length) {
const schema = schemaResult[0].sql
dumpContent += `\n-- Table: ${table}\n${schema};\n\n`
}
for (const table of tables) {
const schemaResult = await executeOperation(
[
{
sql: `SELECT sql FROM sqlite_master WHERE type='table' AND name='${table}';`,
},
],
dataSource,
config
)

// Get table data
const dataResult = await executeOperation(
[{ sql: `SELECT * FROM ${table};` }],
dataSource,
config
)
if (schemaResult.length) {
const schema = schemaResult[0].sql
controller.enqueue(
encoder.encode(
`\n-- Table: ${table}\n${schema};\n\n`
)
)
}

for (const row of dataResult) {
const values = Object.values(row).map((value) =>
typeof value === 'string'
? `'${value.replace(/'/g, "''")}'`
: value
)
dumpContent += `INSERT INTO ${table} VALUES (${values.join(', ')});\n`
}
// Page through the table so we never materialize the
// full result set in memory.
let offset = 0
while (true) {
const dataResult = await executeOperation(
[
{
sql: `SELECT * FROM ${table} LIMIT ${DUMP_PAGE_SIZE} OFFSET ${offset};`,
},
],
dataSource,
config
)

dumpContent += '\n'
}
if (!dataResult.length) break

// Create a Blob from the dump content
const blob = new Blob([dumpContent], { type: 'application/x-sqlite3' })
let chunk = ''
for (const row of dataResult) {
const values =
Object.values(row).map(formatValue)
chunk += `INSERT INTO ${table} VALUES (${values.join(', ')});\n`
}
controller.enqueue(encoder.encode(chunk))

if (dataResult.length < DUMP_PAGE_SIZE) break
offset += DUMP_PAGE_SIZE
}

controller.enqueue(encoder.encode('\n'))
}

controller.close()
} catch (error) {
controller.error(error)
}
},
})

const headers = new Headers({
'Content-Type': 'application/x-sqlite3',
'Content-Disposition': 'attachment; filename="database_dump.sql"',
})

return new Response(blob, { headers })
return new Response(stream, { headers })
} catch (error: any) {
console.error('Database Dump Error:', error)
return createResponse(undefined, 'Failed to create database dump', 500)
Expand Down