From 70742447e1080087798a1576b384bb9373653cd6 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 21:24:48 +0000 Subject: [PATCH 1/3] feat(duckdb): implement copy_to_get_written_statistics for COPY function Implements the copy_to_get_written_statistics callback across the full C/C++/Rust stack, enabling duck-lake (and similar table-format) layers to collect per-file write statistics (row_count, file_size_bytes) after a COPY TO operation completes. Changes: - Add duckdb_vx_written_statistics_t C struct and optional copy_to_get_written_statistics callback (returning bool) to the vtable in copy_function.h - Bridge the callback through c_get_written_statistics in copy_function.cpp, which converts the C struct into a DuckDB vector STRUCT; returns an empty vector when the callback returns false (no stats available) - Add WrittenStatistics type and get_written_statistics trait method to the CopyFunction trait with a default no-op implementation - Add copy_to_get_written_statistics_callback shim in callback.rs - Store WriteSummary in GlobalState after copy_to_finalize and implement get_written_statistics for VortexCopyFunction to expose row_count and file_size_bytes Co-authored-by: Nicholas Gates Signed-off-by: "Claude" --- vortex-duckdb/cpp/copy_function.cpp | 31 +++++++++++++++++++ .../cpp/include/duckdb_vx/copy_function.h | 19 ++++++++++++ vortex-duckdb/src/copy.rs | 18 ++++++++++- .../src/duckdb/copy_function/callback.rs | 26 ++++++++++++++++ vortex-duckdb/src/duckdb/copy_function/mod.rs | 26 ++++++++++++++++ 5 files changed, 119 insertions(+), 1 deletion(-) diff --git a/vortex-duckdb/cpp/copy_function.cpp b/vortex-duckdb/cpp/copy_function.cpp index e05aa942189..73c1114db13 100644 --- a/vortex-duckdb/cpp/copy_function.cpp +++ b/vortex-duckdb/cpp/copy_function.cpp @@ -134,6 +134,34 @@ void copy_to_finalize(ClientContext & /*context*/, FunctionData &bind_data, Glob } } +vector c_get_written_statistics(ClientContext & /*context*/, + FunctionData &bind_data, + GlobalFunctionData &gstate) { + auto &bind = bind_data.Cast(); + auto &global = gstate.Cast(); + if (!bind.vtab.copy_to_get_written_statistics) { + return {}; + } + + duckdb_vx_error error_out = nullptr; + duckdb_vx_written_statistics_t stats = {}; + bool has_stats = bind.vtab.copy_to_get_written_statistics(bind.ffi_data->DataPtr(), + global.ffi_data->DataPtr(), + &stats, + &error_out); + if (error_out) { + throw ExecutorException(IntoErrString(error_out)); + } + if (!has_stats) { + return {}; + } + + child_list_t struct_vals; + struct_vals.emplace_back("row_count", Value::UBIGINT(stats.row_count)); + struct_vals.emplace_back("file_size_bytes", Value::UBIGINT(stats.file_size_bytes)); + return {Value::STRUCT(std::move(struct_vals))}; +} + extern "C" duckdb_vx_copy_func_vtab_t *get_vtab_one() { return ©_vtab_one; } @@ -153,6 +181,9 @@ extern "C" duckdb_state duckdb_vx_copy_func_register_vtab_one(duckdb_database ff copy_function.copy_to_sink = c_copy_to_sink; copy_function.copy_to_finalize = copy_to_finalize; + if (copy_vtab_one.copy_to_get_written_statistics) { + copy_function.copy_to_get_written_statistics = c_get_written_statistics; + } copy_function.extension = copy_vtab_one.extension; // TODO(joe): expose this via c our api diff --git a/vortex-duckdb/cpp/include/duckdb_vx/copy_function.h b/vortex-duckdb/cpp/include/duckdb_vx/copy_function.h index cb9142ed710..fa72203db41 100644 --- a/vortex-duckdb/cpp/include/duckdb_vx/copy_function.h +++ b/vortex-duckdb/cpp/include/duckdb_vx/copy_function.h @@ -33,6 +33,14 @@ typedef struct { // BATCH_COPY_TO_FILE // } copy_function_execution_mode; +/// Statistics about a single written file, reported via copy_to_get_written_statistics. +typedef struct { + /// Total number of rows written to the file. + unsigned long long row_count; + /// Total size of the written file in bytes. + unsigned long long file_size_bytes; +} duckdb_vx_written_statistics_t; + // A transparent DuckDB copy function vtable, which can be used to configure a copy function. typedef struct { // The name of the copy function. @@ -63,6 +71,17 @@ typedef struct { void (*copy_to_finalize)(const void *bind_data, void *global_data, duckdb_vx_error *error_out); + /// Optional callback to report per-file write statistics back to DuckDB. + /// + /// Called after copy_to_finalize. If non-null and the callback returns true, `stats_out` + /// is filled with aggregate statistics about the written file, and DuckDB exposes these + /// to table-format layers (e.g. duck-lake) for use in manifest or catalog entries. + /// Return false to indicate that no statistics are available for this write. + bool (*copy_to_get_written_statistics)(const void *bind_data, + const void *global_data, + duckdb_vx_written_statistics_t *stats_out, + duckdb_vx_error *error_out); + // TODO(joe): expose via c api // copy_function_execution_mode (*execution_mode)(bool preserve_insertion_order, bool // supports_batch_index); diff --git a/vortex-duckdb/src/copy.rs b/vortex-duckdb/src/copy.rs index fa6b25b5ce7..aa3e43e99ba 100644 --- a/vortex-duckdb/src/copy.rs +++ b/vortex-duckdb/src/copy.rs @@ -34,6 +34,7 @@ use crate::duckdb::CopyFunction; use crate::duckdb::DataChunkRef; use crate::duckdb::DuckDbFsWriter; use crate::duckdb::LogicalTypeRef; +use crate::duckdb::WrittenStatistics; #[derive(Debug)] pub struct VortexCopyFunction; @@ -56,6 +57,9 @@ pub struct GlobalState { // into us, and we call `RUNTIME.block_on`. #[expect(dead_code)] worker_pool: CurrentThreadWorkerPool, + /// Summary populated by [`VortexCopyFunction::copy_to_finalize`] for use in + /// [`VortexCopyFunction::get_written_statistics`]. + write_summary: Option, } impl CopyFunction for VortexCopyFunction { @@ -112,7 +116,8 @@ impl CopyFunction for VortexCopyFunction { .lock() .take() .vortex_expect("no file to close"); - task.await?; + let summary = task.await?; + init_global.write_summary = Some(summary); Ok(()) }) } @@ -145,10 +150,21 @@ impl CopyFunction for VortexCopyFunction { worker_pool, write_task: Mutex::new(Some(write_task)), sink: Some(sink), + write_summary: None, }) } fn init_local(_global: &Self::BindData) -> VortexResult { Ok(()) } + + fn get_written_statistics( + _bind_data: &Self::BindData, + global_state: &Self::GlobalState, + ) -> VortexResult> { + Ok(global_state.write_summary.as_ref().map(|s| WrittenStatistics { + row_count: s.row_count(), + file_size_bytes: s.size(), + })) + } } diff --git a/vortex-duckdb/src/duckdb/copy_function/callback.rs b/vortex-duckdb/src/duckdb/copy_function/callback.rs index 0bc303fe1cb..bff1cd0cc78 100644 --- a/vortex-duckdb/src/duckdb/copy_function/callback.rs +++ b/vortex-duckdb/src/duckdb/copy_function/callback.rs @@ -14,6 +14,7 @@ use crate::cpp::duckdb_data_chunk; use crate::cpp::duckdb_logical_type; use crate::cpp::duckdb_vx_copy_func_bind_input; use crate::cpp::duckdb_vx_error; +use crate::cpp::duckdb_vx_written_statistics_t; use crate::duckdb::ClientContext; use crate::duckdb::CopyFunction; use crate::duckdb::Data; @@ -118,3 +119,28 @@ pub(crate) unsafe extern "C-unwind" fn copy_to_finalize_callback( + bind_data: *const c_void, + global_data: *const c_void, + stats_out: *mut duckdb_vx_written_statistics_t, + error_out: *mut duckdb_vx_error, +) -> bool { + let bind_data = + unsafe { bind_data.cast::().as_ref() }.vortex_expect("bind_data null pointer"); + let global_data = unsafe { global_data.cast::().as_ref() } + .vortex_expect("global_data null pointer"); + + try_or(error_out, || { + match T::get_written_statistics(bind_data, global_data)? { + Some(stats) => { + if let Some(out) = unsafe { stats_out.as_mut() } { + out.row_count = stats.row_count; + out.file_size_bytes = stats.file_size_bytes; + } + Ok(true) + } + None => Ok(false), + } + }) +} diff --git a/vortex-duckdb/src/duckdb/copy_function/mod.rs b/vortex-duckdb/src/duckdb/copy_function/mod.rs index 420ca7c6854..697e965eccf 100644 --- a/vortex-duckdb/src/duckdb/copy_function/mod.rs +++ b/vortex-duckdb/src/duckdb/copy_function/mod.rs @@ -16,11 +16,24 @@ use crate::duckdb::DatabaseRef; use crate::duckdb::LogicalTypeRef; use crate::duckdb::copy_function::callback::bind_callback; use crate::duckdb::copy_function::callback::copy_to_finalize_callback; +use crate::duckdb::copy_function::callback::copy_to_get_written_statistics_callback; use crate::duckdb::copy_function::callback::copy_to_sink_callback; use crate::duckdb::copy_function::callback::global_callback; use crate::duckdb::copy_function::callback::local_callback; use crate::duckdb_try; +/// Statistics about a single written Vortex file. +/// +/// Returned by [`CopyFunction::get_written_statistics`] and forwarded to DuckDB via +/// `copy_to_get_written_statistics`. Table-format layers (e.g. duck-lake) consume these +/// values to build manifest or catalog entries. +pub struct WrittenStatistics { + /// Total number of rows written to the file. + pub row_count: u64, + /// Total size of the written file in bytes. + pub file_size_bytes: u64, +} + pub trait CopyFunction: Sized + Debug { type BindData: Send; type GlobalState: Send + Sync; @@ -61,6 +74,17 @@ pub trait CopyFunction: Sized + Debug { /// is thread-local. fn init_local(bind: &Self::BindData) -> VortexResult; + /// Returns per-file write statistics after [`copy_to_finalize`][Self::copy_to_finalize]. + /// + /// Return `Some` to report statistics to DuckDB (e.g. for duck-lake manifest entries). + /// The default implementation returns `None`, which disables the callback. + fn get_written_statistics( + _bind_data: &Self::BindData, + _global_state: &Self::GlobalState, + ) -> VortexResult> { + Ok(None) + } + // TODO(joe): there are many more callbacks that can be configured. } @@ -80,6 +104,8 @@ impl DatabaseRef { vtab.init_local = Some(local_callback::); vtab.copy_to_sink = Some(copy_to_sink_callback::); vtab.copy_to_finalize = Some(copy_to_finalize_callback::); + vtab.copy_to_get_written_statistics = + Some(copy_to_get_written_statistics_callback::); duckdb_try!( unsafe { cpp::duckdb_vx_copy_func_register_vtab_one(self.as_ptr()) }, From d837df1ca04584080c0f75d1b4fa5bace6a9a645 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 6 May 2026 17:51:13 -0400 Subject: [PATCH 2/3] fix(duckdb): match DuckDB's copy_to_get_written_statistics signature The C++ shim returned vector from c_get_written_statistics, but DuckDB's copy_to_get_written_statistics_t typedef is a void-returning callback that fills a CopyFunctionFileStatistics out-parameter. This caused the C++ build to fail in any job that links vortex-duckdb. Also re-run nightly rustfmt over copy.rs and copy_function/mod.rs to match CI's nightly fmt check, and add the lessons (verify FFI signatures against bundled DuckDB headers; always run nightly fmt) to AGENTS.md. Signed-off-by: Nicholas Gates Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Nicholas Gates --- AGENTS.md | 8 +++++++ vortex-duckdb/cpp/copy_function.cpp | 23 +++++++++---------- vortex-duckdb/src/copy.rs | 11 +++++---- vortex-duckdb/src/duckdb/copy_function/mod.rs | 3 +-- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 6ba6006251f..cee456acd8f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -159,6 +159,14 @@ Check new and modified lines against this list before finishing: - Updating expected test output to match buggy behavior without independently verifying the intended semantics. - Silently reducing the scope of an approved plan when implementation is harder than expected. +- Skipping `cargo +nightly fmt --all -- --check` after editing Rust code. CI's `Rust Lint - Format` + step uses the nightly toolchain and rejects diffs even when stable `cargo fmt` looks clean. +- For `vortex-duckdb` C/C++ FFI changes: assuming a DuckDB callback signature from memory or + upstream docs instead of reading the bundled DuckDB headers under + `target/*/build/vortex-duckdb-*/out/duckdb-source-v*/duckdb-*/src/include/duckdb/` for the + exact `typedef` (e.g. `copy_to_get_written_statistics_t`). DuckDB is vendored at a pinned + version, so the bundled headers are authoritative. A wrong shim signature only fails at C++ + compile time inside `build.rs`, so always run `cargo build -p vortex-duckdb` before stopping. ## Summaries diff --git a/vortex-duckdb/cpp/copy_function.cpp b/vortex-duckdb/cpp/copy_function.cpp index 73c1114db13..4c508cbe115 100644 --- a/vortex-duckdb/cpp/copy_function.cpp +++ b/vortex-duckdb/cpp/copy_function.cpp @@ -134,32 +134,31 @@ void copy_to_finalize(ClientContext & /*context*/, FunctionData &bind_data, Glob } } -vector c_get_written_statistics(ClientContext & /*context*/, - FunctionData &bind_data, - GlobalFunctionData &gstate) { +void c_get_written_statistics(ClientContext & /*context*/, + FunctionData &bind_data, + GlobalFunctionData &gstate, + CopyFunctionFileStatistics &statistics) { auto &bind = bind_data.Cast(); auto &global = gstate.Cast(); if (!bind.vtab.copy_to_get_written_statistics) { - return {}; + return; } duckdb_vx_error error_out = nullptr; duckdb_vx_written_statistics_t stats = {}; bool has_stats = bind.vtab.copy_to_get_written_statistics(bind.ffi_data->DataPtr(), - global.ffi_data->DataPtr(), - &stats, - &error_out); + global.ffi_data->DataPtr(), + &stats, + &error_out); if (error_out) { throw ExecutorException(IntoErrString(error_out)); } if (!has_stats) { - return {}; + return; } - child_list_t struct_vals; - struct_vals.emplace_back("row_count", Value::UBIGINT(stats.row_count)); - struct_vals.emplace_back("file_size_bytes", Value::UBIGINT(stats.file_size_bytes)); - return {Value::STRUCT(std::move(struct_vals))}; + statistics.row_count = stats.row_count; + statistics.file_size_bytes = stats.file_size_bytes; } extern "C" duckdb_vx_copy_func_vtab_t *get_vtab_one() { diff --git a/vortex-duckdb/src/copy.rs b/vortex-duckdb/src/copy.rs index aa3e43e99ba..a76f52edfb1 100644 --- a/vortex-duckdb/src/copy.rs +++ b/vortex-duckdb/src/copy.rs @@ -162,9 +162,12 @@ impl CopyFunction for VortexCopyFunction { _bind_data: &Self::BindData, global_state: &Self::GlobalState, ) -> VortexResult> { - Ok(global_state.write_summary.as_ref().map(|s| WrittenStatistics { - row_count: s.row_count(), - file_size_bytes: s.size(), - })) + Ok(global_state + .write_summary + .as_ref() + .map(|s| WrittenStatistics { + row_count: s.row_count(), + file_size_bytes: s.size(), + })) } } diff --git a/vortex-duckdb/src/duckdb/copy_function/mod.rs b/vortex-duckdb/src/duckdb/copy_function/mod.rs index 697e965eccf..30be70b91a4 100644 --- a/vortex-duckdb/src/duckdb/copy_function/mod.rs +++ b/vortex-duckdb/src/duckdb/copy_function/mod.rs @@ -104,8 +104,7 @@ impl DatabaseRef { vtab.init_local = Some(local_callback::); vtab.copy_to_sink = Some(copy_to_sink_callback::); vtab.copy_to_finalize = Some(copy_to_finalize_callback::); - vtab.copy_to_get_written_statistics = - Some(copy_to_get_written_statistics_callback::); + vtab.copy_to_get_written_statistics = Some(copy_to_get_written_statistics_callback::); duckdb_try!( unsafe { cpp::duckdb_vx_copy_func_register_vtab_one(self.as_ptr()) }, From c337153ca12caa4a190a7deb5c87b18cdf40c060 Mon Sep 17 00:00:00 2001 From: Nicholas Gates Date: Wed, 6 May 2026 17:53:27 -0400 Subject: [PATCH 3/3] docs(agents): reframe vortex-duckdb FFI guidance as positive instructions Move the lessons learned out of the "Common Mistakes" do-not list into a dedicated "Native FFI in vortex-duckdb" section that states what to do. Signed-off-by: Nicholas Gates Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Nicholas Gates --- AGENTS.md | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index cee456acd8f..68e14a8ffce 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -159,14 +159,23 @@ Check new and modified lines against this list before finishing: - Updating expected test output to match buggy behavior without independently verifying the intended semantics. - Silently reducing the scope of an approved plan when implementation is harder than expected. -- Skipping `cargo +nightly fmt --all -- --check` after editing Rust code. CI's `Rust Lint - Format` - step uses the nightly toolchain and rejects diffs even when stable `cargo fmt` looks clean. -- For `vortex-duckdb` C/C++ FFI changes: assuming a DuckDB callback signature from memory or - upstream docs instead of reading the bundled DuckDB headers under - `target/*/build/vortex-duckdb-*/out/duckdb-source-v*/duckdb-*/src/include/duckdb/` for the - exact `typedef` (e.g. `copy_to_get_written_statistics_t`). DuckDB is vendored at a pinned - version, so the bundled headers are authoritative. A wrong shim signature only fails at C++ - compile time inside `build.rs`, so always run `cargo build -p vortex-duckdb` before stopping. + +## Native FFI in vortex-duckdb + +`vortex-duckdb` vendors a pinned DuckDB version and compiles a C++ shim under +`vortex-duckdb/cpp/` from `build.rs`. When writing or modifying any callback that DuckDB +invokes through a C/C++ function pointer: + +- Treat the bundled DuckDB headers as the authoritative reference for callback signatures. + Read the `typedef` directly from + `target/*/build/vortex-duckdb-*/out/duckdb-source-v*/duckdb-*/src/include/duckdb/` + (for example `function/copy_function.hpp` for `copy_to_get_written_statistics_t`). + Do not rely on memory or upstream docs, which may describe a different DuckDB version. +- Run `cargo build -p vortex-duckdb` before stopping. C++ signature mismatches only surface + at C++ compile time inside `build.rs`, so they are invisible to Rust-only checks. +- Run `cargo +nightly fmt --all -- --check` to confirm formatting matches the nightly + toolchain CI uses; stable `cargo fmt` may accept code that the nightly `Rust Lint - Format` + step rejects. ## Summaries