From 5e0eab0ab9f8e8082935c4ff59e81be866ca3bd5 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Thu, 12 Feb 2026 16:34:57 -0500 Subject: [PATCH 01/19] basic metastore integrationa and distributed planning --- quickwit/Cargo.lock | 1665 ++++++++++++++++- quickwit/Cargo.toml | 6 + quickwit/quickwit-datafusion/Cargo.toml | 38 + .../quickwit-datafusion/TODO-Datafusion.md | 91 + quickwit/quickwit-datafusion/src/lib.rs | 7 + .../quickwit-datafusion/src/split_opener.rs | 113 ++ .../quickwit-datafusion/src/table_provider.rs | 132 ++ quickwit/quickwit-datafusion/src/worker.rs | 38 + .../tests/distributed_join_plan.rs | 241 +++ .../quickwit-datafusion/tests/join_plan.rs | 331 ++++ .../quickwit-datafusion/tests/single_node.rs | 252 +++ 11 files changed, 2891 insertions(+), 23 deletions(-) create mode 100644 quickwit/quickwit-datafusion/Cargo.toml create mode 100644 quickwit/quickwit-datafusion/TODO-Datafusion.md create mode 100644 quickwit/quickwit-datafusion/src/lib.rs create mode 100644 quickwit/quickwit-datafusion/src/split_opener.rs create mode 100644 quickwit/quickwit-datafusion/src/table_provider.rs create mode 100644 quickwit/quickwit-datafusion/src/worker.rs create mode 100644 quickwit/quickwit-datafusion/tests/distributed_join_plan.rs create mode 100644 quickwit/quickwit-datafusion/tests/join_plan.rs create mode 100644 quickwit/quickwit-datafusion/tests/single_node.rs diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 21f1a789bec..f5d809debf8 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -77,6 +77,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "const-random", "getrandom 0.3.4", "once_cell", "serde", @@ -108,6 +109,21 @@ dependencies = [ "equator", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "alloca" version = "0.4.0" @@ -213,6 +229,24 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "arc-swap" version = "1.8.0" @@ -222,12 +256,261 @@ dependencies = [ "rustversion", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "arrow" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = 
"57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-cast" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num-integer", + "num-traits", +] + 
+[[package]] +name = "arrow-flight" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58c5b083668e6230eae3eab2fc4b5fb989974c845d0aa538dde61a4327c78675" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-schema", + "base64 0.22.1", + "bytes", + "futures", + "prost 0.14.1", + "prost-types 0.14.1", + "tonic 0.14.2", + "tonic-prost", +] + +[[package]] +name = "arrow-ipc" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", + "lz4_flex 0.12.0", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.13.0", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +dependencies = [ + "serde_core", + "serde_json", +] + +[[package]] +name = "arrow-select" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num-traits", +] + +[[package]] +name = "arrow-string" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + [[package]] name = "ascii-canvas" version = "4.0.0" @@ -1186,6 +1469,19 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d809780667f4410e7c41b07f52439b94d2bdf8528eeedc287fa38d3b7f95d82" +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "bindgen" version = "0.72.1" @@ -1261,6 +1557,29 @@ dependencies = [ "crunchy", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq 0.4.2", + "cpufeatures", +] + [[package]] name = 
"block-buffer" version = "0.10.4" @@ -1332,6 +1651,27 @@ dependencies = [ "rand 0.8.5", ] +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bs58" version = "0.5.1" @@ -1402,6 +1742,15 @@ dependencies = [ "bytes", ] +[[package]] +name = "bzip2" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" +dependencies = [ + "libbz2-rs-sys", +] + [[package]] name = "camino" version = "1.2.2" @@ -1740,6 +2089,16 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "unicode-segmentation", + "unicode-width 0.2.2", +] + [[package]] name = "community-id" version = "0.2.4" @@ -1757,8 +2116,10 @@ version = "0.4.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0f7ac3e5b97fdce45e8922fb05cae2c37f7bbd63d30dd94821dacfd8f3f2bf2" dependencies = [ + "bzip2", "compression-core", "flate2", + "liblzma", "memchr", "zstd", "zstd-safe", @@ -1779,6 +2140,18 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + 
"once_cell", + "windows-sys 0.59.0", +] + [[package]] name = "console" version = "0.16.2" @@ -1838,12 +2211,44 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + [[package]] name = "const_fn" version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "convert_case" version = "0.7.1" @@ -2276,12 +2681,747 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" [[package]] -name = "dbl" -version = "0.3.2" +name = "datafusion" +version = "52.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd2735a791158376708f9347fe8faba9667589d82427ef3aed6794a8981de3d9" +checksum = "d12ee9fdc6cdb5898c7691bb994f0ba606c4acc93a2258d78bb9f26ff8158bb3" dependencies = [ - "generic-array", + "arrow", + 
"arrow-schema", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "flate2", + "futures", + "itertools 0.14.0", + "liblzma", + "log", + "object_store", + "parking_lot 0.12.5", + "parquet", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "462dc9ef45e5d688aeaae49a7e310587e81b6016b9d03bace5626ad0043e5a9e" +dependencies = [ + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot 0.12.5", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b96dbf1d728fc321817b744eb5080cdd75312faa6980b338817f68f3caa4208" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + 
"datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "itertools 0.14.0", + "log", + "object_store", +] + +[[package]] +name = "datafusion-common" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3237a6ff0d2149af4631290074289cae548c9863c885d821315d54c6673a074a" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc", + "chrono", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "libc", + "log", + "object_store", + "parquet", + "paste", + "recursive", + "sqlparser", + "tokio", + "web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70b5e34026af55a1bfccb1ef0a763cf1f64e77c696ffcf5a128a278c31236528" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b2a6be734cc3785e18bbf2a7f2b22537f6b9fb960d79617775a51568c281842" +dependencies = [ + "arrow", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "flate2", + "futures", + "glob", + "itertools 0.14.0", + "liblzma", + "log", + "object_store", + "rand 0.9.2", + "tokio", + "tokio-util", + "url", + "zstd", +] + +[[package]] +name = "datafusion-datasource-arrow" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1739b9b07c9236389e09c74f770e88aff7055250774e9def7d3f4f56b3dcc7be" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + 
"datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c73bc54b518bbba7c7650299d07d58730293cfba4356f6f428cc94c20b7600" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37812c8494c698c4d889374ecfabbff780f1f26d9ec095dd1bddfc2a8ca12559" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-parquet" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2210937ecd9f0e824c397e73f4b5385c97cd1aff43ab2b5836fcfd2d321523fb" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "datafusion-session", + "futures", + 
"itertools 0.14.0", + "log", + "object_store", + "parking_lot 0.12.5", + "parquet", + "tokio", +] + +[[package]] +name = "datafusion-distributed" +version = "0.1.0" +source = "git+https://github.com/datafusion-contrib/datafusion-distributed#47e132e8c79ca5e24579b259b0572b8e5336681b" +dependencies = [ + "arrow", + "arrow-flight", + "arrow-ipc", + "arrow-select", + "async-trait", + "bytes", + "chrono", + "crossbeam-queue", + "dashmap 6.1.0", + "datafusion", + "datafusion-proto", + "delegate", + "futures", + "http 1.4.0", + "hyper-util", + "insta", + "itertools 0.14.0", + "moka", + "object_store", + "parquet", + "pin-project", + "pretty_assertions", + "prost 0.14.1", + "rand 0.9.2", + "reqwest", + "sketches-ddsketch", + "tokio", + "tokio-stream", + "tokio-util", + "tonic 0.14.2", + "tonic-prost", + "tower 0.5.2", + "tpchgen", + "tpchgen-arrow", + "url", + "uuid", + "zip", +] + +[[package]] +name = "datafusion-doc" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c825f969126bc2ef6a6a02d94b3c07abff871acf4d6dd759ce1255edb7923ce" + +[[package]] +name = "datafusion-execution" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa03ef05a2c2f90dd6c743e3e111078e322f4b395d20d4b4d431a245d79521ae" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "dashmap 6.1.0", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot 0.12.5", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef33934c1f98ee695cc51192cc5f9ed3a8febee84fdbcd9131bf9d3a9a78276f" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 
2.13.0", + "itertools 0.14.0", + "paste", + "recursive", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "000c98206e3dd47d2939a94b6c67af4bfa6732dd668ac4fafdbde408fd9134ea" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.13.0", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "379b01418ab95ca947014066248c22139fe9af9289354de10b445bd000d5d276" +dependencies = [ + "arrow", + "arrow-buffer", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + "chrono-tz", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "num-traits", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd00d5454ba4c3f8ebbd04bd6a6a9dc7ced7c56d883f70f2076c188be8459e4c" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aec06b380729a87210a4e11f555ec2d729a328142253f8d557b87593622ecc9f" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "52.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "904f48d45e0f1eb7d0eb5c0f80f2b5c6046a85454364a6b16a2e0b46f62e7dff" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9a0d20e2b887e11bee24f7734d780a2588b925796ac741c3118dd06d5aa77f0" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot 0.12.5", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3414b0a07e39b6979fe3a69c7aa79a9f1369f1d5c8e52146e66058be1b285ee" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bf2feae63cd4754e31add64ce75cae07d015bce4bb41cd09872f93add32523a" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4fe888aeb6a095c4bcbe8ac1874c4b9a4c7ffa2ba849db7922683ba20875aaf" +dependencies = [ + "datafusion-doc", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "datafusion-optimizer" 
+version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a6527c063ae305c11be397a86d8193936f4b84d137fe40bd706dfc178cf733c" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "indexmap 2.13.0", + "itertools 0.14.0", + "log", + "recursive", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bb028323dd4efd049dd8a78d78fe81b2b969447b39c51424167f973ac5811d9" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "parking_lot 0.12.5", + "paste", + "petgraph 0.8.3", + "recursive", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78fe0826aef7eab6b4b61533d811234a7a9e5e458331ebbf94152a51fc8ab433" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfccd388620734c661bd8b7ca93c44cdd59fecc9b550eea416a78ffcbb29475f" +dependencies = [ + "ahash", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "parking_lot 0.12.5", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bde5fa10e73259a03b705d5fddc136516814ab5f441b939525618a4070f5a059" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools 0.14.0", + "recursive", +] + +[[package]] +name = "datafusion-physical-plan" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1098760fb29127c24cc9ade3277051dc73c9ed0ac0131bd7bcd742e0ad7470" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "log", + "parking_lot 0.12.5", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-proto" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cf75daf56aa6b1c6867cc33ff0fb035d517d6d06737fd355a3e1ef67cba6e7a" +dependencies = [ + "arrow", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-datasource-parquet", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-table", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-proto-common", + "object_store", + "prost 0.14.1", +] + +[[package]] +name = "datafusion-proto-common" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"12a0cb3cce232a3de0d14ef44b58a6537aeb1362cfb6cf4d808691ddbb918956" +dependencies = [ + "arrow", + "datafusion-common", + "prost 0.14.1", +] + +[[package]] +name = "datafusion-pruning" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64d0fef4201777b52951edec086c21a5b246f3c82621569ddb4a26f488bc38a9" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", +] + +[[package]] +name = "datafusion-session" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f71f1e39e8f2acbf1c63b0e93756c2e970a64729dab70ac789587d6237c4fde0" +dependencies = [ + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot 0.12.5", +] + +[[package]] +name = "datafusion-sql" +version = "52.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f44693cfcaeb7a9f12d71d1c576c3a6dc025a12cef209375fa2d16fb3b5670ee" +dependencies = [ + "arrow", + "bigdecimal", + "chrono", + "datafusion-common", + "datafusion-expr", + "indexmap 2.13.0", + "log", + "recursive", + "regex", + "sqlparser", +] + +[[package]] +name = "dbl" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd2735a791158376708f9347fe8faba9667589d82427ef3aed6794a8981de3d9" +dependencies = [ + "generic-array", ] [[package]] @@ -2311,6 +3451,23 @@ dependencies = [ "uuid", ] +[[package]] +name = "deflate64" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26bf8fc351c5ed29b5c2f0cbbac1b209b74f60ecd62e675a998df72c49af5204" + +[[package]] +name = "delegate" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "der" version = "0.6.1" @@ -2353,13 +3510,24 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "dialoguer" version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25f104b501bf2364e78d0d3974cbc774f738f5865306ed128e1e0d7499c0ad96" dependencies = [ - "console", + "console 0.16.2", "shell-words", ] @@ -2897,6 +4065,16 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags 2.10.0", + "rustc_version", +] + [[package]] name = "flate2" version = "1.1.5" @@ -3431,6 +4609,7 @@ checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", + "num-traits", "zerocopy", ] @@ -3816,9 +4995,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3991,7 +5172,7 @@ version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88" dependencies = [ - "console", + "console 0.16.2", "portable-atomic", "unicode-width 0.2.2", "unit-prefix", @@ -4065,6 +5246,19 @@ dependencies = [ "generic-array", ] +[[package]] +name = 
"insta" +version = "1.46.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e82db8c87c7f1ccecb34ce0c24399b8a73081427f3c7c50a5d597925356115e4" +dependencies = [ + "console 0.15.11", + "once_cell", + "regex", + "similar", + "tempfile", +] + [[package]] name = "instant" version = "0.1.13" @@ -4077,6 +5271,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + [[package]] name = "inventory" version = "0.3.21" @@ -4308,7 +5508,7 @@ dependencies = [ "ena", "itertools 0.14.0", "lalrpop-util", - "petgraph", + "petgraph 0.7.1", "regex", "regex-syntax", "sha3", @@ -4343,6 +5543,69 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libbz2-rs-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + [[package]] name = "libc" version = "0.2.180" @@ -4350,13 +5613,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] -name = "libloading" -version = "0.8.9" +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link 0.2.1", +] + +[[package]] +name = "liblzma" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73c36d08cad03a3fbe2c4e7bb3a9e84c57e4ee4135ed0b065cade3d98480c648" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" dependencies = [ - "cfg-if", - "windows-link 0.2.1", + "cc", + "libc", + "pkg-config", ] [[package]] @@ -4512,6 +5795,19 @@ name = "lz4_flex" version = "0.12.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lzma-rust2" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c60a23ffb90d527e23192f1246b14746e2f7f071cb84476dd879071696c18a4a" +dependencies = [ + "crc", + "sha2", +] [[package]] name = "matchers" @@ -5066,6 +6362,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "http 1.4.0", + "humantime", + "itertools 0.14.0", + "parking_lot 0.12.5", + "percent-encoding", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + [[package]] name = "octseq" version = "0.5.2" @@ -5403,7 +6723,6 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "ownedbytes" version = "0.9.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "stable_deref_trait", ] @@ -5520,6 +6839,43 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "parquet" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64 0.22.1", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.12.0", + "num-bigint", + "num-integer", + "num-traits", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + 
"tokio", + "twox-hash", + "zstd", +] + [[package]] name = "parse-size" version = "1.1.0" @@ -5651,6 +7007,18 @@ dependencies = [ "indexmap 2.13.0", ] +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "serde", +] + [[package]] name = "phf" version = "0.12.1" @@ -5961,6 +7329,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppmd-rust" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efca4c95a19a79d1c98f791f10aebd5c1363b473244630bb7dbde1dc98455a24" + [[package]] name = "pprof" version = "0.15.0" @@ -6222,7 +7596,7 @@ dependencies = [ "log", "multimap", "once_cell", - "petgraph", + "petgraph 0.7.1", "prettyplease", "prost 0.13.5", "prost-types 0.13.5", @@ -6242,7 +7616,7 @@ dependencies = [ "log", "multimap", "once_cell", - "petgraph", + "petgraph 0.7.1", "prettyplease", "prost 0.14.1", "prost-types 0.14.1", @@ -6323,6 +7697,16 @@ version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "publicsuffix" version = "2.3.0" @@ -6729,6 +8113,32 @@ dependencies = [ "ulid", ] +[[package]] +name = "quickwit-datafusion" +version = "0.8.0" +dependencies = [ + "anyhow", + "arrow", + "async-trait", + "dashmap 6.1.0", + "datafusion", + "datafusion-datasource", + "datafusion-distributed", + 
"datafusion-physical-plan", + "datafusion-proto", + "futures", + "prost 0.14.1", + "quickwit-common", + "quickwit-metastore", + "quickwit-proto", + "serde", + "serde_json", + "tantivy", + "tantivy-datafusion", + "tokio", + "tracing", +] + [[package]] name = "quickwit-datetime" version = "0.8.0" @@ -7621,6 +9031,26 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.114", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -7998,6 +9428,16 @@ dependencies = [ "walkdir", ] +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + [[package]] name = "rust_decimal" version = "1.39.0" @@ -8397,6 +9837,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f97841a747eef040fcd2e7b3b9a220a7205926e60488e673d9e4926d27772ce5" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -8733,6 +10179,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" +[[package]] +name = "similar" +version = "2.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" + [[package]] name = "simple_asn1" version = "0.6.3" @@ -8906,6 +10358,28 @@ dependencies = [ "der 0.7.10", ] +[[package]] +name = "sqlparser" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +dependencies = [ + "log", + "recursive", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "sqlx" version = "0.8.6" @@ -9106,6 +10580,19 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -9249,6 +10736,27 @@ dependencies = [ "nom 8.0.0", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] 
name = "tabled" version = "0.20.0" @@ -9284,7 +10792,6 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.26.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "aho-corasick", "arc-swap", @@ -9314,6 +10821,7 @@ dependencies = [ "oneshot", "rayon", "regex", + "rust-stemmers", "rustc-hash", "serde", "serde_json", @@ -9339,7 +10847,6 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.9.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "bitpacking", ] @@ -9347,7 +10854,6 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "downcast-rs", "fastdivide", @@ -9362,7 +10868,6 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.10.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "async-trait", "byteorder", @@ -9371,6 +10876,26 @@ dependencies = [ "time", ] +[[package]] +name = "tantivy-datafusion" +version = "0.1.0" +dependencies = [ + "arrow", + "async-trait", + "datafusion", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-proto", + "futures", + "prost 0.13.5", + "serde", + "serde_json", + "tantivy", +] + [[package]] name = "tantivy-fst" version = "0.5.0" @@ -9385,7 +10910,6 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.25.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "fnv", "nom 7.1.3", @@ -9397,7 +10921,6 @@ dependencies = [ [[package]] name = 
"tantivy-sstable" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "futures-util", "itertools 0.14.0", @@ -9410,7 +10933,6 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "murmurhash32", "tantivy-common", @@ -9419,7 +10941,6 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=98ebbf9#98ebbf922d5715941afa5afd1fc817a8c81ded97" dependencies = [ "serde", ] @@ -9526,6 +11047,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float 2.10.1", +] + [[package]] name = "tikv-jemalloc-ctl" version = "0.6.1" @@ -9601,6 +11133,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -9996,6 +11537,20 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" +[[package]] +name = "tpchgen" +version = "2.0.1" +source = "git+https://github.com/clflushopt/tpchgen-rs?rev=e83365a5a9101906eb9f78c5607b83bc59849acf#e83365a5a9101906eb9f78c5607b83bc59849acf" + +[[package]] +name = "tpchgen-arrow" +version = "2.0.1" +source = 
"git+https://github.com/clflushopt/tpchgen-rs?rev=e83365a5a9101906eb9f78c5607b83bc59849acf#e83365a5a9101906eb9f78c5607b83bc59849acf" +dependencies = [ + "arrow", + "tpchgen", +] + [[package]] name = "tracing" version = "0.1.44" @@ -10917,6 +12472,17 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link 0.2.1", + "windows-result 0.4.1", + "windows-strings 0.5.1", +] + [[package]] name = "windows-result" version = "0.3.4" @@ -11346,6 +12912,20 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] [[package]] name = "zerotrie" @@ -11380,6 +12960,33 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "zip" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2a05c7c36fde6c09b08576c9f7fb4cda705990f73b58fe011abf7dfb24168b" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq 0.3.1", + "crc32fast", + "deflate64", + "flate2", + "getrandom 0.3.4", + "hmac", + "indexmap 2.13.0", + "lzma-rust2", + "memchr", + "pbkdf2", + "ppmd-rust", + "sha1", + "time", + "zeroize", + "zopfli", + "zstd", +] + [[package]] name = "zlib-rs" version = "0.5.5" @@ -11392,6 +12999,18 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fc5a66a20078bf1251bde995aa2fdcc4b800c70b5d92dd2c62abc5c60f679f8" +[[package]] +name = "zopfli" 
+version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + [[package]] name = "zstd" version = "0.13.3" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index c2e8ec62dc3..9b5ee5bc90d 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -10,6 +10,7 @@ members = [ "quickwit-common", "quickwit-config", "quickwit-control-plane", + "quickwit-datafusion", "quickwit-datetime", "quickwit-directories", "quickwit-doc-mapper", @@ -47,6 +48,7 @@ default-members = [ "quickwit-common", "quickwit-config", "quickwit-control-plane", + "quickwit-datafusion", "quickwit-datetime", "quickwit-directories", "quickwit-doc-mapper", @@ -335,6 +337,7 @@ quickwit-codegen-example = { path = "quickwit-codegen/example" } quickwit-common = { path = "quickwit-common" } quickwit-config = { path = "quickwit-config" } quickwit-control-plane = { path = "quickwit-control-plane" } +quickwit-datafusion = { path = "quickwit-datafusion" } quickwit-datetime = { path = "quickwit-datetime" } quickwit-directories = { path = "quickwit-directories" } quickwit-doc-mapper = { path = "quickwit-doc-mapper" } @@ -371,6 +374,9 @@ encoding_rs = "=0.8.35" [patch.crates-io] sasl2-sys = { git = "https://github.com/quickwit-oss/rust-sasl/", rev = "085a4c7" } +[patch.'https://github.com/quickwit-oss/tantivy/'] +tantivy = { path = "/Users/alex.bianchi/oss/tantivy/.worktrees/bianchi/tantivydf" } + ## this patched version of tracing helps better understand what happens inside futures (when are ## they polled, how long does poll take...) 
#tracing = { git = "https://github.com/trinity-1686a/tracing.git", rev = "6806cac3" } diff --git a/quickwit/quickwit-datafusion/Cargo.toml b/quickwit/quickwit-datafusion/Cargo.toml new file mode 100644 index 00000000000..67b48d7722e --- /dev/null +++ b/quickwit/quickwit-datafusion/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "quickwit-datafusion" +description = "Distributed DataFusion execution for Quickwit" +version.workspace = true +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow = { workspace = true } +async-trait = { workspace = true } +dashmap = "6" +futures = { workspace = true } +prost = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } + +tantivy = { workspace = true } +tantivy-datafusion = { path = "/Users/alex.bianchi/oss/tantivy/.worktrees/bianchi/tantivydf/tantivy-datafusion" } + +quickwit-metastore = { workspace = true } +quickwit-proto = { workspace = true } + +datafusion = "52" +datafusion-datasource = "52" +datafusion-physical-plan = "52" +datafusion-proto = "52" +datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed" } +arrow = { version = "57", features = ["prettyprint"] } + +[dev-dependencies] +datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed", features = ["integration"] } +dashmap = "6" +quickwit-common = { workspace = true, features = ["testsuite"] } +quickwit-proto = { workspace = true, features = ["testsuite"] } +quickwit-metastore = { workspace = true, features = ["testsuite"] } +tokio = { workspace = true, features = ["test-util", "macros"] } diff --git a/quickwit/quickwit-datafusion/TODO-Datafusion.md b/quickwit/quickwit-datafusion/TODO-Datafusion.md new file mode 100644 index 00000000000..c2662d247fd --- /dev/null +++ b/quickwit/quickwit-datafusion/TODO-Datafusion.md @@ -0,0 +1,91 @@ +# TODO: Distributed DataFusion Execution + 
+## Plan Quality + +### Unnecessary RepartitionExec in join plans +The distributed plans show `RepartitionExec: Hash([_doc_id, _segment_ord], 3)` between every DataSource and its HashJoin. This is wrong — inv, f, and d within a single split are already co-partitioned by segment (they declare `Hash([_doc_id, _segment_ord], N)` partitioning). The optimizer shouldn't add shuffles for co-partitioned joins. + +Likely cause: `target_partitions` on the distributed context doesn't match the segment count, so DF thinks it needs to repartition. With `target_partitions=1` locally the plan uses `CollectLeft` mode and no repartitions. Need to either: +- Set `target_partitions` = segment count on the distributed context +- Or teach the distributed planner to respect co-partitioned DataSource declarations + +### Explicit UNION ALL vs plan-level decomposition +Currently each split's join plan is written as explicit SQL `UNION ALL`. This means the planner sees N copies of the same join pattern. A better approach: +- `QuickwitTableProvider` registers a single logical table backed by all splits +- The planner produces one join plan +- The distributed optimizer decomposes it by split at the physical level +- Each split's join subtree becomes a task assigned to a worker + +This would let df-distributed's `DistributedPhysicalOptimizerRule` handle the split-to-worker mapping natively instead of the SQL manually encoding it. + +### CollectLeft vs Partitioned join mode +With `target_partitions=1` the local plan uses `CollectLeft` (broadcast build side) which is correct for the inv⋈f pattern — the inverted index result is small. The distributed plan uses `Partitioned` mode because the context has higher target_partitions. Need to control this so the join strategy matches the data characteristics. + +## Worker/Split/Segment Mapping + +### How many workers per split? Per segment? +Currently: 1 worker per split. Each split may have multiple segments. 
Segments within a split are co-partitioned and joined locally on the worker. + +Open questions: +- Should a split with many segments be split across multiple workers? (probably not — segments within a split share the same index and need co-partitioned joins) +- Should multiple small splits be assigned to the same worker? (yes — df-distributed's task assignment should batch them) +- How does the coordinator know how many workers are available? (WorkerResolver::get_urls() — need to implement for Quickwit cluster) + +### Worker discovery +`start_localhost_context` hardcodes localhost workers. Production needs: +- `WorkerResolver` backed by Quickwit's cluster membership (Chitchat) +- Workers = Quickwit searcher nodes +- Split-to-worker affinity based on cache locality + +## Metastore Integration + +### QuickwitTableProvider doesn't query the metastore +Currently takes a hand-built `Vec>`. Production path: +- `QuickwitTableProvider::new(metastore, index_id)` +- At `scan()` time, call `metastore.list_splits()` to discover splits +- Create `SplitIndexOpener` per split from split metadata +- Build per-split tantivy-df providers and compose the plan + +### SplitIndexOpener doesn't open real splits +Currently backed by an in-memory `DashMap`. Production path: +- `open()` calls `open_index_with_caches()` from quickwit-search +- Downloads split bundle from object storage (S3/GCS/Azure) +- Opens tantivy index from the local cache or downloaded bundle +- Returns the opened `Index` + +## Codec Gaps + +### Pushed filters lost across serialization +`FastFieldDataSource` claims `PushedDown::Yes` for all filters, so DF removes the `FilterExec`. But the codec doesn't serialize pushed filters (they're `Arc`, not trivially serializable). On the worker, the reconstructed DataSource has no filters. 
+ +Options: +- Serialize pushed filters via DF's PhysicalExpr proto support +- Change tantivy-df to return `PushedDown::No` so DF keeps FilterExec in the plan (it serializes fine as a built-in node) +- Encode the filter expressions as Expr in the proto and pass to scan() + +### Aggregation pushdown not tested in distributed +tantivy-df has `AggPushdown` that replaces `AggregateExec` with `TantivyAggregateExec`. This is a custom ExecutionPlan node that the codec doesn't handle yet. Need to: +- Add `TantivyAggregateExec` encoding to `TantivyCodec` +- Or make it a DataSource variant the existing codec pattern handles +- Test partial aggregation on workers + final merge on coordinator + +## Optimizer Rules in Distributed Context + +### tantivy-df optimizer rules not registered on workers +`FastFieldFilterPushdown`, `TopKPushdown`, `AggPushdown`, `OrdinalGroupByOptimization` are registered on the coordinator's session but not on workers. Workers rebuild the plan from the codec, so they'd need these rules if the plan is further optimized on the worker side. + +Currently this doesn't matter because the plan is fully optimized on the coordinator before distribution. But if df-distributed ever does worker-side re-optimization, the rules need to be registered in `build_worker_session_builder`. + +## Testing + +### Multi-segment splits +All current tests use single-segment splits (one commit). Need tests with multi-segment splits to verify: +- Segment-level co-partitioning in joins (N partitions per split) +- Correct partition mapping across inv/f/d providers +- Chunking behavior with target_partitions > segment count + +### Real storage backend +All tests use `Index::create_in_ram`. 
Need integration tests with: +- Split bundles on local filesystem +- `open_index_with_caches()` in the opener +- Split download + cache warming diff --git a/quickwit/quickwit-datafusion/src/lib.rs b/quickwit/quickwit-datafusion/src/lib.rs new file mode 100644 index 00000000000..d8f418c9adc --- /dev/null +++ b/quickwit/quickwit-datafusion/src/lib.rs @@ -0,0 +1,7 @@ +pub mod split_opener; +pub mod table_provider; +pub mod worker; + +pub use split_opener::{SplitIndexOpener, SplitRegistry}; +pub use table_provider::{OpenerFactory, QuickwitTableProvider}; +pub use worker::build_worker_session_builder; diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs new file mode 100644 index 00000000000..98aba9cce14 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -0,0 +1,113 @@ +use std::any::Any; +use std::fmt; +use std::sync::Arc; + +use async_trait::async_trait; +use dashmap::DashMap; +use datafusion::common::Result; +use datafusion::error::DataFusionError; +use tantivy::Index; +use tantivy_datafusion::IndexOpener; + +/// Registry of opened tantivy indexes, keyed by split ID. +/// +/// For integration tests this is populated before query execution. +/// In production this would be replaced by `open_index_with_caches()`. +pub type SplitRegistry = DashMap; + +/// An [`IndexOpener`] backed by an in-memory [`SplitRegistry`]. +/// +/// Planning-time metadata (schema, segment sizes) is stored inline so +/// that the opener can answer schema/partition queries without touching +/// the registry. The actual [`open`](IndexOpener::open) call looks up +/// the registry at execution time. 
+#[derive(Clone)] +pub struct SplitIndexOpener { + split_id: String, + registry: Arc, + tantivy_schema: tantivy::schema::Schema, + segment_sizes: Vec, +} + +impl SplitIndexOpener { + pub fn new( + split_id: String, + registry: Arc, + tantivy_schema: tantivy::schema::Schema, + segment_sizes: Vec, + ) -> Self { + Self { + split_id, + registry, + tantivy_schema, + segment_sizes, + } + } + + /// Build an opener by extracting schema and segment sizes from an + /// already-opened index, then inserting it into the registry. + pub fn from_index(split_id: String, index: Index, registry: Arc) -> Self { + let tantivy_schema = index.schema(); + let segment_sizes = index + .reader() + .map(|r| { + r.searcher() + .segment_readers() + .iter() + .map(|sr| sr.max_doc()) + .collect() + }) + .unwrap_or_default(); + registry.insert(split_id.clone(), index); + Self { + split_id, + registry, + tantivy_schema, + segment_sizes, + } + } + + pub fn split_id(&self) -> &str { + &self.split_id + } +} + +impl fmt::Debug for SplitIndexOpener { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SplitIndexOpener") + .field("split_id", &self.split_id) + .field("segment_sizes", &self.segment_sizes) + .finish() + } +} + +#[async_trait] +impl IndexOpener for SplitIndexOpener { + async fn open(&self) -> Result { + self.registry + .get(&self.split_id) + .map(|entry| entry.value().clone()) + .ok_or_else(|| { + DataFusionError::Execution(format!( + "split {} not found in registry", + self.split_id + )) + }) + } + + fn schema(&self) -> tantivy::schema::Schema { + self.tantivy_schema.clone() + } + + fn segment_sizes(&self) -> Vec { + self.segment_sizes.clone() + } + + fn identifier(&self) -> &str { + &self.split_id + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/quickwit/quickwit-datafusion/src/table_provider.rs b/quickwit/quickwit-datafusion/src/table_provider.rs new file mode 100644 index 00000000000..8af44d87f3d --- /dev/null +++ 
b/quickwit/quickwit-datafusion/src/table_provider.rs @@ -0,0 +1,132 @@ +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use async_trait::async_trait; +use datafusion::catalog::Session; +use datafusion::common::Result; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::Expr; +use datafusion::physical_plan::union::UnionExec; +use datafusion::physical_plan::ExecutionPlan; +use quickwit_metastore::{ + ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitMetadata, + SplitState, +}; +use quickwit_proto::metastore::{ + ListSplitsRequest, MetastoreService, MetastoreServiceClient, +}; +use quickwit_proto::types::IndexUid; +use tantivy_datafusion::{IndexOpener, TantivyTableProvider}; +use tokio::sync::Mutex; + +/// Factory that creates an [`IndexOpener`] from split metadata. +/// +/// Called at scan time for each split discovered from the metastore. +/// The returned opener defers actual index opening to execution time. +pub type OpenerFactory = + Arc Arc + Send + Sync>; + +/// A DataFusion table provider backed by a Quickwit index. +/// +/// At scan time, queries the metastore for published splits, creates +/// an [`IndexOpener`] per split via the provided factory, builds +/// per-split tantivy-df providers, and unions them. +pub struct QuickwitTableProvider { + index_uid: IndexUid, + metastore: Mutex, + opener_factory: OpenerFactory, + arrow_schema: SchemaRef, +} + +impl QuickwitTableProvider { + pub fn new( + index_uid: IndexUid, + metastore: MetastoreServiceClient, + opener_factory: OpenerFactory, + tantivy_schema: &tantivy::schema::Schema, + ) -> Self { + let arrow_schema = tantivy_datafusion::tantivy_schema_to_arrow(tantivy_schema); + Self { + index_uid, + metastore: Mutex::new(metastore), + opener_factory, + arrow_schema, + } + } + + /// List published splits from the metastore. 
+ async fn list_splits(&self) -> Result> { + let metastore = self.metastore.lock().await; + let query = ListSplitsQuery::for_index(self.index_uid.clone()) + .with_split_state(SplitState::Published); + + let request = ListSplitsRequest::try_from_list_splits_query(&query) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + let splits = metastore + .list_splits(request) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))? + .collect_splits_metadata() + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(splits) + } +} + +impl std::fmt::Debug for QuickwitTableProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QuickwitTableProvider") + .field("index_uid", &self.index_uid) + .field("arrow_schema", &self.arrow_schema) + .finish() + } +} + +#[async_trait] +impl TableProvider for QuickwitTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.arrow_schema.clone() + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + ) -> Result> { + let splits = self.list_splits().await?; + + if splits.is_empty() { + return Err(DataFusionError::Plan(format!( + "no published splits found for index {}", + self.index_uid + ))); + } + + let mut execs = Vec::with_capacity(splits.len()); + for split_meta in &splits { + let opener = (self.opener_factory)(split_meta); + let provider = TantivyTableProvider::from_opener(opener); + let exec = provider.scan(state, projection, filters, limit).await?; + execs.push(exec); + } + + if execs.len() == 1 { + return Ok(execs.into_iter().next().unwrap()); + } + UnionExec::try_new(execs) + } +} diff --git a/quickwit/quickwit-datafusion/src/worker.rs b/quickwit/quickwit-datafusion/src/worker.rs new file mode 100644 index 00000000000..1dfa3d6c8a6 --- /dev/null +++ 
b/quickwit/quickwit-datafusion/src/worker.rs @@ -0,0 +1,38 @@ +use std::sync::Arc; + +use datafusion::error::DataFusionError; +use datafusion::execution::SessionState; +use datafusion_distributed::{DistributedExt, WorkerQueryContext}; +use tantivy_datafusion::{OpenerMetadata, TantivyCodec}; + +use crate::split_opener::{SplitIndexOpener, SplitRegistry}; + +/// Build a worker session builder that registers tantivy-df's +/// [`TantivyCodec`] with the given [`SplitRegistry`]. +/// +/// Each worker gets its own registry populated with the splits it +/// should serve. The codec's opener factory reconstructs +/// [`SplitIndexOpener`]s from the serialized metadata. +pub fn build_worker_session_builder( + registry: Arc, +) -> impl Fn(WorkerQueryContext) -> std::pin::Pin< + Box> + Send>, +> + Clone + + Send + + Sync + + 'static { + move |ctx: WorkerQueryContext| { + let registry = registry.clone(); + Box::pin(async move { + let codec = TantivyCodec::new(move |meta: OpenerMetadata| { + Arc::new(SplitIndexOpener::new( + meta.identifier, + registry.clone(), + meta.tantivy_schema, + meta.segment_sizes, + )) as Arc + }); + Ok(ctx.builder.with_distributed_user_codec(codec).build()) + }) + } +} diff --git a/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs b/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs new file mode 100644 index 00000000000..8596f5d9a07 --- /dev/null +++ b/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs @@ -0,0 +1,241 @@ +//! Distributed join plans showing the full tantivy-df decomposition +//! with stage boundaries, network shuffles, and segment-level +//! co-partitioned joins across workers. 
+ +use std::sync::Arc; + +use arrow::array::{AsArray, RecordBatch}; +use arrow::datatypes::UInt64Type; +use datafusion::physical_plan::execute_stream; +use datafusion::prelude::*; +use datafusion_distributed::{display_plan_ascii, DistributedExt}; +use futures::TryStreamExt; +use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; +use tantivy::{Index, IndexWriter, TantivyDocument}; +use tantivy_datafusion::{ + full_text_udf, IndexOpener, OpenerMetadata, TantivyCodec, TantivyDocumentProvider, + TantivyInvertedIndexProvider, TantivyTableProvider, +}; + +use quickwit_datafusion::{SplitIndexOpener, SplitRegistry}; +use quickwit_datafusion::worker::build_worker_session_builder; + +fn create_index(docs: &[(u64, i64, f64, &str)]) -> Index { + let mut builder = SchemaBuilder::new(); + let id_f = builder.add_u64_field("id", FAST | STORED); + let score_f = builder.add_i64_field("score", FAST); + let price_f = builder.add_f64_field("price", FAST); + let cat_f = builder.add_text_field("category", TEXT | FAST | STORED); + let schema = builder.build(); + + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + for &(id, score, price, category) in docs { + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, id); + doc.add_i64(score_f, score); + doc.add_f64(price_f, price); + doc.add_text(cat_f, category); + writer.add_document(doc).unwrap(); + } + writer.commit().unwrap(); + index +} + +fn collect_batches(batches: &[RecordBatch]) -> RecordBatch { + arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap() +} + +fn register_split(ctx: &SessionContext, prefix: &str, opener: Arc) { + let o: Arc = opener; + ctx.register_table( + &format!("{prefix}_f"), + Arc::new(TantivyTableProvider::from_opener(o.clone())), + ) + .unwrap(); + ctx.register_table( + &format!("{prefix}_inv"), + Arc::new(TantivyInvertedIndexProvider::from_opener(o.clone())), + ) + .unwrap(); + ctx.register_table( + 
&format!("{prefix}_d"), + Arc::new(TantivyDocumentProvider::from_opener(o)), + ) + .unwrap(); +} + +fn setup_registry() -> (Arc, Vec>) { + let registry = Arc::new(SplitRegistry::new()); + + let idx1 = create_index(&[ + (1, 10, 1.5, "electronics"), + (2, 20, 2.5, "electronics"), + (3, 30, 3.5, "books"), + ]); + let idx2 = create_index(&[ + (4, 40, 4.5, "books"), + (5, 50, 5.5, "clothing"), + ]); + let idx3 = create_index(&[ + (6, 60, 6.5, "electronics"), + (7, 70, 7.5, "clothing"), + ]); + + let opener1 = Arc::new(SplitIndexOpener::from_index( + "split-1".to_string(), idx1, registry.clone(), + )); + let opener2 = Arc::new(SplitIndexOpener::from_index( + "split-2".to_string(), idx2, registry.clone(), + )); + let opener3 = Arc::new(SplitIndexOpener::from_index( + "split-3".to_string(), idx3, registry.clone(), + )); + + (registry, vec![opener1, opener2, opener3]) +} + +fn make_codec(registry: Arc) -> TantivyCodec { + TantivyCodec::new(move |meta: OpenerMetadata| { + Arc::new(SplitIndexOpener::new( + meta.identifier, + registry.clone(), + meta.tantivy_schema, + meta.segment_sizes, + )) as Arc + }) +} + +/// Full-text search across 3 splits: inv ⋈ f, distributed. 
+#[tokio::test] +async fn test_distributed_full_text_join_plan() { + let (registry, openers) = setup_registry(); + + let (mut ctx, _guard, _workers) = + datafusion_distributed::test_utils::localhost::start_localhost_context( + 3, + build_worker_session_builder(registry.clone()), + ) + .await; + + ctx.set_distributed_user_codec(make_codec(registry.clone())); + ctx.register_udf(full_text_udf()); + + for (i, opener) in openers.iter().enumerate() { + register_split(&ctx, &format!("s{}", i + 1), opener.clone()); + } + + let sql = "\ + SELECT id, price FROM ( \ + SELECT s1_f.id, s1_f.price \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id AND s1_f._segment_ord = s1_inv._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + UNION ALL \ + SELECT s2_f.id, s2_f.price \ + FROM s2_inv \ + JOIN s2_f ON s2_f._doc_id = s2_inv._doc_id AND s2_f._segment_ord = s2_inv._segment_ord \ + WHERE full_text(s2_inv.category, 'electronics') \ + UNION ALL \ + SELECT s3_f.id, s3_f.price \ + FROM s3_inv \ + JOIN s3_f ON s3_f._doc_id = s3_inv._doc_id AND s3_f._segment_ord = s3_inv._segment_ord \ + WHERE full_text(s3_inv.category, 'electronics') \ + ) ORDER BY id"; + + let df: DataFrame = ctx.sql(sql).await.unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + let plan_str = display_plan_ascii(plan.as_ref(), false); + + println!("=== Distributed full-text join plan (inv ⋈ f, 3 splits) ===\n{plan_str}\n"); + + assert!( + plan_str.contains("InvertedIndexDataSource"), + "should contain InvertedIndexDataSource\n\n{plan_str}" + ); + assert!( + plan_str.contains("FastFieldDataSource"), + "should contain FastFieldDataSource\n\n{plan_str}" + ); + + // Execute and verify + let stream = execute_stream(plan, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = collect_batches(&batches); + + // electronics: ids {1,2} (split-1) + {6} (split-3) + assert_eq!(batch.num_rows(), 3); + let ids = batch.column(0).as_primitive::(); + 
let mut id_values: Vec = (0..batch.num_rows()).map(|i| ids.value(i)).collect(); + id_values.sort(); + assert_eq!(id_values, vec![1, 2, 6]); +} + +/// Three-way join across 3 splits: inv ⋈ f ⋈ d, distributed. +#[tokio::test] +async fn test_distributed_three_way_join_plan() { + let (registry, openers) = setup_registry(); + + let (mut ctx, _guard, _workers) = + datafusion_distributed::test_utils::localhost::start_localhost_context( + 3, + build_worker_session_builder(registry.clone()), + ) + .await; + + ctx.set_distributed_user_codec(make_codec(registry.clone())); + ctx.register_udf(full_text_udf()); + + for (i, opener) in openers.iter().enumerate() { + register_split(&ctx, &format!("s{}", i + 1), opener.clone()); + } + + let sql = "\ + SELECT id, price, doc FROM ( \ + SELECT s1_f.id, s1_f.price, s1_d._document as doc \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id AND s1_f._segment_ord = s1_inv._segment_ord \ + JOIN s1_d ON s1_d._doc_id = s1_f._doc_id AND s1_d._segment_ord = s1_f._segment_ord \ + WHERE full_text(s1_inv.category, 'books') \ + UNION ALL \ + SELECT s2_f.id, s2_f.price, s2_d._document as doc \ + FROM s2_inv \ + JOIN s2_f ON s2_f._doc_id = s2_inv._doc_id AND s2_f._segment_ord = s2_inv._segment_ord \ + JOIN s2_d ON s2_d._doc_id = s2_f._doc_id AND s2_d._segment_ord = s2_f._segment_ord \ + WHERE full_text(s2_inv.category, 'books') \ + UNION ALL \ + SELECT s3_f.id, s3_f.price, s3_d._document as doc \ + FROM s3_inv \ + JOIN s3_f ON s3_f._doc_id = s3_inv._doc_id AND s3_f._segment_ord = s3_inv._segment_ord \ + JOIN s3_d ON s3_d._doc_id = s3_f._doc_id AND s3_d._segment_ord = s3_f._segment_ord \ + WHERE full_text(s3_inv.category, 'books') \ + ) ORDER BY id"; + + let df: DataFrame = ctx.sql(sql).await.unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + let plan_str = display_plan_ascii(plan.as_ref(), false); + + println!("=== Distributed three-way join plan (inv ⋈ f ⋈ d, 3 splits) ===\n{plan_str}\n"); + + 
assert!(plan_str.contains("InvertedIndexDataSource"), "{plan_str}"); + assert!(plan_str.contains("FastFieldDataSource"), "{plan_str}"); + assert!(plan_str.contains("DocumentDataSource"), "{plan_str}"); + + // Execute and verify + let stream = execute_stream(plan, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = collect_batches(&batches); + + // books: ids {3} (split-1) + {4} (split-2) + assert_eq!(batch.num_rows(), 2); + let ids = batch.column(0).as_primitive::(); + let mut id_values: Vec = (0..batch.num_rows()).map(|i| ids.value(i)).collect(); + id_values.sort(); + assert_eq!(id_values, vec![3, 4]); + + let docs = batch.column(2).as_string::(); + for i in 0..2 { + let doc: serde_json::Value = serde_json::from_str(docs.value(i)).unwrap(); + assert_eq!(doc["category"][0], "books"); + } +} diff --git a/quickwit/quickwit-datafusion/tests/join_plan.rs b/quickwit/quickwit-datafusion/tests/join_plan.rs new file mode 100644 index 00000000000..957b839df36 --- /dev/null +++ b/quickwit/quickwit-datafusion/tests/join_plan.rs @@ -0,0 +1,331 @@ +//! Tests showing the full tantivy-df join plan with all three provider +//! types per split: InvertedIndex ⋈ FastField ⋈ Document, joined on +//! (_doc_id, _segment_ord) with segment-level co-partitioning. +//! +//! This is what a real Quickwit query looks like decomposed into tantivy nodes. 
+ +use std::sync::Arc; + +use arrow::array::{AsArray, RecordBatch}; +use arrow::datatypes::{Float64Type, UInt64Type}; +use datafusion::prelude::*; +use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; +use tantivy::{Index, IndexWriter, TantivyDocument}; +use tantivy_datafusion::{ + full_text_udf, IndexOpener, TantivyDocumentProvider, TantivyInvertedIndexProvider, + TantivyTableProvider, +}; + +use quickwit_datafusion::{SplitIndexOpener, SplitRegistry}; + +fn create_index(docs: &[(u64, i64, f64, &str)]) -> Index { + let mut builder = SchemaBuilder::new(); + let id_f = builder.add_u64_field("id", FAST | STORED); + let score_f = builder.add_i64_field("score", FAST); + let price_f = builder.add_f64_field("price", FAST); + let cat_f = builder.add_text_field("category", TEXT | FAST | STORED); + let schema = builder.build(); + + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + for &(id, score, price, category) in docs { + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, id); + doc.add_i64(score_f, score); + doc.add_f64(price_f, price); + doc.add_text(cat_f, category); + writer.add_document(doc).unwrap(); + } + writer.commit().unwrap(); + index +} + +fn collect_batches(batches: &[RecordBatch]) -> RecordBatch { + arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap() +} + +fn plan_to_string(batches: &[RecordBatch]) -> String { + let batch = arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap(); + let plan_col = batch.column(1).as_string::(); + (0..batch.num_rows()) + .map(|i| plan_col.value(i)) + .collect::>() + .join("\n") +} + +/// Register all three tantivy-df providers for a single split. 
+/// +/// This is the decomposition: one tantivy index → three DF table providers: +/// - `{prefix}_f` — fast fields (columnar values) +/// - `{prefix}_inv` — inverted index (full-text search) +/// - `{prefix}_d` — document store (stored JSON) +/// +/// Joined on `(_doc_id, _segment_ord)` at query time. +fn register_split( + ctx: &SessionContext, + prefix: &str, + opener: Arc, +) { + let opener_dyn: Arc = opener; + ctx.register_table( + &format!("{prefix}_f"), + Arc::new(TantivyTableProvider::from_opener(opener_dyn.clone())), + ) + .unwrap(); + ctx.register_table( + &format!("{prefix}_inv"), + Arc::new(TantivyInvertedIndexProvider::from_opener(opener_dyn.clone())), + ) + .unwrap(); + ctx.register_table( + &format!("{prefix}_d"), + Arc::new(TantivyDocumentProvider::from_opener(opener_dyn)), + ) + .unwrap(); +} + +/// Set up 3 splits, each with all 3 providers registered. +fn setup() -> (SessionContext, Arc) { + let registry = Arc::new(SplitRegistry::new()); + + let idx1 = create_index(&[ + (1, 10, 1.5, "electronics"), + (2, 20, 2.5, "electronics"), + (3, 30, 3.5, "books"), + ]); + let idx2 = create_index(&[ + (4, 40, 4.5, "books"), + (5, 50, 5.5, "clothing"), + ]); + let idx3 = create_index(&[ + (6, 60, 6.5, "electronics"), + (7, 70, 7.5, "clothing"), + ]); + + let opener1 = Arc::new(SplitIndexOpener::from_index( + "split-1".to_string(), idx1, registry.clone(), + )); + let opener2 = Arc::new(SplitIndexOpener::from_index( + "split-2".to_string(), idx2, registry.clone(), + )); + let opener3 = Arc::new(SplitIndexOpener::from_index( + "split-3".to_string(), idx3, registry.clone(), + )); + + // Use 1 target partition per split so segment partitioning is 1:1. 
+ let config = SessionConfig::new().with_target_partitions(1); + let ctx = SessionContext::new_with_config(config); + ctx.register_udf(full_text_udf()); + + register_split(&ctx, "s1", opener1); + register_split(&ctx, "s2", opener2); + register_split(&ctx, "s3", opener3); + + (ctx, registry) +} + +// ── Full-text search: inv ⋈ f ────────────────────────────────────── + +#[tokio::test] +async fn test_full_text_join_plan() { + let (ctx, _) = setup(); + + let df = ctx + .sql( + "EXPLAIN \ + SELECT s1_f.id, s1_f.price \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id \ + AND s1_f._segment_ord = s1_inv._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + ORDER BY s1_f.id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let plan = plan_to_string(&batches); + + println!("=== Full-text join plan (single split) ===\n{plan}\n"); + + // The physical plan should show: + // - HashJoinExec on (_doc_id, _segment_ord) — segment-level co-partitioning + // - InvertedIndexDataSource on the build side (with query=true) + // - FastFieldDataSource on the probe side + assert!(plan.contains("HashJoinExec"), "expected HashJoinExec\n\n{plan}"); + assert!(plan.contains("InvertedIndexDataSource"), "expected InvertedIndexDataSource\n\n{plan}"); + assert!(plan.contains("FastFieldDataSource"), "expected FastFieldDataSource\n\n{plan}"); +} + +#[tokio::test] +async fn test_full_text_join_results() { + let (ctx, _) = setup(); + + let df = ctx + .sql( + "SELECT s1_f.id, s1_f.price \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id \ + AND s1_f._segment_ord = s1_inv._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + ORDER BY s1_f.id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + // split-1 has electronics at ids {1, 2} + assert_eq!(batch.num_rows(), 2); + let ids = batch.column(0).as_primitive::(); + assert_eq!(ids.value(0), 1); + 
assert_eq!(ids.value(1), 2); +} + +// ── Three-way join: inv ⋈ f ⋈ d ──────────────────────────────────── + +#[tokio::test] +async fn test_three_way_join_plan() { + let (ctx, _) = setup(); + + let df = ctx + .sql( + "EXPLAIN \ + SELECT s1_f.id, s1_f.price, s1_d._document \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id \ + AND s1_f._segment_ord = s1_inv._segment_ord \ + JOIN s1_d ON s1_d._doc_id = s1_f._doc_id \ + AND s1_d._segment_ord = s1_f._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + ORDER BY s1_f.id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let plan = plan_to_string(&batches); + + println!("=== Three-way join plan (inv ⋈ f ⋈ d, single split) ===\n{plan}\n"); + + // All three DataSource types in one plan + assert!(plan.contains("InvertedIndexDataSource"), "expected InvertedIndexDataSource\n\n{plan}"); + assert!(plan.contains("FastFieldDataSource"), "expected FastFieldDataSource\n\n{plan}"); + assert!(plan.contains("DocumentDataSource"), "expected DocumentDataSource\n\n{plan}"); + // Two joins + let join_count = plan.matches("HashJoinExec").count(); + assert!(join_count >= 2, "expected 2 HashJoinExec, got {join_count}\n\n{plan}"); +} + +#[tokio::test] +async fn test_three_way_join_results() { + let (ctx, _) = setup(); + + let df = ctx + .sql( + "SELECT s1_f.id, s1_f.price, s1_d._document \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id \ + AND s1_f._segment_ord = s1_inv._segment_ord \ + JOIN s1_d ON s1_d._doc_id = s1_f._doc_id \ + AND s1_d._segment_ord = s1_f._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + ORDER BY s1_f.id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + assert_eq!(batch.num_rows(), 2); + let ids = batch.column(0).as_primitive::(); + assert_eq!(ids.value(0), 1); + assert_eq!(ids.value(1), 2); + + // Documents should be JSON with id and category + let docs = 
batch.column(2).as_string::(); + let doc0: serde_json::Value = serde_json::from_str(docs.value(0)).unwrap(); + assert_eq!(doc0["id"][0], 1); + assert_eq!(doc0["category"][0], "electronics"); +} + +// ── Cross-split UNION of join plans ───────────────────────────────── + +#[tokio::test] +async fn test_cross_split_full_text_plan() { + let (ctx, _) = setup(); + + // Search "electronics" across all 3 splits via UNION ALL of per-split joins. + // This is how a real distributed Quickwit query decomposes. + let df = ctx + .sql( + "EXPLAIN \ + SELECT id, price FROM ( \ + SELECT s1_f.id, s1_f.price \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id AND s1_f._segment_ord = s1_inv._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + UNION ALL \ + SELECT s2_f.id, s2_f.price \ + FROM s2_inv \ + JOIN s2_f ON s2_f._doc_id = s2_inv._doc_id AND s2_f._segment_ord = s2_inv._segment_ord \ + WHERE full_text(s2_inv.category, 'electronics') \ + UNION ALL \ + SELECT s3_f.id, s3_f.price \ + FROM s3_inv \ + JOIN s3_f ON s3_f._doc_id = s3_inv._doc_id AND s3_f._segment_ord = s3_inv._segment_ord \ + WHERE full_text(s3_inv.category, 'electronics') \ + ) ORDER BY id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let plan = plan_to_string(&batches); + + println!("=== Cross-split full-text search plan ===\n{plan}\n"); + + // Should have 3 HashJoinExec (one per split), 3 InvertedIndexDataSource, 3 FastFieldDataSource + let inv_count = plan.matches("InvertedIndexDataSource").count(); + let ff_count = plan.matches("FastFieldDataSource").count(); + let join_count = plan.matches("HashJoinExec").count(); + assert_eq!(inv_count, 3, "expected 3 InvertedIndexDataSource, got {inv_count}\n\n{plan}"); + assert_eq!(ff_count, 3, "expected 3 FastFieldDataSource, got {ff_count}\n\n{plan}"); + assert_eq!(join_count, 3, "expected 3 HashJoinExec, got {join_count}\n\n{plan}"); +} + +#[tokio::test] +async fn test_cross_split_full_text_results() { + let (ctx, _) = 
setup(); + + let df = ctx + .sql( + "SELECT id, price FROM ( \ + SELECT s1_f.id, s1_f.price \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id AND s1_f._segment_ord = s1_inv._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + UNION ALL \ + SELECT s2_f.id, s2_f.price \ + FROM s2_inv \ + JOIN s2_f ON s2_f._doc_id = s2_inv._doc_id AND s2_f._segment_ord = s2_inv._segment_ord \ + WHERE full_text(s2_inv.category, 'electronics') \ + UNION ALL \ + SELECT s3_f.id, s3_f.price \ + FROM s3_inv \ + JOIN s3_f ON s3_f._doc_id = s3_inv._doc_id AND s3_f._segment_ord = s3_inv._segment_ord \ + WHERE full_text(s3_inv.category, 'electronics') \ + ) ORDER BY id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + // electronics across all splits: ids {1, 2} (split-1) + {} (split-2) + {6} (split-3) + assert_eq!(batch.num_rows(), 3); + let ids = batch.column(0).as_primitive::(); + let prices = batch.column(1).as_primitive::(); + assert_eq!(ids.value(0), 1); + assert!((prices.value(0) - 1.5).abs() < 1e-10); + assert_eq!(ids.value(1), 2); + assert_eq!(ids.value(2), 6); + assert!((prices.value(2) - 6.5).abs() < 1e-10); +} diff --git a/quickwit/quickwit-datafusion/tests/single_node.rs b/quickwit/quickwit-datafusion/tests/single_node.rs new file mode 100644 index 00000000000..b2b64ee8c42 --- /dev/null +++ b/quickwit/quickwit-datafusion/tests/single_node.rs @@ -0,0 +1,252 @@ +//! Single-node integration tests for quickwit-datafusion. +//! +//! Tests the full flow: mock metastore → QuickwitTableProvider → tantivy-df +//! providers → query execution. 
+ +use std::sync::Arc; + +use arrow::array::{AsArray, RecordBatch}; +use arrow::datatypes::{Float64Type, Int64Type, UInt64Type}; +use datafusion::prelude::*; +use quickwit_metastore::{ListSplitsResponseExt, SplitMetadata}; +use quickwit_proto::metastore::MetastoreError; +use quickwit_proto::metastore::{ + ListSplitsResponse, MetastoreServiceClient, MockMetastoreService, +}; +use quickwit_common::ServiceStream; +use quickwit_proto::types::IndexUid; +use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; +use tantivy::{Index, IndexWriter, TantivyDocument}; +use tantivy_datafusion::{ + full_text_udf, IndexOpener, TantivyInvertedIndexProvider, TantivyTableProvider, +}; + +use quickwit_datafusion::{SplitIndexOpener, SplitRegistry}; + +fn build_test_schema() -> ( + tantivy::schema::Schema, + tantivy::schema::Field, // id + tantivy::schema::Field, // score + tantivy::schema::Field, // price + tantivy::schema::Field, // category +) { + let mut builder = SchemaBuilder::new(); + let id_field = builder.add_u64_field("id", FAST | STORED); + let score_field = builder.add_i64_field("score", FAST); + let price_field = builder.add_f64_field("price", FAST); + let category_field = builder.add_text_field("category", TEXT | FAST | STORED); + let schema = builder.build(); + (schema, id_field, score_field, price_field, category_field) +} + +fn create_index(docs: &[(u64, i64, f64, &str)]) -> Index { + let (schema, id_f, score_f, price_f, cat_f) = build_test_schema(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + for &(id, score, price, category) in docs { + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, id); + doc.add_i64(score_f, score); + doc.add_f64(price_f, price); + doc.add_text(cat_f, category); + writer.add_document(doc).unwrap(); + } + writer.commit().unwrap(); + index +} + +fn collect_batches(batches: &[RecordBatch]) -> RecordBatch { + 
arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap() +} + +/// Create a mock metastore that returns the given split IDs. +fn mock_metastore(index_uid: &IndexUid, split_ids: &[&str]) -> MetastoreServiceClient { + let index_uid = index_uid.clone(); + let split_ids: Vec = split_ids.iter().map(|s| s.to_string()).collect(); + + let mut mock = MockMetastoreService::new(); + mock.expect_list_splits().returning(move |_request| { + let splits: Vec = split_ids + .iter() + .map(|id| quickwit_metastore::Split { + split_state: quickwit_metastore::SplitState::Published, + split_metadata: SplitMetadata { + split_id: id.clone(), + index_uid: index_uid.clone(), + num_docs: 2, + ..Default::default() + }, + update_timestamp: 0, + publish_timestamp: None, + }) + .collect(); + let response = ListSplitsResponse::try_from_splits(splits).unwrap(); + let items: Vec> = vec![Ok(response)]; + Ok(ServiceStream::from(items)) + }); + + MetastoreServiceClient::from_mock(mock) +} + +// ── QuickwitTableProvider with mock metastore ─────────────────────── + +#[tokio::test] +async fn test_metastore_backed_provider() { + let registry = Arc::new(SplitRegistry::new()); + let (tantivy_schema, ..) = build_test_schema(); + + // Create indexes and register in the registry + let idx1 = create_index(&[(1, 10, 1.5, "electronics"), (2, 20, 2.5, "electronics")]); + let idx2 = create_index(&[(3, 30, 3.5, "books"), (4, 40, 4.5, "books")]); + let idx3 = create_index(&[(5, 50, 5.5, "clothing"), (6, 60, 6.5, "clothing")]); + registry.insert("split-1".to_string(), idx1); + registry.insert("split-2".to_string(), idx2); + registry.insert("split-3".to_string(), idx3); + + let index_uid = IndexUid::for_test("test-index", 0); + let metastore = mock_metastore(&index_uid, &["split-1", "split-2", "split-3"]); + + // The opener factory creates a SplitIndexOpener from SplitMetadata. + // It reads the tantivy schema from the registry (in production this + // would come from the doc mapper or the split footer). 
+ let reg = registry.clone(); + let schema = tantivy_schema.clone(); + let opener_factory: quickwit_datafusion::OpenerFactory = Arc::new(move |meta: &SplitMetadata| { + let index = reg.get(&meta.split_id).expect("split not in registry"); + let segment_sizes: Vec = index + .reader() + .map(|r| { + r.searcher() + .segment_readers() + .iter() + .map(|sr| sr.max_doc()) + .collect() + }) + .unwrap_or_default(); + Arc::new(SplitIndexOpener::new( + meta.split_id.clone(), + reg.clone(), + schema.clone(), + segment_sizes, + )) as Arc + }); + + let provider = quickwit_datafusion::QuickwitTableProvider::new( + index_uid, + metastore, + opener_factory, + &tantivy_schema, + ); + + let ctx = SessionContext::new(); + ctx.register_table("splits", Arc::new(provider)).unwrap(); + + // SELECT all + let df = ctx.sql("SELECT id, price FROM splits ORDER BY id").await.unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + assert_eq!(batch.num_rows(), 6); + let ids = batch.column(0).as_primitive::(); + let mut id_values: Vec = (0..6).map(|i| ids.value(i)).collect(); + id_values.sort(); + assert_eq!(id_values, vec![1, 2, 3, 4, 5, 6]); +} + +#[tokio::test] +async fn test_metastore_backed_provider_group_by() { + let registry = Arc::new(SplitRegistry::new()); + let (tantivy_schema, ..) 
= build_test_schema(); + + let idx1 = create_index(&[(1, 10, 1.5, "electronics"), (2, 20, 2.5, "electronics")]); + let idx2 = create_index(&[(3, 30, 3.5, "books"), (4, 40, 4.5, "books")]); + registry.insert("split-1".to_string(), idx1); + registry.insert("split-2".to_string(), idx2); + + let index_uid = IndexUid::for_test("test-index", 0); + let metastore = mock_metastore(&index_uid, &["split-1", "split-2"]); + + let reg = registry.clone(); + let schema = tantivy_schema.clone(); + let opener_factory: quickwit_datafusion::OpenerFactory = Arc::new(move |meta: &SplitMetadata| { + let index = reg.get(&meta.split_id).expect("split not in registry"); + let segment_sizes: Vec = index + .reader() + .map(|r| r.searcher().segment_readers().iter().map(|sr| sr.max_doc()).collect()) + .unwrap_or_default(); + Arc::new(SplitIndexOpener::new( + meta.split_id.clone(), + reg.clone(), + schema.clone(), + segment_sizes, + )) as Arc + }); + + let provider = quickwit_datafusion::QuickwitTableProvider::new( + index_uid, metastore, opener_factory, &tantivy_schema, + ); + + let ctx = SessionContext::new(); + ctx.register_table("splits", Arc::new(provider)).unwrap(); + + let df = ctx + .sql("SELECT category, COUNT(*) as cnt FROM splits GROUP BY category ORDER BY category") + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + assert_eq!(batch.num_rows(), 2); + let cat_col = arrow::compute::cast(batch.column(0), &arrow::datatypes::DataType::Utf8).unwrap(); + let categories = cat_col.as_string::(); + let counts = batch.column(1).as_primitive::(); + assert_eq!(categories.value(0), "books"); + assert_eq!(counts.value(0), 2); + assert_eq!(categories.value(1), "electronics"); + assert_eq!(counts.value(1), 2); +} + +// ── Direct tantivy-df provider tests (no QuickwitTableProvider) ───── + +#[tokio::test] +async fn test_full_text_join_single_split() { + let registry = Arc::new(SplitRegistry::new()); + let idx = create_index(&[ + (1, 10, 1.5, 
"electronics"), + (2, 20, 2.5, "electronics"), + (3, 30, 3.5, "books"), + ]); + let opener = Arc::new(SplitIndexOpener::from_index( + "split-1".to_string(), idx, registry.clone(), + )); + + let ctx = SessionContext::new(); + ctx.register_udf(full_text_udf()); + ctx.register_table( + "f", + Arc::new(TantivyTableProvider::from_opener(opener.clone() as Arc)), + ) + .unwrap(); + ctx.register_table( + "inv", + Arc::new(TantivyInvertedIndexProvider::from_opener(opener as Arc)), + ) + .unwrap(); + + let df = ctx + .sql( + "SELECT f.id \ + FROM inv \ + JOIN f ON f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord \ + WHERE full_text(inv.category, 'electronics') \ + ORDER BY f.id", + ) + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + assert_eq!(batch.num_rows(), 2); + let ids = batch.column(0).as_primitive::(); + assert_eq!(ids.value(0), 1); + assert_eq!(ids.value(1), 2); +} From a7fa135f924f04ac59c6d8896c57e1670a3e422a Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Fri, 13 Feb 2026 18:57:23 -0500 Subject: [PATCH 02/19] integrate quickwit with datafusion via Flight --- quickwit/Cargo.lock | 10 + .../quickwit-config/src/node_config/mod.rs | 6 + quickwit/quickwit-datafusion/Cargo.toml | 8 + quickwit/quickwit-datafusion/src/flight.rs | 234 +++++++++++++ quickwit/quickwit-datafusion/src/lib.rs | 7 + .../src/query_translator.rs | 310 ++++++++++++++++++ quickwit/quickwit-datafusion/src/resolver.rs | 42 +++ quickwit/quickwit-datafusion/src/session.rs | 113 +++++++ quickwit/quickwit-datafusion/src/worker.rs | 25 +- .../tests/distributed_join_plan.rs | 16 +- .../tests/serve_integration.rs | 181 ++++++++++ quickwit/quickwit-serve/Cargo.toml | 4 + quickwit/quickwit-serve/src/datafusion_api.rs | 70 ++++ quickwit/quickwit-serve/src/grpc.rs | 10 +- quickwit/quickwit-serve/src/lib.rs | 30 +- quickwit/quickwit-serve/src/rest.rs | 38 +++ 16 files changed, 1076 insertions(+), 28 deletions(-) create mode 100644 
quickwit/quickwit-datafusion/src/flight.rs create mode 100644 quickwit/quickwit-datafusion/src/query_translator.rs create mode 100644 quickwit/quickwit-datafusion/src/resolver.rs create mode 100644 quickwit/quickwit-datafusion/src/session.rs create mode 100644 quickwit/quickwit-datafusion/tests/serve_integration.rs create mode 100644 quickwit/quickwit-serve/src/datafusion_api.rs diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index f5d809debf8..c6e2ad42ba0 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -8119,6 +8119,7 @@ version = "0.8.0" dependencies = [ "anyhow", "arrow", + "arrow-flight", "async-trait", "dashmap 6.1.0", "datafusion", @@ -8131,12 +8132,17 @@ dependencies = [ "quickwit-common", "quickwit-metastore", "quickwit-proto", + "quickwit-query", + "quickwit-search", "serde", "serde_json", "tantivy", "tantivy-datafusion", "tokio", + "tokio-stream", + "tonic 0.14.2", "tracing", + "url", ] [[package]] @@ -8635,11 +8641,14 @@ name = "quickwit-serve" version = "0.8.0" dependencies = [ "anyhow", + "arrow-flight", "assert-json-diff", "async-trait", "base64 0.22.1", "bytes", "bytesize", + "datafusion", + "datafusion-distributed", "elasticsearch-dsl", "flate2", "futures", @@ -8664,6 +8673,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-control-plane", + "quickwit-datafusion", "quickwit-doc-mapper", "quickwit-index-management", "quickwit-indexing", diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index e8c347eb4a5..ea3d15e1cd2 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -300,6 +300,11 @@ pub struct SearcherConfig { pub storage_timeout_policy: Option, pub warmup_memory_budget: ByteSize, pub warmup_single_split_initial_allocation: ByteSize, + /// Enable the experimental DataFusion SQL endpoint. 
+ /// When enabled, `POST /api/v1/{index_id}/datafusion` accepts SQL + /// queries and returns Arrow IPC record batches. + #[serde(default)] + pub enable_datafusion_endpoint: bool, } #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] @@ -435,6 +440,7 @@ impl Default for SearcherConfig { storage_timeout_policy: None, warmup_memory_budget: ByteSize::gb(100), warmup_single_split_initial_allocation: ByteSize::gb(1), + enable_datafusion_endpoint: false, } } } diff --git a/quickwit/quickwit-datafusion/Cargo.toml b/quickwit/quickwit-datafusion/Cargo.toml index 67b48d7722e..177c738ac01 100644 --- a/quickwit/quickwit-datafusion/Cargo.toml +++ b/quickwit/quickwit-datafusion/Cargo.toml @@ -14,13 +14,19 @@ prost = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tokio = { workspace = true } +tokio-stream = { workspace = true } +tonic = { workspace = true } tracing = { workspace = true } +url = "2" tantivy = { workspace = true } tantivy-datafusion = { path = "/Users/alex.bianchi/oss/tantivy/.worktrees/bianchi/tantivydf/tantivy-datafusion" } +quickwit-common = { workspace = true } quickwit-metastore = { workspace = true } quickwit-proto = { workspace = true } +quickwit-query = { workspace = true } +quickwit-search = { workspace = true } datafusion = "52" datafusion-datasource = "52" @@ -28,6 +34,7 @@ datafusion-physical-plan = "52" datafusion-proto = "52" datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed" } arrow = { version = "57", features = ["prettyprint"] } +arrow-flight = "57" [dev-dependencies] datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed", features = ["integration"] } @@ -35,4 +42,5 @@ dashmap = "6" quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-proto = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } +quickwit-search = { workspace = 
true, features = ["testsuite"] } tokio = { workspace = true, features = ["test-util", "macros"] } diff --git a/quickwit/quickwit-datafusion/src/flight.rs b/quickwit/quickwit-datafusion/src/flight.rs new file mode 100644 index 00000000000..7a1431ec0d4 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/flight.rs @@ -0,0 +1,234 @@ +use std::pin::Pin; +use std::sync::Arc; + +use arrow_flight::flight_service_server::{FlightService, FlightServiceServer}; +use arrow_flight::{ + Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightInfo, + HandshakeRequest, HandshakeResponse, PollInfo, PutResult, SchemaResult, Ticket, +}; +use datafusion::physical_plan::execute_stream; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_distributed::{DistributedExt, DistributedPhysicalOptimizerRule, Worker}; +use futures::stream::BoxStream; +use futures::TryStreamExt; +use tantivy_datafusion::{ + IndexOpener, OpenerFactoryExt, OpenerMetadata, TantivyCodec, full_text_udf, +}; +use tonic::{Request, Response, Status, Streaming}; + +use crate::resolver::QuickwitWorkerResolver; +use crate::split_opener::{SplitIndexOpener, SplitRegistry}; + +/// A Flight service that handles both: +/// - **df-distributed plan fragments** (worker execution) +/// - **SQL queries from external clients** (via `do_get` with SQL string tickets) +/// +/// Dispatch: if the ticket decodes as a df-distributed protobuf, route +/// to the worker. Otherwise treat the ticket bytes as a UTF-8 SQL string. 
+pub struct QuickwitFlightService { + worker: Worker, + registry: Arc, + searcher_pool: quickwit_search::SearcherPool, +} + +impl QuickwitFlightService { + fn build_client_session(&self) -> SessionContext { + let mut config = SessionConfig::new(); + let registry = self.registry.clone(); + config.set_opener_factory(Arc::new(move |meta: OpenerMetadata| { + Arc::new(SplitIndexOpener::new( + meta.identifier, + registry.clone(), + meta.tantivy_schema, + meta.segment_sizes, + )) as Arc + })); + + let worker_resolver = QuickwitWorkerResolver::new(self.searcher_pool.clone()); + + let state = datafusion::execution::SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .with_distributed_worker_resolver(worker_resolver) + .with_distributed_user_codec(TantivyCodec) + .with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule)) + .build(); + + let ctx = SessionContext::new_with_state(state); + ctx.register_udf(full_text_udf()); + ctx + } + + /// Execute a SQL query and return a Flight stream of RecordBatches. + async fn execute_sql( + &self, + sql: &str, + ) -> Result>>, Status> { + let ctx = self.build_client_session(); + + // TODO: register index tables from metastore based on query. + // For now, execute against an empty context (useful for SHOW TABLES, SELECT 1, etc.) 
+ + let df = ctx + .sql(sql) + .await + .map_err(|e| Status::invalid_argument(format!("SQL error: {e}")))?; + let plan = df + .create_physical_plan() + .await + .map_err(|e| Status::internal(format!("plan error: {e}")))?; + let stream = execute_stream(plan.clone(), ctx.task_ctx()) + .map_err(|e| Status::internal(format!("execution error: {e}")))?; + + let schema = plan.schema(); + let flight_stream = arrow_flight::encode::FlightDataEncoderBuilder::new() + .with_schema(schema) + .build(stream.map_err(|e| { + arrow_flight::error::FlightError::ExternalError(Box::new(e)) + })) + .map_err(|e| Status::internal(format!("flight encode error: {e}"))); + + Ok(Response::new(Box::pin(flight_stream))) + } +} + +/// Build the combined Flight service. +/// +/// Handles both df-distributed worker traffic AND external SQL queries +/// on the same gRPC port. +pub fn build_flight_service( + registry: Arc, + searcher_pool: quickwit_search::SearcherPool, +) -> FlightServiceServer { + let reg = registry.clone(); + let worker = Worker::from_session_builder( + move |ctx: datafusion_distributed::WorkerQueryContext| { + let registry = reg.clone(); + Box::pin(async move { + let mut config = SessionConfig::new(); + config.set_opener_factory(Arc::new(move |meta: OpenerMetadata| { + Arc::new(SplitIndexOpener::new( + meta.identifier, + registry.clone(), + meta.tantivy_schema, + meta.segment_sizes, + )) as Arc + })); + + Ok(ctx + .builder + .with_config(config) + .with_distributed_user_codec(TantivyCodec) + .build()) + }) + }, + ); + + FlightServiceServer::new(QuickwitFlightService { + worker, + registry, + searcher_pool, + }) +} + +// ── FlightService impl: dispatch worker vs SQL ────────────────────── + +#[tonic::async_trait] +impl FlightService for QuickwitFlightService { + type HandshakeStream = BoxStream<'static, Result>; + type ListFlightsStream = BoxStream<'static, Result>; + type DoGetStream = BoxStream<'static, Result>; + type DoPutStream = BoxStream<'static, Result>; + type 
DoExchangeStream = BoxStream<'static, Result>; + type DoActionStream = BoxStream<'static, Result>; + type ListActionsStream = BoxStream<'static, Result>; + + async fn do_get( + &self, + request: Request, + ) -> Result, Status> { + let ticket = request.get_ref(); + + // Try to parse as a df-distributed plan fragment. + // df-distributed encodes DoGet as a prost::Message. + // If the ticket is valid protobuf with the right fields, it's a worker request. + // Otherwise, treat as UTF-8 SQL from an external client. + if let Ok(sql) = std::str::from_utf8(&ticket.ticket) { + // Heuristic: df-distributed tickets are protobuf (binary), + // not valid UTF-8 text. If we got valid UTF-8 and it looks + // like SQL (or any human-readable string), handle as SQL. + // Pure protobuf tickets will almost never be valid UTF-8. + if sql.len() > 0 && !sql.starts_with('\0') { + return self.execute_sql(sql).await; + } + } + + // Delegate to df-distributed worker. + self.worker.do_get(request).await + } + + // All other methods delegate to the worker (which returns unimplemented for most). 
+ + async fn handshake( + &self, + request: Request>, + ) -> Result, Status> { + self.worker.handshake(request).await + } + + async fn list_flights( + &self, + request: Request, + ) -> Result, Status> { + self.worker.list_flights(request).await + } + + async fn get_flight_info( + &self, + request: Request, + ) -> Result, Status> { + self.worker.get_flight_info(request).await + } + + async fn get_schema( + &self, + request: Request, + ) -> Result, Status> { + self.worker.get_schema(request).await + } + + async fn do_put( + &self, + request: Request>, + ) -> Result, Status> { + self.worker.do_put(request).await + } + + async fn do_exchange( + &self, + request: Request>, + ) -> Result, Status> { + self.worker.do_exchange(request).await + } + + async fn do_action( + &self, + request: Request, + ) -> Result, Status> { + self.worker.do_action(request).await + } + + async fn list_actions( + &self, + request: Request, + ) -> Result, Status> { + self.worker.list_actions(request).await + } + + async fn poll_flight_info( + &self, + request: Request, + ) -> Result, Status> { + self.worker.poll_flight_info(request).await + } +} diff --git a/quickwit/quickwit-datafusion/src/lib.rs b/quickwit/quickwit-datafusion/src/lib.rs index d8f418c9adc..15e0e77e8d8 100644 --- a/quickwit/quickwit-datafusion/src/lib.rs +++ b/quickwit/quickwit-datafusion/src/lib.rs @@ -1,7 +1,14 @@ +pub mod flight; +pub mod query_translator; +pub mod resolver; +pub mod session; pub mod split_opener; pub mod table_provider; pub mod worker; +pub use flight::{QuickwitFlightService, build_flight_service}; +pub use resolver::QuickwitWorkerResolver; +pub use session::QuickwitSessionBuilder; pub use split_opener::{SplitIndexOpener, SplitRegistry}; pub use table_provider::{OpenerFactory, QuickwitTableProvider}; pub use worker::build_worker_session_builder; diff --git a/quickwit/quickwit-datafusion/src/query_translator.rs b/quickwit/quickwit-datafusion/src/query_translator.rs new file mode 100644 index 
00000000000..f6d5421fec6 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/query_translator.rs @@ -0,0 +1,310 @@ +//! Translates a Quickwit [`SearchRequest`] into a DataFusion [`DataFrame`]. +//! +//! The translation builds a per-split join plan (inv ⋈ f), unions them +//! across splits, then applies aggregation / sort / limit on top. +//! Everything uses the DataFrame API — no SQL strings. + +use std::ops::Bound; +use std::sync::Arc; + +use datafusion::common::Result; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::expr::Sort; +use datafusion::logical_expr::{col, lit, Expr, JoinType, SortExpr}; +use datafusion::prelude::{DataFrame, SessionContext}; +use quickwit_metastore::SplitMetadata; +use quickwit_proto::search::{SearchRequest, SortField, SortOrder}; +use quickwit_query::query_ast::{BoolQuery, FullTextQuery, QueryAst, RangeQuery, TermQuery}; +use quickwit_query::JsonLiteral; +use tantivy_datafusion::{ + IndexOpener, TantivyInvertedIndexProvider, TantivyTableProvider, full_text_udf, +}; + +use crate::table_provider::OpenerFactory; + +// ── QueryAst → Expr ───────────────────────────────────────────────── + +/// Convert a [`QueryAst`] into a DataFusion filter [`Expr`]. +/// +/// Full-text queries become `full_text(field, text)` UDF calls. +/// Term/range/bool queries become standard comparison expressions. +pub fn query_ast_to_expr(ast: &QueryAst) -> Result { + match ast { + QueryAst::MatchAll => Ok(lit(true)), + QueryAst::MatchNone => Ok(lit(false)), + QueryAst::FullText(ft) => Ok(full_text_expr(ft)), + QueryAst::Term(t) => Ok(term_expr(t)), + QueryAst::Range(r) => range_expr(r), + QueryAst::Bool(b) => bool_expr(b), + + QueryAst::TermSet(ts) => { + let mut exprs = Vec::new(); + for (field, values) in &ts.terms_per_field { + let literals: Vec = values.iter().map(|v| lit(v.as_str())).collect(); + exprs.push(col(field.as_str()).in_list(literals, false)); + } + Ok(and_all(exprs)) + } + + QueryAst::Boost { underlying, .. 
} => query_ast_to_expr(underlying), + + QueryAst::Wildcard(w) => { + let pattern = w + .value + .replace('%', "\\%") + .replace('_', "\\_") + .replace('*', "%") + .replace('?', "_"); + Ok(col(w.field.as_str()).like(lit(pattern))) + } + + QueryAst::UserInput(_) => Err(DataFusionError::Plan( + "UserInput queries must be parsed before translation; \ + call query_ast.parse_user_query() first" + .to_string(), + )), + + QueryAst::Regex(r) => Ok(col(r.field.as_str()).like(lit(format!("%{}%", r.regex)))), + + QueryAst::FieldPresence(fp) => Ok(col(fp.field.as_str()).is_not_null()), + + QueryAst::PhrasePrefix(pp) => Ok(Expr::ScalarFunction( + datafusion::logical_expr::expr::ScalarFunction::new_udf( + Arc::new(full_text_udf()), + vec![col(pp.field.as_str()), lit(pp.phrase.as_str())], + ), + )), + + QueryAst::Cache(c) => query_ast_to_expr(&c.inner), + } +} + +fn full_text_expr(ft: &FullTextQuery) -> Expr { + Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf( + Arc::new(full_text_udf()), + vec![col(ft.field.as_str()), lit(ft.text.as_str())], + )) +} + +fn term_expr(t: &TermQuery) -> Expr { + col(t.field.as_str()).eq(lit(t.value.as_str())) +} + +fn range_expr(r: &RangeQuery) -> Result { + let field = col(r.field.as_str()); + let mut exprs = Vec::new(); + match &r.lower_bound { + Bound::Included(v) => exprs.push(field.clone().gt_eq(json_literal_to_expr(v))), + Bound::Excluded(v) => exprs.push(field.clone().gt(json_literal_to_expr(v))), + Bound::Unbounded => {} + } + match &r.upper_bound { + Bound::Included(v) => exprs.push(field.clone().lt_eq(json_literal_to_expr(v))), + Bound::Excluded(v) => exprs.push(field.lt(json_literal_to_expr(v))), + Bound::Unbounded => {} + } + Ok(and_all(exprs)) +} + +fn bool_expr(b: &BoolQuery) -> Result { + let mut parts = Vec::new(); + for ast in &b.must { + parts.push(query_ast_to_expr(ast)?); + } + for ast in &b.filter { + parts.push(query_ast_to_expr(ast)?); + } + for ast in &b.must_not { + 
parts.push(Expr::Not(Box::new(query_ast_to_expr(ast)?))); + } + if !b.should.is_empty() { + let should_exprs: Vec = b + .should + .iter() + .map(query_ast_to_expr) + .collect::>>()?; + parts.push(or_all(should_exprs)); + } + Ok(and_all(parts)) +} + +fn json_literal_to_expr(literal: &JsonLiteral) -> Expr { + match literal { + JsonLiteral::String(s) => lit(s.as_str()), + JsonLiteral::Number(n) => { + if let Some(i) = n.as_i64() { + lit(i) + } else if let Some(u) = n.as_u64() { + lit(u) + } else if let Some(f) = n.as_f64() { + lit(f) + } else { + lit(n.to_string()) + } + } + JsonLiteral::Bool(b) => lit(*b), + } +} + +fn and_all(exprs: Vec) -> Expr { + exprs + .into_iter() + .reduce(|a, b| a.and(b)) + .unwrap_or(lit(true)) +} + +fn or_all(exprs: Vec) -> Expr { + exprs + .into_iter() + .reduce(|a, b| a.or(b)) + .unwrap_or(lit(false)) +} + +fn sort_field_to_sort_expr(sf: &SortField) -> SortExpr { + let asc = sf.sort_order() != SortOrder::Desc; + SortExpr { + expr: col(sf.field_name.as_str()), + asc, + nulls_first: !asc, + } +} + +// ── Per-split join plan builder ───────────────────────────────────── + +/// Build a per-split join plan: `inv ⋈ f` with the query filter +/// applied on the inverted index side. +/// +/// This is a single split's contribution to the overall query. +/// The join is on `(_doc_id, _segment_ord)` — segment-level +/// co-partitioning between the inverted index and fast fields. +fn build_split_plan( + ctx: &SessionContext, + opener: Arc, + query_filter: &Expr, + has_full_text: bool, +) -> Result { + let df_f = ctx.read_table(Arc::new(TantivyTableProvider::from_opener( + opener.clone(), + )))?; + + if !has_full_text { + // No full-text query — just fast field scan with filter. + return df_f.filter(query_filter.clone()); + } + + // Full-text query: build inv ⋈ f join. + let df_inv = ctx.read_table(Arc::new(TantivyInvertedIndexProvider::from_opener( + opener, + )))?; + + // Apply the full-text filter on the inverted index side. 
+ let df_inv = df_inv.filter(query_filter.clone())?; + + // Join on (_doc_id, _segment_ord). + df_inv.join( + df_f, + JoinType::Inner, + &["_doc_id", "_segment_ord"], + &["_doc_id", "_segment_ord"], + None, + ) +} + +/// Check if a QueryAst contains any full-text query nodes. +fn has_full_text_queries(ast: &QueryAst) -> bool { + match ast { + QueryAst::FullText(_) | QueryAst::PhrasePrefix(_) => true, + QueryAst::Bool(b) => { + b.must.iter().any(has_full_text_queries) + || b.should.iter().any(has_full_text_queries) + || b.filter.iter().any(has_full_text_queries) + } + QueryAst::Boost { underlying, .. } => has_full_text_queries(underlying), + QueryAst::Cache(c) => has_full_text_queries(&c.inner), + _ => false, + } +} + +// ── SearchRequest → DataFrame ─────────────────────────────────────── + +/// Translate a [`SearchRequest`] into a DataFusion [`DataFrame`] that +/// spans all provided splits. +/// +/// Builds per-split join plans (inv ⋈ f), unions them, then applies +/// sort / limit / offset on top. Uses the DataFrame API throughout — +/// no SQL strings. +/// +/// For aggregations, use [`translate_aggregation_request`] on the +/// returned DataFrame. +pub fn build_search_plan( + ctx: &SessionContext, + splits: &[SplitMetadata], + opener_factory: &OpenerFactory, + request: &SearchRequest, +) -> Result { + if splits.is_empty() { + return Err(DataFusionError::Plan("no splits to search".to_string())); + } + + // Parse the query AST. + let query_ast: QueryAst = serde_json::from_str(&request.query_ast).map_err(|e| { + DataFusionError::Plan(format!("failed to parse query_ast: {e}")) + })?; + let filter_expr = query_ast_to_expr(&query_ast)?; + let has_ft = has_full_text_queries(&query_ast); + + // Build per-split plans and union them. 
+ let mut per_split_dfs: Vec = Vec::with_capacity(splits.len()); + for split in splits { + let opener = opener_factory(split); + let df = build_split_plan(ctx, opener, &filter_expr, has_ft)?; + per_split_dfs.push(df); + } + + let mut result = per_split_dfs.remove(0); + for df in per_split_dfs { + result = result.union(df)?; + } + + // Drop internal columns (_doc_id, _segment_ord, _score, virtual text cols) + // that the caller doesn't need. Keep only user-facing columns. + // For now, we keep all columns and let the caller select. + + // Apply sort. + if !request.sort_fields.is_empty() { + let sort_exprs: Vec = request + .sort_fields + .iter() + .map(sort_field_to_sort_expr) + .collect(); + result = result.sort(sort_exprs)?; + } + + // Apply limit + offset. + if request.start_offset > 0 || request.max_hits > 0 { + let offset = request.start_offset as usize; + let limit = if request.max_hits > 0 { + Some(request.max_hits as usize) + } else { + None + }; + result = result.limit(offset, limit)?; + } + + Ok(result) +} + +/// Translate the aggregation portion of a [`SearchRequest`]. +/// +/// Takes the unioned DataFrame (from `build_search_plan`) and applies +/// ES-style aggregations using tantivy-df's translator. 
+pub fn build_aggregation_plan( + df: DataFrame, + aggregation_json: &str, +) -> Result> { + let aggs: tantivy::aggregation::agg_req::Aggregations = + serde_json::from_str(aggregation_json).map_err(|e| { + DataFusionError::Plan(format!("failed to parse aggregation request: {e}")) + })?; + tantivy_datafusion::translate_aggregations(df, &aggs) +} diff --git a/quickwit/quickwit-datafusion/src/resolver.rs b/quickwit/quickwit-datafusion/src/resolver.rs new file mode 100644 index 00000000000..7f53351b4f7 --- /dev/null +++ b/quickwit/quickwit-datafusion/src/resolver.rs @@ -0,0 +1,42 @@ +use std::net::SocketAddr; + +use datafusion::error::DataFusionError; +use datafusion_distributed::WorkerResolver; +use quickwit_search::SearcherPool; +use url::Url; + +/// A [`WorkerResolver`] backed by Quickwit's [`SearcherPool`]. +/// +/// The searcher pool is populated from Chitchat cluster membership. +/// Every searcher node runs both the Quickwit gRPC `SearchService` +/// and the Arrow Flight service on the same port. This resolver +/// returns those addresses as Flight URLs for df-distributed. 
+#[derive(Clone)] +pub struct QuickwitWorkerResolver { + searcher_pool: SearcherPool, +} + +impl QuickwitWorkerResolver { + pub fn new(searcher_pool: SearcherPool) -> Self { + Self { searcher_pool } + } +} + +impl WorkerResolver for QuickwitWorkerResolver { + fn get_urls(&self) -> Result, DataFusionError> { + let addrs: Vec = self.searcher_pool.keys(); + if addrs.is_empty() { + return Err(DataFusionError::Execution( + "no searcher nodes available in the cluster".to_string(), + )); + } + addrs + .into_iter() + .map(|addr| { + Url::parse(&format!("http://{addr}")).map_err(|e| { + DataFusionError::Internal(format!("bad worker url: {e}")) + }) + }) + .collect() + } +} diff --git a/quickwit/quickwit-datafusion/src/session.rs b/quickwit/quickwit-datafusion/src/session.rs new file mode 100644 index 00000000000..9b7f0ce0cad --- /dev/null +++ b/quickwit/quickwit-datafusion/src/session.rs @@ -0,0 +1,113 @@ +use std::sync::Arc; + +use datafusion::execution::SessionStateBuilder; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_distributed::{DistributedExt, DistributedPhysicalOptimizerRule}; +use quickwit_metastore::SplitMetadata; +use quickwit_proto::metastore::MetastoreServiceClient; +use quickwit_proto::types::IndexUid; +use quickwit_search::SearcherPool; +use tantivy_datafusion::{IndexOpener, OpenerMetadata, TantivyCodec, full_text_udf}; + +use crate::resolver::QuickwitWorkerResolver; +use crate::split_opener::{SplitIndexOpener, SplitRegistry}; +use crate::table_provider::{OpenerFactory, QuickwitTableProvider}; + +/// Everything needed to build a DataFusion session for a Quickwit node. 
+pub struct QuickwitSessionBuilder { + metastore: MetastoreServiceClient, + searcher_pool: SearcherPool, + registry: Arc, +} + +impl QuickwitSessionBuilder { + pub fn new( + metastore: MetastoreServiceClient, + searcher_pool: SearcherPool, + registry: Arc, + ) -> Self { + Self { + metastore, + searcher_pool, + registry, + } + } + + /// Build a `SessionContext` configured for distributed query execution. + /// + /// The context has: + /// - `TantivyCodec` (stateless) for serializing tantivy-df nodes + /// - `QuickwitWorkerResolver` for discovering searcher nodes + /// - `full_text()` UDF registered + /// + /// The opener factory is NOT set here — it lives on each worker's + /// session config (set in `build_flight_service`). The coordinator + /// doesn't need an opener factory because it never opens indexes. + pub fn build_session(&self) -> SessionContext { + let config = SessionConfig::new(); + let worker_resolver = QuickwitWorkerResolver::new(self.searcher_pool.clone()); + + let state = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .with_distributed_worker_resolver(worker_resolver) + .with_distributed_user_codec(TantivyCodec) + .with_physical_optimizer_rule(Arc::new(DistributedPhysicalOptimizerRule)) + .build(); + + let ctx = SessionContext::new_with_state(state); + ctx.register_udf(full_text_udf()); + ctx + } + + /// Register a Quickwit index as a DataFusion table. 
+ pub fn register_index( + &self, + ctx: &SessionContext, + table_name: &str, + index_uid: IndexUid, + tantivy_schema: &tantivy::schema::Schema, + ) -> datafusion::common::Result<()> { + let opener_factory = self.make_opener_factory(tantivy_schema); + let provider = QuickwitTableProvider::new( + index_uid, + self.metastore.clone(), + opener_factory, + tantivy_schema, + ); + ctx.register_table(table_name, Arc::new(provider))?; + Ok(()) + } + + fn make_opener_factory( + &self, + tantivy_schema: &tantivy::schema::Schema, + ) -> OpenerFactory { + let registry = self.registry.clone(); + let schema = tantivy_schema.clone(); + Arc::new(move |meta: &SplitMetadata| { + let segment_sizes = registry + .get(&meta.split_id) + .map(|entry| { + entry + .reader() + .map(|r| { + r.searcher() + .segment_readers() + .iter() + .map(|sr| sr.max_doc()) + .collect() + }) + .unwrap_or_default() + }) + .unwrap_or_default(); + + Arc::new(SplitIndexOpener::new( + meta.split_id.clone(), + registry.clone(), + schema.clone(), + segment_sizes, + )) as Arc + }) + } +} diff --git a/quickwit/quickwit-datafusion/src/worker.rs b/quickwit/quickwit-datafusion/src/worker.rs index 1dfa3d6c8a6..900d81b32cf 100644 --- a/quickwit/quickwit-datafusion/src/worker.rs +++ b/quickwit/quickwit-datafusion/src/worker.rs @@ -2,17 +2,17 @@ use std::sync::Arc; use datafusion::error::DataFusionError; use datafusion::execution::SessionState; +use datafusion::prelude::SessionConfig; use datafusion_distributed::{DistributedExt, WorkerQueryContext}; -use tantivy_datafusion::{OpenerMetadata, TantivyCodec}; +use tantivy_datafusion::{IndexOpener, OpenerFactoryExt, OpenerMetadata, TantivyCodec}; use crate::split_opener::{SplitIndexOpener, SplitRegistry}; -/// Build a worker session builder that registers tantivy-df's -/// [`TantivyCodec`] with the given [`SplitRegistry`]. +/// Build a worker session builder for test contexts. /// -/// Each worker gets its own registry populated with the splits it -/// should serve. 
The codec's opener factory reconstructs -/// [`SplitIndexOpener`]s from the serialized metadata. +/// Sets the opener factory on the session config and registers the +/// stateless `TantivyCodec`. In production, `build_flight_service` +/// does the same thing. pub fn build_worker_session_builder( registry: Arc, ) -> impl Fn(WorkerQueryContext) -> std::pin::Pin< @@ -24,15 +24,20 @@ pub fn build_worker_session_builder( move |ctx: WorkerQueryContext| { let registry = registry.clone(); Box::pin(async move { - let codec = TantivyCodec::new(move |meta: OpenerMetadata| { + let mut config = SessionConfig::new(); + config.set_opener_factory(Arc::new(move |meta: OpenerMetadata| { Arc::new(SplitIndexOpener::new( meta.identifier, registry.clone(), meta.tantivy_schema, meta.segment_sizes, - )) as Arc - }); - Ok(ctx.builder.with_distributed_user_codec(codec).build()) + )) as Arc + })); + Ok(ctx + .builder + .with_config(config) + .with_distributed_user_codec(TantivyCodec) + .build()) }) } } diff --git a/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs b/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs index 8596f5d9a07..8ae56c776fc 100644 --- a/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs +++ b/quickwit/quickwit-datafusion/tests/distributed_join_plan.rs @@ -13,7 +13,7 @@ use futures::TryStreamExt; use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy_datafusion::{ - full_text_udf, IndexOpener, OpenerMetadata, TantivyCodec, TantivyDocumentProvider, + full_text_udf, IndexOpener, TantivyCodec, TantivyDocumentProvider, TantivyInvertedIndexProvider, TantivyTableProvider, }; @@ -95,16 +95,6 @@ fn setup_registry() -> (Arc, Vec>) { (registry, vec![opener1, opener2, opener3]) } -fn make_codec(registry: Arc) -> TantivyCodec { - TantivyCodec::new(move |meta: OpenerMetadata| { - Arc::new(SplitIndexOpener::new( - meta.identifier, - registry.clone(), - meta.tantivy_schema, - 
meta.segment_sizes, - )) as Arc - }) -} /// Full-text search across 3 splits: inv ⋈ f, distributed. #[tokio::test] @@ -118,7 +108,7 @@ async fn test_distributed_full_text_join_plan() { ) .await; - ctx.set_distributed_user_codec(make_codec(registry.clone())); + ctx.set_distributed_user_codec(TantivyCodec); ctx.register_udf(full_text_udf()); for (i, opener) in openers.iter().enumerate() { @@ -183,7 +173,7 @@ async fn test_distributed_three_way_join_plan() { ) .await; - ctx.set_distributed_user_codec(make_codec(registry.clone())); + ctx.set_distributed_user_codec(TantivyCodec); ctx.register_udf(full_text_udf()); for (i, opener) in openers.iter().enumerate() { diff --git a/quickwit/quickwit-datafusion/tests/serve_integration.rs b/quickwit/quickwit-datafusion/tests/serve_integration.rs new file mode 100644 index 00000000000..25f393589fd --- /dev/null +++ b/quickwit/quickwit-datafusion/tests/serve_integration.rs @@ -0,0 +1,181 @@ +//! Integration test that exercises the real serving path: +//! Flight service on the same gRPC port, SearcherPool for worker +//! discovery, QuickwitSessionBuilder for distributed execution. +//! +//! Each simulated "searcher node" runs a tonic server with the Flight +//! service — the same way quickwit-serve/grpc.rs mounts it. Workers +//! discover each other via the SearcherPool, just like Chitchat would +//! populate it in production. 
+ +use std::net::SocketAddr; +use std::sync::Arc; + +use arrow::array::{AsArray, RecordBatch}; +use arrow::datatypes::UInt64Type; +use datafusion::physical_plan::execute_stream; +use datafusion::prelude::*; +use datafusion_distributed::display_plan_ascii; +use futures::TryStreamExt; +use quickwit_search::SearcherPool; +use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; +use tantivy::{Index, IndexWriter, TantivyDocument}; +use tantivy_datafusion::{ + full_text_udf, IndexOpener, TantivyInvertedIndexProvider, TantivyTableProvider, +}; +use tokio::net::TcpListener; + +use quickwit_datafusion::session::QuickwitSessionBuilder; +use quickwit_datafusion::{SplitIndexOpener, SplitRegistry, build_flight_service}; + +fn create_index(docs: &[(u64, i64, f64, &str)]) -> Index { + let mut builder = SchemaBuilder::new(); + let id_f = builder.add_u64_field("id", FAST | STORED); + let score_f = builder.add_i64_field("score", FAST); + let price_f = builder.add_f64_field("price", FAST); + let cat_f = builder.add_text_field("category", TEXT | FAST | STORED); + let schema = builder.build(); + + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + for &(id, score, price, category) in docs { + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, id); + doc.add_i64(score_f, score); + doc.add_f64(price_f, price); + doc.add_text(cat_f, category); + writer.add_document(doc).unwrap(); + } + writer.commit().unwrap(); + index +} + +fn collect_batches(batches: &[RecordBatch]) -> RecordBatch { + arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap() +} + +/// Start a tonic gRPC server with the Flight service on a random port. +/// This mirrors what `quickwit-serve/src/grpc.rs` does — same port, +/// same tonic server, Flight is just another service alongside search. 
+async fn start_searcher_node(registry: Arc) -> SocketAddr { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener); + + let flight_service = build_flight_service(registry, quickwit_search::SearcherPool::default()); + + tokio::spawn(async move { + tonic::transport::Server::builder() + .add_service(flight_service) + .serve_with_incoming(incoming) + .await + .unwrap(); + }); + + // Let the server start accepting connections. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + addr +} + +/// End-to-end: 3 searcher nodes on real TCP, SearcherPool for discovery, +/// distributed full-text join query across 3 splits. +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn test_distributed_query_via_searcher_pool() { + // ── Shared split registry (in production: each node has its own) ─ + let registry = Arc::new(SplitRegistry::new()); + + let idx1 = create_index(&[ + (1, 10, 1.5, "electronics"), + (2, 20, 2.5, "electronics"), + ]); + let idx2 = create_index(&[(3, 30, 3.5, "books"), (4, 40, 4.5, "books")]); + let idx3 = create_index(&[ + (5, 50, 5.5, "clothing"), + (6, 60, 6.5, "electronics"), + ]); + registry.insert("split-1".to_string(), idx1); + registry.insert("split-2".to_string(), idx2); + registry.insert("split-3".to_string(), idx3); + + // ── Start 3 searcher nodes (gRPC + Flight on same port) ───────── + let node1_addr = start_searcher_node(registry.clone()).await; + let node2_addr = start_searcher_node(registry.clone()).await; + let node3_addr = start_searcher_node(registry.clone()).await; + + // ── Populate SearcherPool (as Chitchat would in production) ────── + let searcher_pool = SearcherPool::default(); + for addr in [node1_addr, node2_addr, node3_addr] { + searcher_pool.insert( + addr, + quickwit_search::SearchServiceClient::from_service( + Arc::new(quickwit_search::MockSearchService::new()), + addr, 
+ ), + ); + } + + // ── Build DataFusion session via QuickwitSessionBuilder ────────── + let mock_metastore = quickwit_proto::metastore::MetastoreServiceClient::from_mock( + quickwit_proto::metastore::MockMetastoreService::new(), + ); + let session_builder = + QuickwitSessionBuilder::new(mock_metastore, searcher_pool, registry.clone()); + let ctx = session_builder.build_session(); + ctx.register_udf(full_text_udf()); + + // ── Register per-split tantivy-df providers ───────────────────── + for (i, split_id) in ["split-1", "split-2", "split-3"].iter().enumerate() { + let index = registry.get(*split_id).unwrap().value().clone(); + let opener = Arc::new(SplitIndexOpener::from_index( + split_id.to_string(), + index, + registry.clone(), + )); + let prefix = format!("s{}", i + 1); + let o: Arc = opener; + ctx.register_table( + &format!("{prefix}_f"), + Arc::new(TantivyTableProvider::from_opener(o.clone())), + ) + .unwrap(); + ctx.register_table( + &format!("{prefix}_inv"), + Arc::new(TantivyInvertedIndexProvider::from_opener(o)), + ) + .unwrap(); + } + + // ── Execute distributed full-text join ─────────────────────────── + let sql = "\ + SELECT id, price FROM ( \ + SELECT s1_f.id, s1_f.price \ + FROM s1_inv \ + JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id AND s1_f._segment_ord = s1_inv._segment_ord \ + WHERE full_text(s1_inv.category, 'electronics') \ + UNION ALL \ + SELECT s2_f.id, s2_f.price \ + FROM s2_inv \ + JOIN s2_f ON s2_f._doc_id = s2_inv._doc_id AND s2_f._segment_ord = s2_inv._segment_ord \ + WHERE full_text(s2_inv.category, 'electronics') \ + UNION ALL \ + SELECT s3_f.id, s3_f.price \ + FROM s3_inv \ + JOIN s3_f ON s3_f._doc_id = s3_inv._doc_id AND s3_f._segment_ord = s3_inv._segment_ord \ + WHERE full_text(s3_inv.category, 'electronics') \ + ) ORDER BY id"; + + let df: DataFrame = ctx.sql(sql).await.unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + let plan_str = display_plan_ascii(plan.as_ref(), false); + println!("=== Distributed plan (3 
searcher nodes, same port) ===\n{plan_str}\n"); + + let stream = execute_stream(plan, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = collect_batches(&batches); + + // electronics: {1, 2} (split-1) + {6} (split-3) = 3 rows + assert_eq!(batch.num_rows(), 3); + let ids = batch.column(0).as_primitive::(); + let mut id_values: Vec = (0..batch.num_rows()).map(|i| ids.value(i)).collect(); + id_values.sort(); + assert_eq!(id_values, vec![1, 2, 6]); +} diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index 363065a3403..384bd66764a 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -61,6 +61,10 @@ quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } quickwit-config = { workspace = true } quickwit-control-plane = { workspace = true } +quickwit-datafusion = { workspace = true } +arrow-flight = "57" +datafusion = "52" +datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed" } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } diff --git a/quickwit/quickwit-serve/src/datafusion_api.rs b/quickwit/quickwit-serve/src/datafusion_api.rs new file mode 100644 index 00000000000..2539815fc0a --- /dev/null +++ b/quickwit/quickwit-serve/src/datafusion_api.rs @@ -0,0 +1,70 @@ +//! REST handler for the experimental DataFusion SQL endpoint. +//! +//! `POST /api/v1/{index_id}/datafusion` accepts a SQL query and +//! returns results as JSON. +//! +//! Gated behind `SearcherConfig::enable_datafusion_endpoint`. + +use std::sync::Arc; + +use serde::Deserialize; +use warp::Filter; +use warp::reject::Rejection; + +use crate::with_arg; +use crate::QuickwitServices; + +#[derive(Debug, Deserialize)] +pub struct DataFusionRequest { + /// SQL query to execute. 
+ pub sql: String, +} + +async fn datafusion_handler( + _index_id: String, + request: DataFusionRequest, + services: Arc, +) -> Result { + let ctx = services.datafusion_session_builder.build_session(); + + // TODO: use index_id to register the right tables from the metastore. + // For now, execute against whatever tables are registered on the session. + + let result: Result = async { + let df = ctx + .sql(&request.sql) + .await + .map_err(|e| e.to_string())?; + let results = df.collect().await.map_err(|e| e.to_string())?; + let formatted = + datafusion::common::arrow::util::pretty::pretty_format_batches(&results) + .map_err(|e| e.to_string())?; + Ok(formatted.to_string()) + } + .await; + + match result { + Ok(table) => Ok(warp::reply::with_status( + warp::reply::json(&serde_json::json!({ + "results": table, + })), + warp::http::StatusCode::OK, + )), + Err(err) => Ok(warp::reply::with_status( + warp::reply::json(&serde_json::json!({ + "error": err, + })), + warp::http::StatusCode::BAD_REQUEST, + )), + } +} + +pub fn datafusion_handler_filter( + services: Arc, +) -> impl Filter + Clone { + warp::path!(String / "datafusion") + .and(warp::post()) + .and(warp::body::json()) + .and(with_arg(services)) + .then(datafusion_handler) +} diff --git a/quickwit/quickwit-serve/src/grpc.rs b/quickwit/quickwit-serve/src/grpc.rs index 698c9e07d71..be6977e5626 100644 --- a/quickwit/quickwit-serve/src/grpc.rs +++ b/quickwit/quickwit-serve/src/grpc.rs @@ -201,6 +201,13 @@ pub(crate) async fn start_grpc_server( None }; + // Mount Arrow Flight service for DataFusion distributed execution. + // Runs on the same port as all other gRPC services. + let flight_service = { + enabled_grpc_services.insert("datafusion-flight"); + Some(services.datafusion_flight_service.clone()) + }; + // Mount gRPC jaeger service if present. 
let jaeger_grpc_service = if let Some(jaeger_service) = services.jaeger_service_opt.clone() { enabled_grpc_services.insert("jaeger"); @@ -248,7 +255,8 @@ pub(crate) async fn start_grpc_server( .add_optional_service(metastore_grpc_service) .add_optional_service(otlp_log_grpc_service) .add_optional_service(otlp_trace_grpc_service) - .add_optional_service(search_grpc_service); + .add_optional_service(search_grpc_service) + .add_optional_service(flight_service); let grpc_listen_addr = tcp_listener.local_addr()?; info!( diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index ca4520ff0ce..89bb076e0ef 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -36,6 +36,7 @@ mod otlp_api; mod rate_modulator; mod rest; mod rest_api_response; +mod datafusion_api; mod search_api; pub(crate) mod simple_list; pub mod tcp_listener; @@ -201,6 +202,13 @@ struct QuickwitServices { /// the root requests. pub search_service: Arc, + /// DataFusion session builder for distributed SQL/Arrow query execution. + pub datafusion_session_builder: quickwit_datafusion::QuickwitSessionBuilder, + + /// Arrow Flight gRPC service for DataFusion distributed execution + /// and external SQL queries. + pub datafusion_flight_service: arrow_flight::flight_service_server::FlightServiceServer, + pub env_filter_reload_fn: EnvFilterReloadFn, /// The control plane listens to various events. @@ -635,7 +643,7 @@ pub async fn serve_quickwit( split_cache_opt, )); - let (search_job_placer, search_service) = setup_searcher( + let (search_job_placer, search_service, searcher_pool) = setup_searcher( &node_config, cluster.change_stream(), // search remains available without a control plane because not all @@ -647,6 +655,18 @@ pub async fn serve_quickwit( .await .context("failed to start searcher service")?; + // Set up DataFusion distributed query execution. + // The Flight service runs on the same gRPC port as all other services. 
+ // Workers discover each other via the same SearcherPool (Chitchat). + let datafusion_registry = std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()); + let datafusion_flight_service = + quickwit_datafusion::build_flight_service(datafusion_registry.clone(), searcher_pool.clone()); + let datafusion_session_builder = quickwit_datafusion::QuickwitSessionBuilder::new( + metastore_through_control_plane.clone(), + searcher_pool, + datafusion_registry, + ); + // The control plane listens for local shards updates to learn about each shard's ingestion // throughput. Ingesters (routers) do so to update their shard table. let local_shards_update_listener_handle_opt = if node_config @@ -738,6 +758,8 @@ pub async fn serve_quickwit( otlp_logs_service_opt, otlp_traces_service_opt, search_service, + datafusion_session_builder, + datafusion_flight_service, env_filter_reload_fn, }); // Setup and start gRPC server. @@ -1014,7 +1036,7 @@ async fn setup_searcher( metastore: MetastoreServiceClient, storage_resolver: StorageResolver, searcher_context: Arc, -) -> anyhow::Result<(SearchJobPlacer, Arc)> { +) -> anyhow::Result<(SearchJobPlacer, Arc, SearcherPool)> { let searcher_pool = SearcherPool::default(); let search_job_placer = SearchJobPlacer::new(searcher_pool.clone()); let search_service = start_searcher_service( @@ -1070,7 +1092,7 @@ async fn setup_searcher( }) }); searcher_pool.listen_for_changes(searcher_change_stream); - Ok((search_job_placer, search_service)) + Ok((search_job_placer, search_service, searcher_pool)) } #[allow(clippy::too_many_arguments)] @@ -1557,7 +1579,7 @@ mod tests { let metastore = metastore_for_test(); let (change_stream, change_stream_tx) = ClusterChangeStream::new_unbounded(); let storage_resolver = StorageResolver::unconfigured(); - let (search_job_placer, _searcher_service) = setup_searcher( + let (search_job_placer, _searcher_service, _searcher_pool) = setup_searcher( &node_config, change_stream, metastore, diff --git 
a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 3f193783b04..35eb9e37d03 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -287,6 +287,28 @@ fn search_routes( .boxed() } +fn datafusion_routes( + quickwit_services: Arc, +) -> impl Filter + Clone { + let enabled = quickwit_services + .node_config + .searcher_config + .enable_datafusion_endpoint; + let filter = crate::datafusion_api::datafusion_handler_filter(quickwit_services); + // When disabled, the filter still exists but always rejects. + warp::any() + .and_then(move || async move { + if enabled { + Ok(()) + } else { + Err(warp::reject::not_found()) + } + }) + .untuple_one() + .and(filter) + .boxed() +} + fn api_v1_routes( quickwit_services: Arc, ) -> impl Filter + Clone { @@ -346,6 +368,8 @@ fn api_v1_routes( .or(index_template_api_handlers( quickwit_services.metastore_client.clone(), )) + .boxed() + .or(datafusion_routes(quickwit_services.clone())) .boxed(), ) } @@ -858,6 +882,20 @@ mod tests { metastore_server_opt: None, node_config: Arc::new(node_config.clone()), search_service: Arc::new(MockSearchService::new()), + datafusion_session_builder: { + let ms = quickwit_proto::metastore::MetastoreServiceClient::from_mock( + quickwit_proto::metastore::MockMetastoreService::new(), + ); + quickwit_datafusion::QuickwitSessionBuilder::new( + ms, + quickwit_search::SearcherPool::default(), + std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()), + ) + }, + datafusion_flight_service: quickwit_datafusion::build_flight_service( + std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()), + quickwit_search::SearcherPool::default(), + ), jaeger_service_opt: None, env_filter_reload_fn: crate::do_nothing_env_filter_reload_fn(), }; From 5dd15c31175512ae0f490318b4f2d986c15bde0f Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 10:32:45 -0500 Subject: [PATCH 03/19] add basic type coercion --- .../src/query_translator.rs | 
132 ++++++++- .../tests/schema_evolution.rs | 272 ++++++++++++++++++ 2 files changed, 389 insertions(+), 15 deletions(-) create mode 100644 quickwit/quickwit-datafusion/tests/schema_evolution.rs diff --git a/quickwit/quickwit-datafusion/src/query_translator.rs b/quickwit/quickwit-datafusion/src/query_translator.rs index f6d5421fec6..c9901e93954 100644 --- a/quickwit/quickwit-datafusion/src/query_translator.rs +++ b/quickwit/quickwit-datafusion/src/query_translator.rs @@ -4,10 +4,12 @@ //! across splits, then applies aggregation / sort / limit on top. //! Everything uses the DataFrame API — no SQL strings. +use std::collections::HashMap; use std::ops::Bound; use std::sync::Arc; -use datafusion::common::Result; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::common::{Result, ScalarValue}; use datafusion::error::DataFusionError; use datafusion::logical_expr::expr::Sort; use datafusion::logical_expr::{col, lit, Expr, JoinType, SortExpr}; @@ -225,22 +227,110 @@ fn has_full_text_queries(ast: &QueryAst) -> bool { } } +// ── Schema alignment ──────────────────────────────────────────────── + +/// Compute the canonical schema from the doc mapper's Arrow schema. +/// +/// This is the union of all fields across all splits. The doc mapper +/// always represents the latest version of the index schema. Older +/// splits may be missing fields — those get NULL-filled during alignment. +/// +/// If `canonical_schema` is `None`, we derive it from the union of +/// all split schemas (for cases where no doc mapper is available). +fn compute_canonical_schema(split_dfs: &[DataFrame]) -> SchemaRef { + let mut fields: Vec = Vec::new(); + let mut seen: HashMap = HashMap::new(); + + for df in split_dfs { + for field in df.schema().fields() { + if let Some(existing_type) = seen.get(field.name()) { + // Field exists — type must match (we error later if not). 
+ let _ = existing_type; + } else { + seen.insert(field.name().clone(), field.data_type().clone()); + fields.push(field.as_ref().clone()); + } + } + } + + Arc::new(Schema::new(fields)) +} + +/// Align a DataFrame to the canonical schema. +/// +/// - Missing columns → added as typed NULL literals +/// - Type mismatches → error (no implicit coercion) +/// - Extra columns → kept (will be dropped by final projection) +/// +/// After alignment, all DataFrames have identical schemas and can +/// be safely unioned. +fn align_to_schema( + df: DataFrame, + canonical: &SchemaRef, + split_id: &str, +) -> Result { + let df_schema = df.schema().clone(); + let df_fields: HashMap<&str, &arrow::datatypes::DataType> = df_schema + .fields() + .iter() + .map(|f| (f.name().as_str(), f.data_type())) + .collect(); + + // Check for type mismatches. + for canon_field in canonical.fields() { + if let Some(&split_type) = df_fields.get(canon_field.name().as_str()) { + if split_type != canon_field.data_type() { + return Err(DataFusionError::Plan(format!( + "type mismatch for field '{}' in split {}: \ + split has {:?}, canonical schema has {:?}. \ + Use CAST() in SQL to convert explicitly.", + canon_field.name(), + split_id, + split_type, + canon_field.data_type(), + ))); + } + } + } + + // Build select list: canonical columns in order, NULLs for missing. + let select_exprs: Vec = canonical + .fields() + .iter() + .map(|canon_field| { + if df_fields.contains_key(canon_field.name().as_str()) { + col(canon_field.name().as_str()) + } else { + // Missing column → typed NULL. + let null_value = ScalarValue::try_from(canon_field.data_type()) + .unwrap_or(ScalarValue::Utf8(None)); + lit(null_value).alias(canon_field.name()) + } + }) + .collect(); + + df.select(select_exprs) +} + // ── SearchRequest → DataFrame ─────────────────────────────────────── /// Translate a [`SearchRequest`] into a DataFusion [`DataFrame`] that /// spans all provided splits. 
/// -/// Builds per-split join plans (inv ⋈ f), unions them, then applies -/// sort / limit / offset on top. Uses the DataFrame API throughout — -/// no SQL strings. +/// Builds per-split join plans (inv ⋈ f), aligns schemas across splits +/// (NULL-fill for missing columns, error on type mismatch), unions them, +/// then applies sort / limit / offset on top. +/// +/// Uses the DataFrame API throughout — no SQL strings. /// -/// For aggregations, use [`translate_aggregation_request`] on the -/// returned DataFrame. +/// `canonical_schema`: if provided, all splits are aligned to this schema. +/// If `None`, the canonical schema is derived from the union of all split schemas. pub fn build_search_plan( ctx: &SessionContext, splits: &[SplitMetadata], opener_factory: &OpenerFactory, request: &SearchRequest, + canonical_schema: Option, ) -> Result { if splits.is_empty() { return Err(DataFusionError::Plan("no splits to search".to_string())); @@ -253,22 +343,34 @@ pub fn build_search_plan( let filter_expr = query_ast_to_expr(&query_ast)?; let has_ft = has_full_text_queries(&query_ast); - // Build per-split plans and union them. - let mut per_split_dfs: Vec = Vec::with_capacity(splits.len()); + // Build per-split plans. + let mut per_split_dfs: Vec<(String, DataFrame)> = Vec::with_capacity(splits.len()); for split in splits { let opener = opener_factory(split); let df = build_split_plan(ctx, opener, &filter_expr, has_ft)?; - per_split_dfs.push(df); + per_split_dfs.push((split.split_id.clone(), df)); } - let mut result = per_split_dfs.remove(0); - for df in per_split_dfs { - result = result.union(df)?; + // Compute or use provided canonical schema. + let canonical = canonical_schema.unwrap_or_else(|| { + let dfs: Vec<&DataFrame> = per_split_dfs.iter().map(|(_, df)| df).collect(); + compute_canonical_schema( + &dfs.into_iter().cloned().collect::>(), + ) + }); + + // Align each split's DataFrame to the canonical schema. 
+ let mut aligned: Vec = Vec::with_capacity(per_split_dfs.len()); + for (split_id, df) in per_split_dfs { + let aligned_df = align_to_schema(df, &canonical, &split_id)?; + aligned.push(aligned_df); } - // Drop internal columns (_doc_id, _segment_ord, _score, virtual text cols) - // that the caller doesn't need. Keep only user-facing columns. - // For now, we keep all columns and let the caller select. + // UNION ALL across splits. + let mut result = aligned.remove(0); + for df in aligned { + result = result.union(df)?; + } // Apply sort. if !request.sort_fields.is_empty() { diff --git a/quickwit/quickwit-datafusion/tests/schema_evolution.rs b/quickwit/quickwit-datafusion/tests/schema_evolution.rs new file mode 100644 index 00000000000..e3813e06e67 --- /dev/null +++ b/quickwit/quickwit-datafusion/tests/schema_evolution.rs @@ -0,0 +1,272 @@ +//! Tests for schema evolution across splits: missing columns get +//! NULL-filled, type mismatches error with a message telling the +//! user to CAST explicitly. 
+ +use std::sync::Arc; + +use arrow::array::{Array, AsArray, RecordBatch}; +use arrow::datatypes::{Float64Type, UInt64Type}; +use datafusion::prelude::*; +use quickwit_metastore::SplitMetadata; +use quickwit_proto::search::SearchRequest; +use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; +use tantivy::{Index, IndexWriter, TantivyDocument}; +use tantivy_datafusion::{IndexOpener, full_text_udf}; + +use quickwit_datafusion::query_translator::build_search_plan; +use quickwit_datafusion::{SplitIndexOpener, SplitRegistry}; + +fn collect_batches(batches: &[RecordBatch]) -> RecordBatch { + arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap() +} + +/// Split-1: old schema (id, price) +fn create_old_index() -> Index { + let mut builder = SchemaBuilder::new(); + let id_f = builder.add_u64_field("id", FAST | STORED); + let price_f = builder.add_f64_field("price", FAST); + let schema = builder.build(); + + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, 1); + doc.add_f64(price_f, 1.5); + writer.add_document(doc).unwrap(); + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, 2); + doc.add_f64(price_f, 2.5); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + index +} + +/// Split-2: new schema (id, price, rating) — has an extra field +fn create_new_index() -> Index { + let mut builder = SchemaBuilder::new(); + let id_f = builder.add_u64_field("id", FAST | STORED); + let price_f = builder.add_f64_field("price", FAST); + let rating_f = builder.add_f64_field("rating", FAST); + let schema = builder.build(); + + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, 3); + doc.add_f64(price_f, 3.5); + doc.add_f64(rating_f, 4.8); + 
writer.add_document(doc).unwrap(); + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, 4); + doc.add_f64(price_f, 4.5); + doc.add_f64(rating_f, 3.2); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + index +} + +fn make_split_meta(split_id: &str) -> SplitMetadata { + SplitMetadata { + split_id: split_id.to_string(), + ..Default::default() + } +} + +fn match_all_request() -> SearchRequest { + SearchRequest { + query_ast: serde_json::to_string(&serde_json::json!({"type": "match_all"})).unwrap(), + ..Default::default() + } +} + +/// Missing column gets NULL-filled: old split lacks "rating", +/// new split has it. After union, old rows have rating=NULL. +#[tokio::test] +async fn test_missing_column_null_filled() { + let registry = Arc::new(SplitRegistry::new()); + + let old_idx = create_old_index(); + let new_idx = create_new_index(); + registry.insert("old-split".to_string(), old_idx); + registry.insert("new-split".to_string(), new_idx); + + let splits = vec![make_split_meta("old-split"), make_split_meta("new-split")]; + + let new_tantivy_schema = { + let idx = registry.get("new-split").unwrap(); + idx.schema() + }; + let reg = registry.clone(); + // Each opener uses the ACTUAL tantivy schema of its split, not the canonical one. + let opener_factory: quickwit_datafusion::OpenerFactory = + Arc::new(move |meta: &SplitMetadata| { + let index = reg.get(&meta.split_id).unwrap().value().clone(); + let tantivy_schema = index.schema(); + let segment_sizes: Vec = index + .reader() + .map(|r| { + r.searcher() + .segment_readers() + .iter() + .map(|sr| sr.max_doc()) + .collect() + }) + .unwrap_or_default(); + Arc::new(SplitIndexOpener::new( + meta.split_id.clone(), + reg.clone(), + tantivy_schema, + segment_sizes, + )) as Arc + }); + + let ctx = SessionContext::new(); + ctx.register_udf(full_text_udf()); + + // Use the new schema as canonical (has the "rating" field). 
+ let canonical = + Some(tantivy_datafusion::tantivy_schema_to_arrow(&new_tantivy_schema)); + + let request = match_all_request(); + let df = build_search_plan(&ctx, &splits, &opener_factory, &request, canonical).unwrap(); + + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + // 4 rows total (2 from each split) + assert_eq!(batch.num_rows(), 4); + + // Check that "rating" column exists + let rating_idx = batch.schema().index_of("rating").unwrap(); + let rating_col = batch.column(rating_idx).as_primitive::(); + + // Old split rows should have NULL ratings, new split rows have values. + // Rows are not guaranteed to be in order, so collect and check. + let mut null_count = 0; + let mut value_count = 0; + for i in 0..batch.num_rows() { + if rating_col.is_null(i) { + null_count += 1; + } else { + value_count += 1; + } + } + assert_eq!(null_count, 2, "old split rows should have NULL rating"); + assert_eq!(value_count, 2, "new split rows should have real rating"); +} + +/// Type mismatch errors with a clear message. 
+#[tokio::test] +async fn test_type_mismatch_errors() { + let registry = Arc::new(SplitRegistry::new()); + + // Split-1: id as u64 + let idx1 = { + let mut builder = SchemaBuilder::new(); + let id_f = builder.add_u64_field("id", FAST | STORED); + let schema = builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); + let mut doc = TantivyDocument::default(); + doc.add_u64(id_f, 1); + writer.add_document(doc).unwrap(); + writer.commit().unwrap(); + index + }; + registry.insert("split-1".to_string(), idx1); + + let splits = vec![make_split_meta("split-1")]; + + let reg = registry.clone(); + let opener_factory: quickwit_datafusion::OpenerFactory = + Arc::new(move |meta: &SplitMetadata| { + let index = reg.get(&meta.split_id).unwrap().value().clone(); + let tantivy_schema = index.schema(); + let segment_sizes: Vec = index + .reader() + .map(|r| { + r.searcher() + .segment_readers() + .iter() + .map(|sr| sr.max_doc()) + .collect() + }) + .unwrap_or_default(); + Arc::new(SplitIndexOpener::new( + meta.split_id.clone(), + reg.clone(), + tantivy_schema, + segment_sizes, + )) as Arc + }); + + let ctx = SessionContext::new(); + + // Create a canonical schema where "id" is Int64 instead of UInt64. + // This simulates a type mismatch from schema evolution. 
+ let mismatched_canonical = Arc::new(arrow::datatypes::Schema::new(vec![ + arrow::datatypes::Field::new("_doc_id", arrow::datatypes::DataType::UInt32, false), + arrow::datatypes::Field::new("_segment_ord", arrow::datatypes::DataType::UInt32, false), + arrow::datatypes::Field::new("id", arrow::datatypes::DataType::Int64, true), // mismatch: UInt64 vs Int64 + ])); + + let request = match_all_request(); + let result = build_search_plan( + &ctx, + &splits, + &opener_factory, + &request, + Some(mismatched_canonical), + ); + + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("type mismatch") && err.contains("id") && err.contains("CAST"), + "error should mention type mismatch, field name, and CAST. Got: {err}" + ); +} + +/// SQL CAST workaround: user can cast columns explicitly after the plan. +#[tokio::test] +async fn test_sql_cast_workaround() { + let registry = Arc::new(SplitRegistry::new()); + let new_idx = create_new_index(); + registry.insert("split-1".to_string(), new_idx); + + let ctx = SessionContext::new(); + + // Register the split as a table directly so we can run SQL on it. + let index = registry.get("split-1").unwrap().value().clone(); + let opener = Arc::new(SplitIndexOpener::from_index( + "split-1".to_string(), + index, + registry.clone(), + )); + ctx.register_table( + "my_table", + Arc::new(tantivy_datafusion::TantivyTableProvider::from_opener( + opener as Arc, + )), + ) + .unwrap(); + + // User explicitly casts rating from f64 to integer — demonstrating + // explicit type coercion via SQL CAST. 
+ let df = ctx + .sql("SELECT id, CAST(rating AS BIGINT) as rating_int FROM my_table ORDER BY id") + .await + .unwrap(); + let batches = df.collect().await.unwrap(); + let batch = collect_batches(&batches); + + assert_eq!(batch.num_rows(), 2); + let ids = batch.column(0).as_primitive::(); + assert_eq!(ids.value(0), 3); + assert_eq!(ids.value(1), 4); + + // The cast truncated: 4.8 → 4, 3.2 → 3 + let ratings = batch.column(1).as_primitive::(); + assert_eq!(ratings.value(0), 4); + assert_eq!(ratings.value(1), 3); +} From d3ac8c02997a8ce65d329701912ee38d935eafdd Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 13:31:27 -0500 Subject: [PATCH 04/19] upgrade prost --- quickwit/Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index c6e2ad42ba0..863b43f4efd 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -10900,7 +10900,7 @@ dependencies = [ "datafusion-physical-plan", "datafusion-proto", "futures", - "prost 0.13.5", + "prost 0.14.1", "serde", "serde_json", "tantivy", From bfc6492d4434f46a403f185e0a1cf5753edb2daf Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 13:44:34 -0500 Subject: [PATCH 05/19] fix: serialize pushed filters across workers, add StorageSplitOpener --- quickwit/Cargo.lock | 1 + quickwit/quickwit-datafusion/Cargo.toml | 1 + quickwit/quickwit-datafusion/src/lib.rs | 2 +- .../quickwit-datafusion/src/split_opener.rs | 106 ++++++++++++++++-- quickwit/quickwit-search/src/leaf.rs | 2 +- quickwit/quickwit-search/src/lib.rs | 2 +- 6 files changed, 101 insertions(+), 13 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 863b43f4efd..42de418fda4 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -8134,6 +8134,7 @@ dependencies = [ "quickwit-proto", "quickwit-query", "quickwit-search", + "quickwit-storage", "serde", "serde_json", "tantivy", diff --git a/quickwit/quickwit-datafusion/Cargo.toml 
b/quickwit/quickwit-datafusion/Cargo.toml index 177c738ac01..88bce83b96f 100644 --- a/quickwit/quickwit-datafusion/Cargo.toml +++ b/quickwit/quickwit-datafusion/Cargo.toml @@ -27,6 +27,7 @@ quickwit-metastore = { workspace = true } quickwit-proto = { workspace = true } quickwit-query = { workspace = true } quickwit-search = { workspace = true } +quickwit-storage = { workspace = true } datafusion = "52" datafusion-datasource = "52" diff --git a/quickwit/quickwit-datafusion/src/lib.rs b/quickwit/quickwit-datafusion/src/lib.rs index 15e0e77e8d8..c7bd4990c65 100644 --- a/quickwit/quickwit-datafusion/src/lib.rs +++ b/quickwit/quickwit-datafusion/src/lib.rs @@ -9,6 +9,6 @@ pub mod worker; pub use flight::{QuickwitFlightService, build_flight_service}; pub use resolver::QuickwitWorkerResolver; pub use session::QuickwitSessionBuilder; -pub use split_opener::{SplitIndexOpener, SplitRegistry}; +pub use split_opener::{SplitIndexOpener, SplitRegistry, StorageSplitOpener}; pub use table_provider::{OpenerFactory, QuickwitTableProvider}; pub use worker::build_worker_session_builder; diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs index 98aba9cce14..8a79dbd0cef 100644 --- a/quickwit/quickwit-datafusion/src/split_opener.rs +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -6,21 +6,21 @@ use async_trait::async_trait; use dashmap::DashMap; use datafusion::common::Result; use datafusion::error::DataFusionError; +use quickwit_proto::search::SplitIdAndFooterOffsets; +use quickwit_search::SearcherContext; +use quickwit_storage::Storage; use tantivy::Index; use tantivy_datafusion::IndexOpener; /// Registry of opened tantivy indexes, keyed by split ID. -/// -/// For integration tests this is populated before query execution. -/// In production this would be replaced by `open_index_with_caches()`. +/// Used for integration tests. Production uses [`StorageSplitOpener`]. 
pub type SplitRegistry = DashMap; +// ── Test opener (DashMap-backed) ──────────────────────────────────── + /// An [`IndexOpener`] backed by an in-memory [`SplitRegistry`]. /// -/// Planning-time metadata (schema, segment sizes) is stored inline so -/// that the opener can answer schema/partition queries without touching -/// the registry. The actual [`open`](IndexOpener::open) call looks up -/// the registry at execution time. +/// For integration tests only. Production uses [`StorageSplitOpener`]. #[derive(Clone)] pub struct SplitIndexOpener { split_id: String, @@ -44,8 +44,6 @@ impl SplitIndexOpener { } } - /// Build an opener by extracting schema and segment sizes from an - /// already-opened index, then inserting it into the registry. pub fn from_index(split_id: String, index: Index, registry: Arc) -> Self { let tantivy_schema = index.schema(); let segment_sizes = index @@ -76,7 +74,6 @@ impl fmt::Debug for SplitIndexOpener { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("SplitIndexOpener") .field("split_id", &self.split_id) - .field("segment_sizes", &self.segment_sizes) .finish() } } @@ -111,3 +108,92 @@ impl IndexOpener for SplitIndexOpener { self } } + +// ── Production opener (storage-backed) ────────────────────────────── + +/// An [`IndexOpener`] that downloads and opens splits from object +/// storage using Quickwit's caching infrastructure. +/// +/// At execution time (on the worker), calls `open_index_with_caches()` +/// to download the split bundle from S3/GCS/local storage, warm the +/// footer cache + fast field cache, and return an opened tantivy `Index`. +/// +/// Planning-time metadata (schema, segment sizes) is stored inline — +/// no I/O during plan construction. 
+#[derive(Clone)] +pub struct StorageSplitOpener { + split_id: String, + tantivy_schema: tantivy::schema::Schema, + segment_sizes: Vec, + searcher_context: Arc, + storage: Arc, + footer_offsets: SplitIdAndFooterOffsets, +} + +impl StorageSplitOpener { + pub fn new( + split_id: String, + tantivy_schema: tantivy::schema::Schema, + segment_sizes: Vec, + searcher_context: Arc, + storage: Arc, + split_footer_start: u64, + split_footer_end: u64, + ) -> Self { + let footer_offsets = SplitIdAndFooterOffsets { + split_id: split_id.clone(), + split_footer_start, + split_footer_end, + ..Default::default() + }; + Self { + split_id, + tantivy_schema, + segment_sizes, + searcher_context, + storage, + footer_offsets, + } + } +} + +impl fmt::Debug for StorageSplitOpener { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StorageSplitOpener") + .field("split_id", &self.split_id) + .finish() + } +} + +#[async_trait] +impl IndexOpener for StorageSplitOpener { + async fn open(&self) -> Result { + let (index, _hot_directory) = quickwit_search::leaf::open_index_with_caches( + &self.searcher_context, + self.storage.clone(), + &self.footer_offsets, + None, // tokenizer_manager — TODO: pass from doc mapper + None, // ephemeral cache — TODO: pass from searcher context + ) + .await + .map_err(|e| DataFusionError::Execution(format!("open split {}: {e}", self.split_id)))?; + + Ok(index) + } + + fn schema(&self) -> tantivy::schema::Schema { + self.tantivy_schema.clone() + } + + fn segment_sizes(&self) -> Vec { + self.segment_sizes.clone() + } + + fn identifier(&self) -> &str { + &self.split_id + } + + fn as_any(&self) -> &dyn Any { + self + } +} diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 4caf2587909..4ba3488ffda 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -145,7 +145,7 @@ fn configure_storage_retries( /// - A fast fields cache given by 
`SearcherContext.storage_long_term_cache`. /// - An ephemeral unbounded cache directory (whose lifetime is tied to the returned `Index` if no /// `ByteRangeCache` is provided). -pub(crate) async fn open_index_with_caches( +pub async fn open_index_with_caches( searcher_context: &SearcherContext, index_storage: Arc, split_and_footer_offsets: &SplitIdAndFooterOffsets, diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index 008556d595f..213c1a6a90e 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -23,7 +23,7 @@ mod collector; mod error; mod fetch_docs; mod find_trace_ids_collector; -mod leaf; +pub mod leaf; mod leaf_cache; mod list_fields; mod list_fields_cache; From f6662ddbbe14130320d340441480853488473096 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 14:00:25 -0500 Subject: [PATCH 06/19] feat: add QuickwitSchemaProvider catalog for lazy index resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a DataFusion SchemaProvider that lazily resolves Quickwit index names from the metastore. When DataFusion encounters an unknown table name like "my_index", the catalog: 1. Fetches IndexMetadata from the metastore 2. Builds a DocMapper to get the tantivy schema 3. Resolves the storage URI 4. Creates a QuickwitTableProvider with StorageSplitOpener factory 5. Returns the provider — no manual register_table() needed The catalog is registered on the SessionContext when storage_resolver and searcher_context are provided (production path). Tests that don't have storage continue to register tables manually. Also adds quickwit-config and quickwit-doc-mapper dependencies, and makes quickwit-search::leaf module public for open_index_with_caches. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- metastore-distributed-df-design.md | 477 +++++++++++++++ quickwit/Cargo.lock | 2 + quickwit/quickwit-datafusion/Cargo.toml | 2 + quickwit/quickwit-datafusion/src/catalog.rs | 149 +++++ quickwit/quickwit-datafusion/src/lib.rs | 2 + quickwit/quickwit-datafusion/src/session.rs | 52 +- tantivy-index-opener.md | 411 +++++++++++++ testing-infrastructure.md | 616 ++++++++++++++++++++ 8 files changed, 1701 insertions(+), 10 deletions(-) create mode 100644 metastore-distributed-df-design.md create mode 100644 quickwit/quickwit-datafusion/src/catalog.rs create mode 100644 tantivy-index-opener.md create mode 100644 testing-infrastructure.md diff --git a/metastore-distributed-df-design.md b/metastore-distributed-df-design.md new file mode 100644 index 00000000000..83214e0d960 --- /dev/null +++ b/metastore-distributed-df-design.md @@ -0,0 +1,477 @@ +# Distributed DataFusion Execution with Quickwit Metastore + +## Scope and Constraints + +- **No affinity-aware scheduling** (coming later, we own df-distributed) +- **Full-query retry** on failure, not per-split +- **Output is RecordBatches**, not ES-compat JSON (that's the goal) +- **Dynamic fields**: start with string/list-of-string, handle later +- **Thinking distributed from the ground up** +- **We own tantivy-df** — we can change it + +--- + +## The Key Insight: Lazy Index Opening + +TantivyDF providers currently take an opened `tantivy::Index`. But we own tantivy-df. 
Change them to take a lazy opener: + +```rust +/// In tantivy-datafusion crate (open source, generic) +#[async_trait] +pub trait IndexOpener: Send + Sync + fmt::Debug { + async fn open(&self) -> Result; + + /// Serializable description for distributed execution + fn to_proto(&self) -> Vec; + fn from_proto(bytes: &[u8]) -> Result>; +} +``` + +**For tests / local usage (unchanged ergonomics):** +```rust +let index = Index::open_in_dir("my_index")?; +let provider = TantivyTableProvider::new(DirectIndexOpener::new(index)); +``` + +**For distributed quickwit:** +```rust +let opener = SplitIndexOpener { + split_id: "abc".into(), + footer_offsets: 700..800, + index_uri: "s3://bucket/index".into(), + searcher_context: searcher_ctx.clone(), + storage_resolver: storage_resolver.clone(), + doc_mapper_str: doc_mapper_json.clone(), +}; +let provider = TantivyTableProvider::new(Arc::new(opener)); +``` + +### What This Gives Us + +- **One plan, everywhere.** The coordinator builds the plan, optimizer rules fire, the same plan is serialized to workers and executed. No wrapper nodes, no re-planning, no codec magic. +- **Optimizer rules fire once, on the coordinator**, against native tantivy-df nodes — TopKPushdown, FastFieldFilterPushdown, AggPushdown, OrdinalGroupByOptimization all work as-is. +- **Workers just deserialize and execute.** `try_decode` reconstructs `SplitIndexOpener` from proto bytes (no I/O). `DataSource::open()` calls `opener.open().await` at stream poll time. + +### Why It Works + +The four tantivy-df optimizer rules (`FastFieldFilterPushdown`, `TopKPushdown`, `AggPushdown`, `OrdinalGroupByOptimization`) operate on **plan structure and metadata**, not on opened indexes. They check "is this a HashJoinExec with InvertedIndexDataSource?" and rearrange nodes. The index is only needed when `execute()` is called. 
+ +So: coordinator builds the plan → optimizer rules fire → plan is serialized (the `IndexOpener` serializes to its proto bytes) → worker deserializes → `execute()` calls `opener.open()` → tantivy-df reads fast fields/inverted index/stored fields. + +--- + +## What Exists Today + +### TantivyDF Three-Provider Architecture + +```sql +SELECT f.ts, f.level, d._document, inv._score +FROM inv +JOIN f ON f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord +JOIN d ON d._doc_id = f._doc_id AND d._segment_ord = f._segment_ord +WHERE full_text(inv.body, 'error') AND f.status >= 500 +ORDER BY inv._score DESC LIMIT 10 +``` + +| Provider | Reads | Key columns | +|----------|-------|-------------| +| `TantivyTableProvider` | Fast fields → Arrow | `_doc_id`, `_segment_ord`, + fast field cols | +| `TantivyInvertedIndexProvider` | Inverted index → doc IDs + BM25 | `_doc_id`, `_segment_ord`, `_score` | +| `TantivyDocumentProvider` | Stored fields → JSON | `_doc_id`, `_segment_ord`, `_document` | + +Co-partitioned via `Hash([_doc_id, _segment_ord], num_segments)` → shuffle-free joins. + +Optimizer rules: +- **FastFieldFilterPushdown** — merges fast field predicates into inverted index query +- **TopKPushdown** — pushes `ORDER BY _score DESC LIMIT K` into Block-WAND +- **AggPushdown** — replaces DF AggregateExec with native tantivy aggregation +- **OrdinalGroupByOptimization** — dense ordinal array for dictionary GROUP BY + +Dynamic filtering: hash join build side pushes `DynamicFilterPhysicalExpr` into probe side. `DocumentDataSource` only reads stored fields for doc IDs surviving filter + TopK. + +### Schema Chain + +``` +Metastore.list_indexes_metadata(index_id) + → IndexMetadata.index_config.doc_mapping + → build_doc_mapper(&doc_mapping, &search_settings) + → DocMapper.schema() ← tantivy::Schema (full type info) + → tantivy_schema_to_arrow() ← Arrow SchemaRef (fast fields, 8 types) +``` + +Complete. All field types known from the metastore without opening any splits. 
+ +--- + +## Why Two Metastore Calls + +Both on the coordinator. Both before the physical plan is built. + +**Call 1: `list_indexes_metadata(index_id_patterns)`** — in `SchemaProvider::table()` [async] + +Returns the index schema. Needed to: +- Build the `DocMapper` → tantivy `Schema` → Arrow schema +- Know the timestamp field (for extracting time-range filters from the query) +- Know the tag fields (for extracting tag filters) +- Get the index URI (where splits live in storage) + +**Call 2: `list_splits(index_uid, time_range, tags)`** — in `TableProvider::scan()` [async] + +Returns which splits match the query. Depends on call 1: +- Needs `IndexUid` to scope the query +- Needs `DocMapper` to extract tag filters from the query AST +- Needs timestamp field name to extract time-range bounds + +**Structural, not accidental.** DataFusion has exactly two async hooks before the physical plan: `SchemaProvider::table()` and `TableProvider::scan()`. The dependency chain (schema → parse query → extract filters → prune splits) maps to (call 1 → call 2). 
+ +--- + +## Full Architecture + +``` + ┌─────────────────────┐ + │ Query │ + │ (SQL or SearchReq) │ + └──────────┬──────────┘ + │ + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ COORDINATOR │ +│ │ +│ SchemaProvider::table("logs") [async] │ +│ │ │ +│ ├── Metastore: list_indexes_metadata("logs") │ +│ │ → IndexMetadata → DocMapper → Arrow schema │ +│ │ │ +│ └── Returns QuickwitTableProvider │ +│ │ +│ QuickwitTableProvider::scan(projection, filters, limit) [async] │ +│ │ │ +│ ├── Extract time/tag filters from DF Exprs │ +│ ├── Metastore: list_splits(index_uid, time, tags) │ +│ │ → [split_abc, split_def, split_ghi] │ +│ │ │ +│ ├── For each split, create a SplitIndexOpener │ +│ │ (carries: split_id, footer_offsets, index_uri, doc_mapper) │ +│ │ │ +│ └── Build tantivy-df plan with openers: │ +│ Register per-split providers (with lazy openers): │ +│ TantivyTableProvider::new(opener_abc) │ +│ TantivyInvertedIndexProvider::new(opener_abc) │ +│ TantivyDocumentProvider::new(opener_abc) │ +│ Build SQL/DataFrame with joins │ +│ → Returns native tantivy-df exec nodes │ +│ │ +│ DF physical optimizer (coordinator-side): │ +│ ALL tantivy-df rules fire here on native nodes: │ +│ ✓ FastFieldFilterPushdown │ +│ ✓ TopKPushdown (Block-WAND) │ +│ ✓ AggPushdown │ +│ ✓ OrdinalGroupByOptimization │ +│ │ +│ df-distributed optimizer: │ +│ Sees N partitions → N tasks on workers │ +│ Inserts NetworkCoalesceExec │ +│ │ +│ Final coordinator plan: │ +│ │ +│ SortPreservingMergeExec(ts DESC, fetch=10) │ +│ └── NetworkCoalesceExec │ +│ ├── task 0 ────────────────────────────────────────── │ +│ │ HashJoinExec(INNER) │ +│ │ build: InvertedIndexDataSource(opener=abc) │ +│ │ query: "error" AND status>=500 │ +│ │ topk: 10, Block-WAND │ +│ │ probe: FastFieldDataSource(opener=abc) │ +│ │ pushed_filters: [DynamicFilter] │ +│ │ │ +│ ├── task 1: (same, opener=def) ─────────────────── │ +│ └── task 2: (same, opener=ghi) ─────────────────── │ +│ │ +│ Serialize via 
PhysicalExtensionCodec: │ +│ tantivy-df nodes serialize cleanly because IndexOpener │ +│ is just metadata (split_id, footer_offsets, index_uri) │ +│ │ +└──────────────────┬─────────────────┬─────────────────┬──────────────┘ + │ │ │ + ▼ ▼ ▼ +┌──────────────────────┐ ┌──────────────────────┐ ┌──────────────────────┐ +│ WORKER A │ │ WORKER B │ │ WORKER C │ +│ │ │ │ │ │ +│ try_decode: │ │ try_decode: │ │ try_decode: │ +│ pure deser, │ │ pure deser, │ │ pure deser, │ +│ reconstructs │ │ reconstructs │ │ reconstructs │ +│ SplitIndexOpener │ │ SplitIndexOpener │ │ SplitIndexOpener │ +│ from proto bytes │ │ from proto bytes │ │ from proto bytes │ +│ │ │ │ │ │ +│ execute(): │ │ execute(): │ │ execute(): │ +│ │ │ │ │ │ +│ InvertedIndex │ │ InvertedIndex │ │ InvertedIndex │ +│ DataSource │ │ DataSource │ │ DataSource │ +│ .open() │ │ .open() │ │ .open() │ +│ → open_index_ │ │ → open_index_ │ │ → open_index_ │ +│ with_caches() │ │ with_caches() │ │ with_caches() │ +│ → warmup() │ │ → warmup() │ │ → warmup() │ +│ → read inverted │ │ → read inverted │ │ → read inverted │ +│ index │ │ index │ │ index │ +│ │ │ │ │ │ +│ FastField │ │ FastField │ │ FastField │ +│ DataSource │ │ DataSource │ │ DataSource │ +│ .open() │ │ .open() │ │ .open() │ +│ → (same index, │ │ → (same index, │ │ → (same index, │ +│ already open) │ │ already open) │ │ already open) │ +│ → read fast │ │ → read fast │ │ → read fast │ +│ fields → Arrow │ │ fields → Arrow │ │ fields → Arrow │ +│ │ │ │ │ │ +│ HashJoinExec │ │ HashJoinExec │ │ HashJoinExec │ +│ DynamicFilter ──► │ │ DynamicFilter ──► │ │ DynamicFilter ──► │ +│ probe only reads │ │ probe only reads │ │ probe only reads │ +│ matching docs │ │ matching docs │ │ matching docs │ +│ │ │ │ │ │ +│ ▼ │ │ ▼ │ │ ▼ │ +│ RecordBatches │ │ RecordBatches │ │ RecordBatches │ +│ → Flight stream │ │ → Flight stream │ │ → Flight stream │ +└──────────┬───────────┘ └──────────┬───────────┘ └──────────┬───────────┘ + │ │ │ + └────────────┬───────────┘ │ + 
└──────────┬─────────────────────────┘ + ▼ + ┌──────────────────┐ + │ COORDINATOR │ + │ │ + │ SortPreserving │ + │ MergeExec │ + │ (ts DESC, k=10) │ + │ │ + │ ▼ │ + │ RecordBatches │ + └──────────────────┘ +``` + +### What Changes in tantivy-df + +```rust +// Before: +pub struct TantivyTableProvider { + index: Index, // ← opened, not serializable + ... +} + +// After: +pub struct TantivyTableProvider { + opener: Arc, // ← lazy, serializable + ... +} + +// IndexOpener trait (in tantivy-datafusion crate): +#[async_trait] +pub trait IndexOpener: Send + Sync + fmt::Debug { + /// Open the tantivy index. Called lazily on first execute(). + /// May be called multiple times (providers share an opener that caches). + async fn open(&self) -> Result; +} + +// For tests (backward compat): +pub struct DirectIndexOpener { index: Index } +impl IndexOpener for DirectIndexOpener { + async fn open(&self) -> Result { Ok(self.index.clone()) } +} + +// For quickwit distributed (in quickwit-datafusion crate): +pub struct SplitIndexOpener { + split_id: String, + footer_offsets: Range, + index_uri: String, + doc_mapper_str: String, + searcher_context: Arc, + storage_resolver: StorageResolver, + opened: OnceCell, // cache the opened index +} +impl IndexOpener for SplitIndexOpener { + async fn open(&self) -> Result { + self.opened.get_or_try_init(|| async { + let storage = self.storage_resolver.resolve(&self.index_uri).await?; + let offsets = SplitIdAndFooterOffsets { split_id: self.split_id.clone(), ... }; + let (index, _) = open_index_with_caches(&self.searcher_context, storage, &offsets, ...).await?; + // warmup happens here too + Ok(index) + }).await.cloned() + } +} +``` + +### Where open() Gets Called: DataSource::open() at Stream Poll Time + +The tantivy-df `DataSource::open()` method is **sync** but wraps its work in `stream::once(async move { ... })`. The actual I/O happens lazily when DataFusion polls the stream. 
This is where `opener.open().await` goes: + +```rust +// FastFieldDataSource today: +fn open(&self, partition: usize, _ctx: Arc) -> Result { + let index = self.index.clone(); // ← already opened + let stream = stream::once(async move { + generate_and_filter_batch(&index, ...) // ← reads fast fields + }); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) +} + +// FastFieldDataSource with IndexOpener: +fn open(&self, partition: usize, _ctx: Arc) -> Result { + let opener = self.opener.clone(); // ← not yet opened + let stream = stream::once(async move { + let index = opener.open().await?; // ← opens here, async, at poll time + generate_and_filter_batch(&index, ...) // ← reads fast fields + }); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) +} +``` + +Same pattern for `InvertedIndexDataSource` and `DocumentDataSource` — all three already use `stream::once(async move { ... })`, so the opener call slots in naturally. + +### Full Execution Chain on the Worker + +``` +Arrow Flight receives serialized plan bytes + │ + ▼ +PhysicalExtensionCodec::try_decode() + │ Pure deserialization. Reconstructs SplitIndexOpener from proto bytes. + │ No I/O. No index opening. + │ + ▼ +DataSourceExec { data_source: FastFieldDataSource { opener: SplitIndexOpener { split_id: "abc", ... } } } +DataSourceExec { data_source: InvertedIndexDataSource { opener: SplitIndexOpener { split_id: "abc", ... 
} } } + │ (same Arc shared by all three providers for same split) + │ + ▼ +DataFusion calls DataSourceExec::execute(partition=0) + │ + ▼ +FastFieldDataSource::open(partition=0) [sync — returns a lazy stream] + │ + ▼ +stream::once(async move { [async — runs when DF polls the stream] + │ + ├── opener.open().await ← SplitIndexOpener::open() + │ ├── storage_resolver.resolve(index_uri) async: resolve S3/RAM storage + │ ├── open_index_with_caches(ctx, storage, split_offsets, tokenizer_mgr) + │ │ async: fetch footer, open BundleStorage → HotDirectory → tantivy::Index + │ ├── warmup(searcher, warmup_info) async: prefetch byte ranges + │ └── return Index (cached in OnceCell for other providers) + │ + ├── generate_and_filter_batch(&index, ...) ← reads fast fields → Arrow RecordBatch + │ + └── return RecordBatch +}) +``` + +The `OnceCell` in `SplitIndexOpener` ensures the index is opened exactly once even though `FastFieldDataSource`, `InvertedIndexDataSource`, and `DocumentDataSource` all call `opener.open().await`. Whichever provider polls first triggers the actual opening; the others get the cached result. +``` + +Multiple providers sharing the same `Arc` open the index once (via `OnceCell`). The index is opened on first `execute()` call, not during planning. 
+ +### PhysicalExtensionCodec + +```rust +// In tantivy-datafusion: provide a trait for serializing IndexOpener +pub trait IndexOpenerCodec: Send + Sync { + fn encode(&self, opener: &dyn IndexOpener) -> Result>; + fn decode(&self, bytes: &[u8]) -> Result>; +} + +// In quickwit-datafusion: implement it +impl IndexOpenerCodec for QuickwitOpenerCodec { + fn encode(&self, opener: &dyn IndexOpener) -> Result> { + let split_opener = opener.downcast_ref::()?; + // serialize split_id, footer_offsets, index_uri, doc_mapper_str + } + fn decode(&self, bytes: &[u8]) -> Result> { + // deserialize → SplitIndexOpener (no I/O, just struct construction) + } +} +``` + +### Properties of This Design + +| Concern | How it's handled | +|---------|-----------------| +| **Planning** | One phase, on coordinator. No worker re-planning. | +| **Optimizer rules** | Fire once on coordinator against native tantivy-df nodes. | +| **Custom exec nodes** | None. Plan contains only stock DF + tantivy-df `DataSourceExec` nodes. | +| **Serialization** | `IndexOpener` serializes to proto bytes (split_id, footer_offsets, index_uri). Pure data. | +| **try_decode** | Reconstructs `SplitIndexOpener` struct from proto. No I/O. | +| **Index opening** | In `DataSource::open()` → `stream::once(async { opener.open().await })`. On worker, at poll time. | +| **Plan identity** | Same plan on coordinator, on the wire, and on worker. | + +--- + +## Concrete Code: What Gets Built + +### Change to tantivy-df (small) + +1. Add `IndexOpener` trait +2. Change providers to take `Arc` instead of `Index` +3. Add `DirectIndexOpener` for backward compat +4. Add `IndexOpenerCodec` trait for serialization +5. In `DataSource::open()` / `execute()`: call `opener.open().await` before reading + +### New in quickwit-datafusion crate + +1. `SplitIndexOpener` — implements `IndexOpener`, calls `open_index_with_caches()` +2. `QuickwitOpenerCodec` — implements `IndexOpenerCodec`, serializes split metadata +3. 
`QuickwitSchemaProvider` — implements `SchemaProvider::table()`, metastore call 1 +4. `QuickwitTableProvider` — implements `TableProvider::scan()`, metastore call 2, creates providers with `SplitIndexOpener` +5. `QuickwitPhysicalExtensionCodec` — wraps tantivy-df codec + `QuickwitOpenerCodec` + +### Test (same for single-node and distributed — same plan) + +```rust +#[tokio::test] +async fn test_df_distributed_plan() { + let sandbox = TestSandbox::create("test-df", DOC_MAPPING, "{}", &["body"]).await.unwrap(); + sandbox.add_documents(vec![ + json!({"body": "hello world", "ts": "2024-01-01T00:00:00Z", "level": "INFO"}), + json!({"body": "error occurred", "ts": "2024-01-02T00:00:00Z", "level": "ERROR"}), + ]).await.unwrap(); + + // Build session with quickwit catalog (metastore-backed) + let ctx = SessionContext::new(); + ctx.register_udf(full_text_udf()); + // ... register tantivy-df optimizer rules + + let schema_provider = QuickwitSchemaProvider::new( + sandbox.metastore(), + sandbox.storage_resolver(), + searcher_context.clone(), + ); + ctx.register_catalog("quickwit", Arc::new(QuickwitCatalog::new(schema_provider))); + + // This query triggers: + // 1. SchemaProvider::table() → metastore call 1 → schema + // 2. TableProvider::scan() → metastore call 2 → splits → SplitIndexOpeners + // 3. Optimizer rules fire on native tantivy-df nodes + // 4. execute() → opener.open() → open_index_with_caches → read + let batches = ctx.sql(" + SELECT f.ts, f.level + FROM quickwit.default.logs_inv inv + JOIN quickwit.default.logs f + ON f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord + WHERE full_text(inv.body, 'error') + ").await.unwrap().collect().await.unwrap(); + + assert_eq!(batches.iter().map(|b| b.num_rows()).sum::(), 1); +} +``` + +For distributed: add `df-distributed` extensions to the SessionContext. The plan is the same — just gets serialized and sent to workers. 
+ +--- + +## What We Don't Build + +- ES-compat aggregation JSON format (output is RecordBatches) +- Scroll/cursor pagination +- Snippet UDF +- Affinity-aware worker scheduling (later, in df-distributed) +- Per-split retry (later, in worker provider) +- Dynamic field type inference (use string for now) +- SearchRequest → DF plan translation (separate step) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 42de418fda4..daa94f123e7 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -8130,6 +8130,8 @@ dependencies = [ "futures", "prost 0.14.1", "quickwit-common", + "quickwit-config", + "quickwit-doc-mapper", "quickwit-metastore", "quickwit-proto", "quickwit-query", diff --git a/quickwit/quickwit-datafusion/Cargo.toml b/quickwit/quickwit-datafusion/Cargo.toml index 88bce83b96f..7312cffe4d3 100644 --- a/quickwit/quickwit-datafusion/Cargo.toml +++ b/quickwit/quickwit-datafusion/Cargo.toml @@ -23,6 +23,8 @@ tantivy = { workspace = true } tantivy-datafusion = { path = "/Users/alex.bianchi/oss/tantivy/.worktrees/bianchi/tantivydf/tantivy-datafusion" } quickwit-common = { workspace = true } +quickwit-config = { workspace = true } +quickwit-doc-mapper = { workspace = true } quickwit-metastore = { workspace = true } quickwit-proto = { workspace = true } quickwit-query = { workspace = true } diff --git a/quickwit/quickwit-datafusion/src/catalog.rs b/quickwit/quickwit-datafusion/src/catalog.rs new file mode 100644 index 00000000000..6a29a0df08c --- /dev/null +++ b/quickwit/quickwit-datafusion/src/catalog.rs @@ -0,0 +1,149 @@ +//! A DataFusion [`SchemaProvider`] that lazily resolves Quickwit +//! indexes from the metastore. +//! +//! When DataFusion encounters a table name it doesn't know, the +//! catalog calls the metastore to look up the index, builds a +//! [`QuickwitTableProvider`], and returns it. No manual +//! `ctx.register_table()` needed. 
+ +use std::any::Any; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::SchemaProvider; +use datafusion::common::DataFusionError; +use datafusion::datasource::TableProvider; +use quickwit_config::build_doc_mapper; +use quickwit_metastore::{IndexMetadata, IndexMetadataResponseExt}; +use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, MetastoreServiceClient, +}; +use quickwit_storage::StorageResolver; +use tantivy_datafusion::IndexOpener; +use tokio::sync::Mutex; + +use crate::split_opener::StorageSplitOpener; +use crate::table_provider::{OpenerFactory, QuickwitTableProvider}; + +/// A [`SchemaProvider`] backed by the Quickwit metastore. +/// +/// When DataFusion queries for a table by name, this provider treats +/// the name as a Quickwit index ID, fetches its metadata from the +/// metastore, and returns a [`QuickwitTableProvider`] that discovers +/// splits at scan time. +pub struct QuickwitSchemaProvider { + metastore: Mutex, + storage_resolver: StorageResolver, + searcher_context: Arc, +} + +impl QuickwitSchemaProvider { + pub fn new( + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, + searcher_context: Arc, + ) -> Self { + Self { + metastore: Mutex::new(metastore), + storage_resolver, + searcher_context, + } + } + + async fn resolve_index( + &self, + index_id: &str, + ) -> Result>, DataFusionError> { + let metastore = self.metastore.lock().await; + + // Fetch index metadata. 
+ let request = IndexMetadataRequest::for_index_id(index_id.to_string()); + let index_metadata: IndexMetadata = match metastore + .clone() + .index_metadata(request) + .await + { + Ok(response) => response + .deserialize_index_metadata() + .map_err(|e| DataFusionError::External(Box::new(e)))?, + Err(_) => return Ok(None), // index doesn't exist + }; + + let index_uid = index_metadata.index_uid.clone(); + let index_config = &index_metadata.index_config; + + // Build the doc mapper to get the tantivy schema and tokenizer manager. + let doc_mapper = build_doc_mapper( + &index_config.doc_mapping, + &index_config.search_settings, + ) + .map_err(|e| DataFusionError::Internal(format!("build doc mapper: {e}")))?; + + let tantivy_schema = doc_mapper.schema(); + let tokenizer_manager = doc_mapper.tokenizer_manager().clone(); + let storage_resolver = self.storage_resolver.clone(); + let searcher_context = self.searcher_context.clone(); + + // Resolve the index storage URI. + let storage = storage_resolver + .resolve(&index_config.index_uri) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + // Build the opener factory that creates StorageSplitOpeners. 
+ let schema_for_factory = tantivy_schema.clone(); + let opener_factory: OpenerFactory = Arc::new(move |split_meta| { + Arc::new(StorageSplitOpener::new( + split_meta.split_id.clone(), + schema_for_factory.clone(), + vec![], // segment sizes not known until open — from_opener handles this + searcher_context.clone(), + storage.clone(), + split_meta.footer_offsets.start, + split_meta.footer_offsets.end, + )) as Arc + }); + + let provider = QuickwitTableProvider::new( + index_uid, + metastore.clone(), + opener_factory, + &tantivy_schema, + ); + + Ok(Some(Arc::new(provider))) + } +} + +impl std::fmt::Debug for QuickwitSchemaProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QuickwitSchemaProvider").finish() + } +} + +#[async_trait] +impl SchemaProvider for QuickwitSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + /// We can't enumerate all indexes cheaply. Return empty. + /// Users must know their index names. + fn table_names(&self) -> Vec { + Vec::new() + } + + /// Lazily resolve a Quickwit index by name. + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { + self.resolve_index(name).await + } + + fn table_exist(&self, _name: &str) -> bool { + // We can't check synchronously. Return true and let table() + // return None if it doesn't exist. 
+ true + } +} diff --git a/quickwit/quickwit-datafusion/src/lib.rs b/quickwit/quickwit-datafusion/src/lib.rs index c7bd4990c65..d6e907b7cb9 100644 --- a/quickwit/quickwit-datafusion/src/lib.rs +++ b/quickwit/quickwit-datafusion/src/lib.rs @@ -1,3 +1,4 @@ +pub mod catalog; pub mod flight; pub mod query_translator; pub mod resolver; @@ -6,6 +7,7 @@ pub mod split_opener; pub mod table_provider; pub mod worker; +pub use catalog::QuickwitSchemaProvider; pub use flight::{QuickwitFlightService, build_flight_service}; pub use resolver::QuickwitWorkerResolver; pub use session::QuickwitSessionBuilder; diff --git a/quickwit/quickwit-datafusion/src/session.rs b/quickwit/quickwit-datafusion/src/session.rs index 9b7f0ce0cad..d8306459448 100644 --- a/quickwit/quickwit-datafusion/src/session.rs +++ b/quickwit/quickwit-datafusion/src/session.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use datafusion::catalog::{CatalogProvider, MemoryCatalogProvider}; use datafusion::execution::SessionStateBuilder; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_distributed::{DistributedExt, DistributedPhysicalOptimizerRule}; @@ -7,8 +8,10 @@ use quickwit_metastore::SplitMetadata; use quickwit_proto::metastore::MetastoreServiceClient; use quickwit_proto::types::IndexUid; use quickwit_search::SearcherPool; -use tantivy_datafusion::{IndexOpener, OpenerMetadata, TantivyCodec, full_text_udf}; +use quickwit_storage::StorageResolver; +use tantivy_datafusion::{IndexOpener, TantivyCodec, full_text_udf}; +use crate::catalog::QuickwitSchemaProvider; use crate::resolver::QuickwitWorkerResolver; use crate::split_opener::{SplitIndexOpener, SplitRegistry}; use crate::table_provider::{OpenerFactory, QuickwitTableProvider}; @@ -18,6 +21,8 @@ pub struct QuickwitSessionBuilder { metastore: MetastoreServiceClient, searcher_pool: SearcherPool, registry: Arc, + storage_resolver: Option, + searcher_context: Option>, } impl QuickwitSessionBuilder { @@ -30,19 +35,28 @@ impl QuickwitSessionBuilder { 
metastore, searcher_pool, registry, + storage_resolver: None, + searcher_context: None, } } + /// Set the storage resolver for production split opening. + /// When set, the catalog will create `StorageSplitOpener`s. + pub fn with_storage( + mut self, + storage_resolver: StorageResolver, + searcher_context: Arc, + ) -> Self { + self.storage_resolver = Some(storage_resolver); + self.searcher_context = Some(searcher_context); + self + } + /// Build a `SessionContext` configured for distributed query execution. /// - /// The context has: - /// - `TantivyCodec` (stateless) for serializing tantivy-df nodes - /// - `QuickwitWorkerResolver` for discovering searcher nodes - /// - `full_text()` UDF registered - /// - /// The opener factory is NOT set here — it lives on each worker's - /// session config (set in `build_flight_service`). The coordinator - /// doesn't need an opener factory because it never opens indexes. + /// If `storage_resolver` and `searcher_context` are set, registers a + /// [`QuickwitSchemaProvider`] that lazily resolves index names from + /// the metastore. Otherwise (tests), tables must be registered manually. pub fn build_session(&self) -> SessionContext { let config = SessionConfig::new(); let worker_resolver = QuickwitWorkerResolver::new(self.searcher_pool.clone()); @@ -57,10 +71,28 @@ impl QuickwitSessionBuilder { let ctx = SessionContext::new_with_state(state); ctx.register_udf(full_text_udf()); + + // If storage is available, register the Quickwit catalog so + // that index names resolve automatically from the metastore. 
+ if let (Some(storage_resolver), Some(searcher_context)) = + (&self.storage_resolver, &self.searcher_context) + { + let schema_provider = Arc::new(QuickwitSchemaProvider::new( + self.metastore.clone(), + storage_resolver.clone(), + searcher_context.clone(), + )); + let catalog = Arc::new(MemoryCatalogProvider::new()); + catalog + .register_schema("public", schema_provider) + .expect("register quickwit schema"); + ctx.register_catalog("quickwit", catalog); + } + ctx } - /// Register a Quickwit index as a DataFusion table. + /// Register a Quickwit index as a DataFusion table (manual path). pub fn register_index( &self, ctx: &SessionContext, diff --git a/tantivy-index-opener.md b/tantivy-index-opener.md new file mode 100644 index 00000000000..c7b24c21cea --- /dev/null +++ b/tantivy-index-opener.md @@ -0,0 +1,411 @@ +# Tantivy-DataFusion: IndexOpener Implementation Plan + +## Goal + +Make tantivy-datafusion providers work with lazy index opening so the same plan can be built on a coordinator (without opening indexes) and executed on workers (where indexes get opened at stream poll time). + +## Why + +Today all three providers (`TantivyTableProvider`, `TantivyInvertedIndexProvider`, `TantivyDocumentProvider`) take an already-opened `tantivy::Index`. This means: +- The plan can't be serialized across the network (Index holds file handles) +- The coordinator must open every split to build the plan +- Workers can't open splits from their local storage/cache + +With `IndexOpener`, the coordinator builds the plan using only metadata (schema, segment count). Workers call `opener.open()` at execution time to get the actual `Index`. 
+ +## Current Index Usage + +The `Index` is used in two phases: + +### Planning time (in `scan()`) + +| Provider | What it reads from Index | Why | +|----------|------------------------|-----| +| `TantivyTableProvider` | `index.reader().searcher().segment_readers().len()` | Partition count | +| | `searcher.segment_reader(i).max_doc()` | Doc count per segment for chunking | +| | `index.schema()` (via `tantivy_schema_to_arrow_from_index`) | Arrow schema | +| `TantivyInvertedIndexProvider` | `index.schema()` | Field resolution for query parsing | +| | `QueryParser::for_index(&index, fields)` | Parse full_text queries | +| | `index.reader().searcher().segment_readers().len()` | Partition count | +| `TantivyDocumentProvider` | `index.reader().searcher().segment_readers().len()` | Partition count | + +### Execution time (in `DataSource::open()`) + +All three clone the `Index` into their `DataSource` struct, then in `open()` call `index.reader()?.searcher()` to read data. + +## The Change + +### New trait: `IndexOpener` + +```rust +// In tantivy-datafusion/src/index_opener.rs + +/// Provides a tantivy Index on demand. Used to defer index opening +/// from planning time to execution time, enabling distributed execution +/// where the coordinator builds plans without opening indexes and +/// workers open them from local storage. +#[async_trait] +pub trait IndexOpener: Send + Sync + fmt::Debug { + /// Open (or return cached) the tantivy Index. + async fn open(&self) -> Result; + + /// Return the tantivy schema without opening the full index. + /// Used during planning for Arrow schema derivation and query parsing. + fn schema(&self) -> tantivy::schema::Schema; + + /// Return the number of segments and max_doc per segment. + /// Used during planning to determine partition count. + /// Returns empty vec if unknown (single-partition fallback). 
+ fn segment_sizes(&self) -> Vec; +} +``` + +**Why `schema()` and `segment_sizes()` are separate from `open()`:** Planning needs schema and segment info synchronously (in `TableProvider::scan()`). We don't want to force an async index open just to count segments. For the distributed case, this info comes from metadata (the coordinator knows the schema from the metastore, and segment count can be derived from split metadata or defaulted to 1 segment per split). + +### `DirectIndexOpener` — backward compat for tests and local usage + +```rust +// In tantivy-datafusion/src/index_opener.rs + +/// Opens an already-opened Index. Used for tests and local (non-distributed) usage. +/// This is the zero-cost path — schema() and segment_sizes() read from the Index directly. +#[derive(Debug, Clone)] +pub struct DirectIndexOpener { + index: Index, +} + +impl DirectIndexOpener { + pub fn new(index: Index) -> Self { + Self { index } + } +} + +#[async_trait] +impl IndexOpener for DirectIndexOpener { + async fn open(&self) -> Result { + Ok(self.index.clone()) + } + + fn schema(&self) -> tantivy::schema::Schema { + self.index.schema() + } + + fn segment_sizes(&self) -> Vec { + match self.index.reader() { + Ok(reader) => { + let searcher = reader.searcher(); + (0..searcher.segment_readers().len()) + .map(|i| searcher.segment_reader(i as u32).max_doc()) + .collect() + } + Err(_) => vec![], + } + } +} +``` + +### Convenience constructor on providers + +To keep the existing API ergonomic for tests: + +```rust +impl TantivyTableProvider { + /// Create from an already-opened Index (backward compat). + pub fn new(index: Index) -> Self { + Self::from_opener(Arc::new(DirectIndexOpener::new(index))) + } + + /// Create from an IndexOpener (for distributed execution). 
+ pub fn from_opener(opener: Arc) -> Self { + let tantivy_schema = opener.schema(); + let arrow_schema = tantivy_schema_to_arrow(&tantivy_schema); + // Note: uses tantivy_schema_to_arrow (schema-only), not + // tantivy_schema_to_arrow_from_index (needs opened index for cardinality). + // Multi-valued detection deferred to execution time. + Self { + opener, + arrow_schema, + query: None, + aggregations: None, + } + } +} +``` + +Same pattern for `TantivyInvertedIndexProvider::new(index)` / `from_opener(opener)` and `TantivyDocumentProvider::new(index)` / `from_opener(opener)`. + +## Files to Change + +### 1. New file: `src/index_opener.rs` + +Contains: +- `IndexOpener` trait +- `DirectIndexOpener` struct + impl + +~60 lines. + +### 2. `src/table_provider.rs` + +**Struct change:** +```rust +// Before: +pub struct TantivyTableProvider { + index: Index, + arrow_schema: SchemaRef, + query: Option>, + aggregations: Option>, +} + +// After: +pub struct TantivyTableProvider { + opener: Arc, + arrow_schema: SchemaRef, + query: Option>, + aggregations: Option>, +} +``` + +**`scan()` change:** Replace `self.index.reader()?.searcher().segment_readers().len()` with `self.opener.segment_sizes()`: + +```rust +// Before: +let reader = self.index.reader()?; +let searcher = reader.searcher(); +let num_segments = searcher.segment_readers().len(); +// ... uses searcher.segment_reader(i).max_doc() for chunking + +// After: +let segment_sizes = self.opener.segment_sizes(); +let num_segments = segment_sizes.len().max(1); +// ... uses segment_sizes[i] for chunking +``` + +**DataSource change:** Replace `index: Index` with `opener: Arc`: + +```rust +// Before: +struct FastFieldDataSource { + index: Index, + ... +} + +// After: +struct FastFieldDataSource { + opener: Arc, + ... 
+} +``` + +**`open()` change:** +```rust +// Before: +fn open(&self, partition: usize, _ctx: Arc) -> Result { + let index = self.index.clone(); + let stream = stream::once(async move { + generate_and_filter_batch(&index, ...) + }); + ... +} + +// After: +fn open(&self, partition: usize, _ctx: Arc) -> Result { + let opener = self.opener.clone(); + let stream = stream::once(async move { + let index = opener.open().await?; + generate_and_filter_batch(&index, ...) + }); + ... +} +``` + +### 3. `src/inverted_index_provider.rs` + +Same pattern as table_provider: +- Struct: `index: Index` → `opener: Arc` +- Constructor: `new(index)` stays (wraps in DirectIndexOpener), add `from_opener(opener)` +- `scan()`: Use `opener.schema()` for field resolution and query parsing, `opener.segment_sizes()` for partition count +- `InvertedIndexDataSource`: `index: Index` → `opener: Arc` +- `open()`: `opener.open().await?` inside the `stream::once(async { ... })` + +### 4. `src/document_provider.rs` + +Same pattern: +- Struct: `index: Index` → `opener: Arc` +- Constructor: same +- `scan()`: `opener.segment_sizes()` for partition count +- `DocumentDataSource`: `index: Index` → `opener: Arc` +- `open()`: `opener.open().await?` inside `stream::once(async { ... })` + +### 5. `src/agg_exec.rs` + +```rust +// Before: +pub(crate) struct TantivyAggregateExec { + index: Index, + ... +} + +// After: +pub(crate) struct TantivyAggregateExec { + opener: Arc, + ... +} +``` + +`execute()`: `opener.open().await?` inside the stream. + +### 6. `src/agg_pushdown.rs` + +The `AggPushdown` optimizer rule creates `TantivyAggregateExec`. It currently extracts the `Index` from `FastFieldDataSource`. Change to extract and pass the `opener` instead. + +### 7. `src/filter_pushdown.rs` + +The `FastFieldFilterPushdown` rule reads `index.schema()` from `InvertedIndexDataSource` to convert physical expressions to tantivy queries. Change to read `opener.schema()` instead. + +### 8. 
`src/catalog.rs` + +`TantivySchema::table()` currently opens the index and calls `TantivyTableProvider::new(index)`. Change to create a `DirectIndexOpener` and call `from_opener()`. (Behavior unchanged, just flows through the new path.) + +### 9. `src/schema_mapping.rs` + +`tantivy_schema_to_arrow_from_index()` needs an opened `Index` to detect multi-valued fields. This function is called in `TantivyTableProvider::new()`. With `from_opener()`, we only have the schema (not an opened index), so we fall back to `tantivy_schema_to_arrow()` (all scalar types). Multi-valued detection happens at execution time when the index is actually opened. + +This is acceptable because: +- Multi-valued fields are uncommon in the quickwit use case +- The schema mismatch (scalar vs List) would cause a runtime error, not silent wrong results +- A future improvement can add `field_cardinalities()` to `IndexOpener` if needed + +### 10. `src/lib.rs` + +Add: `pub mod index_opener;` and `pub use index_opener::{IndexOpener, DirectIndexOpener};` + +## What Does NOT Change + +- `src/fast_field_reader.rs` — reads from an opened `Index`, called from `open()`. No change. +- `src/full_text_udf.rs` — pure UDF, no Index reference. +- `src/topk_pushdown.rs` — operates on plan structure, doesn't touch Index. +- `src/ordinal_group_by.rs` — operates on plan structure, doesn't touch Index. +- All existing tests — `TantivyTableProvider::new(index)` still works (wraps in DirectIndexOpener). + +## Test Plan + +### Existing tests pass unchanged + +All tests use `TantivyTableProvider::new(index)` which now wraps in `DirectIndexOpener`. Behavior identical. Run full test suite, expect zero failures. 
+ +### New tests for IndexOpener + +```rust +#[tokio::test] +async fn test_direct_opener_schema() { + let index = create_test_index(); + let opener = DirectIndexOpener::new(index.clone()); + assert_eq!(opener.schema(), index.schema()); +} + +#[tokio::test] +async fn test_direct_opener_segment_sizes() { + let index = create_test_index(); // single segment, 5 docs + let opener = DirectIndexOpener::new(index); + let sizes = opener.segment_sizes(); + assert_eq!(sizes.len(), 1); + assert_eq!(sizes[0], 5); +} + +#[tokio::test] +async fn test_direct_opener_multi_segment() { + let index = create_multi_segment_test_index(); // 2 segments: 3 + 2 docs + let opener = DirectIndexOpener::new(index); + let sizes = opener.segment_sizes(); + assert_eq!(sizes.len(), 2); + assert_eq!(sizes[0], 3); + assert_eq!(sizes[1], 2); +} + +#[tokio::test] +async fn test_from_opener_matches_new() { + let index = create_test_index(); + let provider_old = TantivyTableProvider::new(index.clone()); + let provider_new = TantivyTableProvider::from_opener(Arc::new(DirectIndexOpener::new(index))); + + // Same schema + assert_eq!(provider_old.schema(), provider_new.schema()); + + // Same query results + let ctx_old = SessionContext::new(); + ctx_old.register_table("t", Arc::new(provider_old)).unwrap(); + let old_result = ctx_old.sql("SELECT id, price FROM t ORDER BY id") + .await.unwrap().collect().await.unwrap(); + + let ctx_new = SessionContext::new(); + ctx_new.register_table("t", Arc::new(provider_new)).unwrap(); + let new_result = ctx_new.sql("SELECT id, price FROM t ORDER BY id") + .await.unwrap().collect().await.unwrap(); + + assert_eq!(old_result, new_result); +} + +#[tokio::test] +async fn test_lazy_opener_called_at_execute_time() { + use std::sync::atomic::{AtomicBool, Ordering}; + + let index = create_test_index(); + let opened = Arc::new(AtomicBool::new(false)); + let opened_clone = opened.clone(); + + // Custom opener that tracks when open() is called + struct TrackingOpener { + inner: 
DirectIndexOpener, + opened: Arc, + } + impl fmt::Debug for TrackingOpener { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "TrackingOpener") + } + } + #[async_trait] + impl IndexOpener for TrackingOpener { + async fn open(&self) -> Result { + self.opened.store(true, Ordering::SeqCst); + self.inner.open().await + } + fn schema(&self) -> tantivy::schema::Schema { self.inner.schema() } + fn segment_sizes(&self) -> Vec { self.inner.segment_sizes() } + } + + let opener = Arc::new(TrackingOpener { + inner: DirectIndexOpener::new(index), + opened: opened_clone, + }); + let provider = TantivyTableProvider::from_opener(opener); + + let ctx = SessionContext::new(); + ctx.register_table("t", Arc::new(provider)).unwrap(); + + // After registration and planning, open() should NOT have been called + let df = ctx.sql("SELECT id FROM t").await.unwrap(); + assert!(!opened.load(Ordering::SeqCst), "Index should not be opened during planning"); + + // After collect(), open() SHOULD have been called + let _batches = df.collect().await.unwrap(); + assert!(opened.load(Ordering::SeqCst), "Index should be opened during execution"); +} +``` + +## Order of Implementation + +1. Create `src/index_opener.rs` with `IndexOpener` trait + `DirectIndexOpener` +2. Update `src/lib.rs` exports +3. Change `src/table_provider.rs` (TantivyTableProvider + FastFieldDataSource) +4. Run existing tests — should pass +5. Change `src/inverted_index_provider.rs` +6. Change `src/document_provider.rs` +7. Run existing tests — should pass +8. Change `src/agg_exec.rs` +9. Change `src/agg_pushdown.rs` (pass opener instead of index) +10. Change `src/filter_pushdown.rs` (use opener.schema()) +11. Change `src/catalog.rs` +12. Run full test suite +13. Add new IndexOpener-specific tests + +Each step is independently compilable and testable. The `new(index)` constructor stays throughout, so existing tests never break. 
diff --git a/testing-infrastructure.md b/testing-infrastructure.md new file mode 100644 index 00000000000..834ce3e736c --- /dev/null +++ b/testing-infrastructure.md @@ -0,0 +1,616 @@ +# Testing Infrastructure for Distributed DataFusion + Quickwit + +## Goals + +1. **Plan verification tests** — confirm the full physical plan structure (optimizer rules fired, correct node composition) +2. **Multi-worker execution tests** — simulate multiple workers with their own tantivy splits, execute queries across pseudo-workers locally +3. **Correctness tests** — same query returns same results in single-node vs distributed execution + +--- + +## Building Blocks from Each Codebase + +### df-distributed: In-Process Workers via `start_localhost_context` + +df-distributed tests spin up real Arrow Flight gRPC servers on random localhost ports: + +```rust +// From df-distributed/src/test_utils/localhost.rs +let (ctx, _guard, workers) = start_localhost_context(3, DefaultSessionBuilder).await; +``` + +This creates: +- A `SessionContext` configured with `DistributedPhysicalOptimizerRule` and a `WorkerResolver` pointing at the 3 workers +- 3 `Worker` instances, each running a Tonic Flight server on a random port +- A `_guard` that shuts down workers when dropped + +Custom worker state (for custom codecs) via `WorkerSessionBuilder`: + +```rust +async fn build_state(ctx: WorkerQueryContext) -> Result { + Ok(ctx.builder + .with_distributed_user_codec(MyCodec) + .build()) +} +let (ctx, _guard, workers) = start_localhost_context(3, build_state).await; +ctx.set_distributed_user_codec(MyCodec); +``` + +### TantivyDF: Plan Assertions via EXPLAIN + +```rust +fn plan_to_string(batches: &[RecordBatch]) -> String { + let batch = collect_batches(batches); + let plan_col = batch.column(1).as_string::(); + (0..batch.num_rows()) + .map(|i| plan_col.value(i)) + .collect::>() + .join("\n") +} + +// Usage: +let df = ctx.sql("EXPLAIN SELECT ...").await?; +let plan = plan_to_string(&df.collect().await?); 
+assert!(plan.contains("InvertedIndexDataSource(segments=1, query=true, topk=Some(1))")); +``` + +TantivyDF also uses exact string matching on the physical plan portion of EXPLAIN output. + +### TantivyDF: Test Index Creation + +```rust +// Single segment (5 docs, 8 field types) +let index = create_test_index(); + +// Two segments (3 + 2 docs, different ordinal dictionaries) +let index = create_multi_segment_test_index(); + +// With deleted documents +let index = create_test_index_with_deletes(); +``` + +### Quickwit: TestSandbox (Real Splits in RAM) + +```rust +let sandbox = TestSandbox::create("my-index", doc_mapping_yaml, "{}", &["body"]).await?; +sandbox.add_documents(vec![json!({...}), json!({...})]).await?; // creates 1 split +sandbox.add_documents(vec![json!({...})]).await?; // creates another split + +// Access: +sandbox.metastore() // MetastoreServiceClient +sandbox.storage() // Arc (RAM) +sandbox.doc_mapper() // Arc +sandbox.storage_resolver() // StorageResolver +sandbox.index_uid() // IndexUid +``` + +Each `add_documents()` call creates one real tantivy split in RamStorage, published in the metastore. 
+ +### Quickwit: Opening a Split Directly + +```rust +// From tests.rs:1012-1052 — bypasses root_search, calls leaf directly +let splits = sandbox.metastore() + .list_splits(ListSplitsRequest::try_from_index_uid(sandbox.index_uid()).unwrap()) + .await?.collect_splits().await?; +let split_offsets: Vec<_> = splits.iter() + .map(|s| extract_split_and_footer_offsets(&s.split_metadata)) + .collect(); + +let searcher_ctx = Arc::new(SearcherContext::new(SearcherConfig::default(), None)); + +// Open the tantivy::Index from the split +let (index, _) = open_index_with_caches( + &searcher_ctx, sandbox.storage(), &split_offsets[0], + Some(sandbox.doc_mapper().tokenizer_manager()), None, +).await?; +// `index` is now a standard tantivy::Index — hand it to TantivyDF +``` + +--- + +## Test Layer 1: Plan Verification + +Verify the physical plan structure to confirm optimizer rules fire correctly and nodes compose as expected. + +### 1a. Single-Split Plan Structure + +```rust +#[tokio::test] +async fn test_plan_single_split_fast_field_only() { + let sandbox = TestSandbox::create("plan-test", DOC_MAPPING, "{}", &["body"]).await.unwrap(); + sandbox.add_documents(test_docs()).await.unwrap(); + + // Open split, create tantivy-df providers + let (index, _) = open_split(&sandbox, 0).await; + let ctx = create_tantivy_df_session(&index); + + // Verify plan structure + let df = ctx.sql("EXPLAIN SELECT f.ts, f.level FROM f WHERE f.level = 'ERROR'").await.unwrap(); + let plan = plan_to_string(&df.collect().await.unwrap()); + + // Should show FastFieldDataSource with pushed filter + assert!(plan.contains("FastFieldDataSource")); + assert!(!plan.contains("InvertedIndexDataSource")); // no text search = no inverted index +} + +#[tokio::test] +async fn test_plan_full_text_with_topk() { + let (index, _) = open_split(&sandbox, 0).await; + let ctx = create_tantivy_df_session_with_rules(&index); // all 4 optimizer rules + + let df = ctx.sql("EXPLAIN + SELECT f.ts, inv._score + FROM inv + JOIN f ON 
f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord + WHERE full_text(inv.body, 'error') + ORDER BY inv._score DESC LIMIT 10 + ").await.unwrap(); + let plan = plan_to_string(&df.collect().await.unwrap()); + + // TopKPushdown should have fired + assert!(plan.contains("topk=Some(10)")); + // InvertedIndexDataSource should have the query + assert!(plan.contains("InvertedIndexDataSource(segments=1, query=true")); +} + +#[tokio::test] +async fn test_plan_filter_pushdown_merges_into_inverted() { + let (index, _) = open_split(&sandbox, 0).await; + let ctx = create_tantivy_df_session_with_rules(&index); + + let df = ctx.sql("EXPLAIN + SELECT f.ts + FROM inv + JOIN f ON f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord + WHERE full_text(inv.body, 'error') AND f.status >= 500 + ").await.unwrap(); + let plan = plan_to_string(&df.collect().await.unwrap()); + + // FastFieldFilterPushdown should have moved status>=500 into inverted index + // FastFieldDataSource should only have DynamicFilter (no user predicates) + assert!(plan.contains("pushed_filters=[DynamicFilter")); + assert!(!plan.contains("status")); // status filter no longer on fast field side +} +``` + +### 1b. 
Multi-Split Plan Structure + +```rust +#[tokio::test] +async fn test_plan_multi_split_partitioning() { + let sandbox = TestSandbox::create("multi-plan", DOC_MAPPING, "{}", &["body"]).await.unwrap(); + sandbox.add_documents(docs_batch_1()).await.unwrap(); // split 1 + sandbox.add_documents(docs_batch_2()).await.unwrap(); // split 2 + sandbox.add_documents(docs_batch_3()).await.unwrap(); // split 3 + + // Open all 3 splits + let indexes = open_all_splits(&sandbox).await; // Vec<(Index, SplitMetadata)> + + // Create session with union of all splits + let ctx = SessionContext::new(); + // Register per-split providers, or a QuickwitTableProvider that unions them + for (i, (index, _)) in indexes.iter().enumerate() { + ctx.register_table(&format!("f_{i}"), Arc::new(TantivyTableProvider::new(index.clone()))).unwrap(); + } + + // Verify the plan has N partitions + let df = ctx.sql("EXPLAIN SELECT * FROM f_0 UNION ALL SELECT * FROM f_1 UNION ALL SELECT * FROM f_2").await.unwrap(); + let plan = plan_to_string(&df.collect().await.unwrap()); + + // Should show 3 DataSourceExec nodes + let datasource_count = plan.matches("DataSourceExec").count(); + assert_eq!(datasource_count, 3); +} +``` + +### 1c. 
df-distributed Plan Structure (Snapshot Tests) + +Using df-distributed's `display_plan_ascii` for full distributed plan verification: + +```rust +#[tokio::test] +async fn test_distributed_plan_structure() { + // Start workers with tantivy-df support + let (ctx, _guard, _workers) = start_localhost_context(3, build_tantivy_worker_state).await; + ctx.set_distributed_user_codec(TantivyIndexOpenerCodec::new(storage_resolver, searcher_ctx)); + + // Register QuickwitTableProvider (metastore-backed) + register_quickwit_table(&ctx, &sandbox).await; + + let df = ctx.sql("SELECT ts, level FROM logs WHERE level = 'ERROR' ORDER BY ts DESC LIMIT 10").await.unwrap(); + let plan = df.create_physical_plan().await.unwrap(); + let plan_str = display_plan_ascii(plan.as_ref(), false); + + // Snapshot test: verify stage decomposition + assert_snapshot!(plan_str, @r" + ┌───── DistributedExec ── Tasks: t0:[p0] + │ SortPreservingMergeExec: [ts DESC], fetch=10 + │ [Stage 1] => NetworkCoalesceExec: ... + └────────────────────────────────────── + ┌───── Stage 1 ── Tasks: t0:[p0] t1:[p0] t2:[p0] + │ SortExec: TopK(fetch=10), expr=[ts DESC] + │ DataSourceExec: FastFieldDataSource(...) + └────────────────────────────────────── + "); +} +``` + +--- + +## Test Layer 2: Multi-Worker Execution with Real Tantivy Splits + +### The Pattern + +1. Use `TestSandbox` to create N splits with known, distinct documents +2. Start N in-process df-distributed workers +3. Each worker opens its assigned split via `IndexOpener` +4. Execute a query through the coordinator +5. Verify results match expected output + +### Core Infrastructure + +```rust +/// Create a TestSandbox with N splits, each containing distinct documents. 
+async fn create_multi_split_sandbox( + num_splits: usize, +) -> (TestSandbox, Vec) { + let sandbox = TestSandbox::create("distributed-test", DOC_MAPPING, "{}", &["body"]).await.unwrap(); + + for i in 0..num_splits { + sandbox.add_documents(vec![ + json!({"body": format!("document {} in split {}", 1, i), "ts": format!("2024-01-0{}T00:00:00Z", i+1), "level": "INFO", "status": 200}), + json!({"body": format!("error {} in split {}", 2, i), "ts": format!("2024-01-0{}T01:00:00Z", i+1), "level": "ERROR", "status": 500}), + ]).await.unwrap(); + } + + let splits = sandbox.metastore() + .list_splits(ListSplitsRequest::try_from_index_uid(sandbox.index_uid()).unwrap()) + .await.unwrap() + .collect_splits().await.unwrap() + .into_iter().map(|s| s.split_metadata).collect(); + + (sandbox, splits) +} + +/// Worker session builder that knows how to open quickwit splits. +async fn build_tantivy_worker_state( + ctx: WorkerQueryContext, +) -> Result { + Ok(ctx.builder + .with_distributed_user_codec(QuickwitIndexOpenerCodec::new( + storage_resolver.clone(), + searcher_context.clone(), + )) + .build()) +} + +/// Helper to open a split from a TestSandbox and create tantivy-df providers. +async fn open_split(sandbox: &TestSandbox, split_idx: usize) -> (Index, SplitMetadata) { + let splits = sandbox.metastore() + .list_splits(ListSplitsRequest::try_from_index_uid(sandbox.index_uid()).unwrap()) + .await.unwrap().collect_splits().await.unwrap(); + let split = &splits[split_idx].split_metadata; + let offsets = extract_split_and_footer_offsets(split); + let searcher_ctx = Arc::new(SearcherContext::new(SearcherConfig::default(), None)); + let (index, _) = open_index_with_caches( + &searcher_ctx, sandbox.storage(), &offsets, + Some(sandbox.doc_mapper().tokenizer_manager()), None, + ).await.unwrap(); + (index, split.clone()) +} + +/// Helper to set up a session with all tantivy-df providers + optimizer rules for a single index. 
+fn create_tantivy_df_session(index: &Index) -> SessionContext { + let num_segments = index.searchable_segments().unwrap().len(); + let config = SessionConfig::new().with_target_partitions(num_segments.max(1)); + let state = SessionStateBuilder::new() + .with_config(config) + .with_default_features() + .with_physical_optimizer_rule(Arc::new(FastFieldFilterPushdown::new())) + .with_physical_optimizer_rule(Arc::new(TopKPushdown::new())) + .with_physical_optimizer_rule(Arc::new(AggPushdown::new())) + .with_physical_optimizer_rule(Arc::new(OrdinalGroupByOptimization::new())) + .build(); + let ctx = SessionContext::new_with_state(state); + ctx.register_udf(full_text_udf()); + ctx.register_table("f", Arc::new(TantivyTableProvider::new(index.clone()))).unwrap(); + ctx.register_table("inv", Arc::new(TantivyInvertedIndexProvider::new(index.clone()))).unwrap(); + ctx.register_table("d", Arc::new(TantivyDocumentProvider::new(index.clone()))).unwrap(); + ctx +} +``` + +### 2a. Single-Node Correctness (Baseline) + +```rust +#[tokio::test] +async fn test_single_node_fast_field_query() { + let (sandbox, splits) = create_multi_split_sandbox(3).await; + let (index, _) = open_split(&sandbox, 0).await; + let ctx = create_tantivy_df_session(&index); + + let batches = ctx.sql("SELECT ts, level FROM f WHERE level = 'ERROR'") + .await.unwrap().collect().await.unwrap(); + let batch = collect_batches(&batches); + + assert_eq!(batch.num_rows(), 1); // one ERROR doc per split + sandbox.assert_quit().await; +} + +#[tokio::test] +async fn test_single_node_full_text_search() { + let (sandbox, _) = create_multi_split_sandbox(1).await; + let (index, _) = open_split(&sandbox, 0).await; + let ctx = create_tantivy_df_session(&index); + + let batches = ctx.sql(" + SELECT f.ts, f.level + FROM inv + JOIN f ON f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord + WHERE full_text(inv.body, 'error') + ").await.unwrap().collect().await.unwrap(); + let batch = collect_batches(&batches); + + 
assert_eq!(batch.num_rows(), 1); + sandbox.assert_quit().await; +} + +#[tokio::test] +async fn test_single_node_aggregation() { + let (sandbox, _) = create_multi_split_sandbox(1).await; + let (index, _) = open_split(&sandbox, 0).await; + let ctx = create_tantivy_df_session(&index); + + let batches = ctx.sql("SELECT level, COUNT(*) as cnt FROM f GROUP BY level ORDER BY cnt DESC") + .await.unwrap().collect().await.unwrap(); + let batch = collect_batches(&batches); + + assert_eq!(batch.num_rows(), 2); // INFO and ERROR + sandbox.assert_quit().await; +} +``` + +### 2b. Multi-Worker Distributed Execution + +```rust +#[tokio::test] +async fn test_distributed_fast_field_query_across_splits() { + let (sandbox, splits) = create_multi_split_sandbox(3).await; + + // Start 3 workers, each will handle one split + let (ctx, _guard, _workers) = start_localhost_context(3, |wctx| async move { + Ok(wctx.builder + .with_distributed_user_codec(QuickwitIndexOpenerCodec::new( + sandbox.storage_resolver(), + Arc::new(SearcherContext::new(SearcherConfig::default(), None)), + )) + .build()) + }).await; + ctx.set_distributed_user_codec(QuickwitIndexOpenerCodec::new(...)); + + // Register QuickwitTableProvider — calls metastore, returns SplitIndexOpeners + let provider = QuickwitTableProvider::new( + sandbox.metastore(), sandbox.storage_resolver(), + Arc::new(SearcherContext::new(SearcherConfig::default(), None)), + sandbox.index_uid(), + ); + ctx.register_table("logs", Arc::new(provider)).unwrap(); + + // Query across all 3 splits + let batches = ctx.sql("SELECT ts, level FROM logs WHERE level = 'ERROR' ORDER BY ts") + .await.unwrap().collect().await.unwrap(); + let batch = collect_batches(&batches); + + // 3 splits × 1 ERROR doc each = 3 results + assert_eq!(batch.num_rows(), 3); +} + +#[tokio::test] +async fn test_distributed_aggregation_across_splits() { + let (sandbox, _) = create_multi_split_sandbox(3).await; + let (ctx, _guard, _) = start_distributed_context(&sandbox, 3).await; + + 
let batches = ctx.sql("
+        SELECT level, COUNT(*) as cnt FROM logs GROUP BY level ORDER BY level
+    ").await.unwrap().collect().await.unwrap();
+    let batch = collect_batches(&batches);
+
+    // 3 splits × 2 docs each: 3 ERROR + 3 INFO
+    assert_eq!(batch.num_rows(), 2);
+    // Verify counts
+    let counts: Vec<i64> = batch.column_by_name("cnt").unwrap()
+        .as_primitive::<Int64Type>().iter().map(|v| v.unwrap()).collect();
+    assert_eq!(counts, vec![3, 3]); // ERROR=3, INFO=3
+}
+
+#[tokio::test]
+async fn test_distributed_topk_across_splits() {
+    let (sandbox, _) = create_multi_split_sandbox(3).await;
+    let (ctx, _guard, _) = start_distributed_context(&sandbox, 3).await;
+
+    let batches = ctx.sql("
+        SELECT ts, level FROM logs ORDER BY ts DESC LIMIT 2
+    ").await.unwrap().collect().await.unwrap();
+    let batch = collect_batches(&batches);
+
+    // Global top-2 by timestamp across 3 splits
+    assert_eq!(batch.num_rows(), 2);
+}
+
+#[tokio::test]
+async fn test_distributed_full_text_with_topk() {
+    let (sandbox, _) = create_multi_split_sandbox(3).await;
+    let (ctx, _guard, _) = start_distributed_context(&sandbox, 3).await;
+
+    let batches = ctx.sql("
+        SELECT f.ts, inv._score
+        FROM inv
+        JOIN f ON f._doc_id = inv._doc_id AND f._segment_ord = inv._segment_ord
+        WHERE full_text(inv.body, 'error')
+        ORDER BY inv._score DESC LIMIT 2
+    ").await.unwrap().collect().await.unwrap();
+    let batch = collect_batches(&batches);
+
+    assert_eq!(batch.num_rows(), 2);
+}
+```
+
+### 2c. Correctness: Local vs Distributed Same Results
+
+```rust
+#[tokio::test]
+async fn test_local_vs_distributed_same_results() {
+    let (sandbox, splits) = create_multi_split_sandbox(3).await;
+
+    // Local: open all splits, query in one session
+    let mut local_batches = Vec::new();
+    for i in 0..splits.len() {
+        let (index, _) = open_split(&sandbox, i).await;
+        let ctx = create_tantivy_df_session(&index);
+        let batches = ctx.sql("SELECT level, COUNT(*) as cnt FROM f GROUP BY level")
+            .await.unwrap().collect().await.unwrap();
+        local_batches.extend(batches);
+    }
+    // Manually merge local results (sum counts per level)
+
+    // Distributed: query across all splits via workers
+    let (ctx, _guard, _) = start_distributed_context(&sandbox, 3).await;
+    let distributed_batches = ctx.sql("SELECT level, COUNT(*) as cnt FROM logs GROUP BY level ORDER BY level")
+        .await.unwrap().collect().await.unwrap();
+
+    // Compare
+    assert_eq!(local_merged_result, distributed_result);
+}
+```
+
+---
+
+## Test Layer 3: Codec Serialization Round-Trip
+
+Verify that `IndexOpener` serializes and deserializes correctly across the Flight boundary.
+
+```rust
+#[tokio::test]
+async fn test_index_opener_codec_roundtrip() {
+    let sandbox = TestSandbox::create("codec-test", DOC_MAPPING, "{}", &["body"]).await.unwrap();
+    sandbox.add_documents(test_docs()).await.unwrap();
+
+    let splits = get_splits(&sandbox).await;
+    let opener = SplitIndexOpener::new(
+        splits[0].split_id.clone(),
+        splits[0].footer_offsets.clone(),
+        sandbox.index_uid().index_id.to_string(),
+        serde_json::to_string(&sandbox.doc_mapper()).unwrap(),
+        Arc::new(SearcherContext::new(SearcherConfig::default(), None)),
+        sandbox.storage_resolver(),
+    );
+
+    let codec = QuickwitIndexOpenerCodec::new(...);
+
+    // Encode
+    let mut buf = Vec::new();
+    codec.encode(&opener, &mut buf).unwrap();
+
+    // Decode
+    let decoded_opener = codec.decode(&buf).unwrap();
+
+    // Verify the decoded opener can actually open the index
+    let index = decoded_opener.open().await.unwrap();
+    let reader = index.reader().unwrap();
+    let searcher = reader.searcher();
+    assert!(searcher.num_docs() > 0);
+
+    sandbox.assert_quit().await;
+}
+```
+
+---
+
+## Test Helpers to Build
+
+```rust
+// Helper: batch concatenation
+fn collect_batches(batches: &[RecordBatch]) -> RecordBatch {
+    arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap()
+}
+
+// Helper: extract plan string from EXPLAIN
+fn plan_to_string(batches: &[RecordBatch]) -> String {
+    let batch = collect_batches(batches);
+    let plan_col = batch.column(1).as_string::<i32>();
+    (0..batch.num_rows())
+        .map(|i| plan_col.value(i))
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+// Helper: open all splits from a sandbox
+async fn open_all_splits(sandbox: &TestSandbox) -> Vec<(Index, SplitMetadata)> {
+    let splits = sandbox.metastore()
+        .list_splits(ListSplitsRequest::try_from_index_uid(sandbox.index_uid()).unwrap())
+        .await.unwrap().collect_splits().await.unwrap();
+    let searcher_ctx = Arc::new(SearcherContext::new(SearcherConfig::default(), None));
+    let mut result = Vec::new();
+    for split in splits {
+        let offsets =
extract_split_and_footer_offsets(&split.split_metadata); + let (index, _) = open_index_with_caches( + &searcher_ctx, sandbox.storage(), &offsets, + Some(sandbox.doc_mapper().tokenizer_manager()), None, + ).await.unwrap(); + result.push((index, split.split_metadata)); + } + result +} + +// Helper: start distributed context with quickwit split support +async fn start_distributed_context( + sandbox: &TestSandbox, + num_workers: usize, +) -> (SessionContext, impl Drop, Vec) { + let storage_resolver = sandbox.storage_resolver(); + let searcher_ctx = Arc::new(SearcherContext::new(SearcherConfig::default(), None)); + + let (ctx, guard, workers) = start_localhost_context(num_workers, move |wctx| { + let sr = storage_resolver.clone(); + let sc = searcher_ctx.clone(); + async move { + Ok(wctx.builder + .with_distributed_user_codec(QuickwitIndexOpenerCodec::new(sr, sc)) + .build()) + } + }).await; + + ctx.set_distributed_user_codec(QuickwitIndexOpenerCodec::new( + sandbox.storage_resolver(), + Arc::new(SearcherContext::new(SearcherConfig::default(), None)), + )); + + // Register table backed by metastore + let provider = QuickwitTableProvider::from_sandbox(sandbox); + ctx.register_table("logs", Arc::new(provider)).unwrap(); + + (ctx, guard, workers) +} +``` + +--- + +## Test Matrix + +| Test | Layer | Splits | Workers | What it verifies | +|------|-------|--------|---------|-----------------| +| `test_plan_single_split_fast_field_only` | Plan | 1 | 0 | FastFieldDataSource in plan, no inverted index | +| `test_plan_full_text_with_topk` | Plan | 1 | 0 | TopKPushdown fires, `topk=Some(K)` in plan | +| `test_plan_filter_pushdown_merges_into_inverted` | Plan | 1 | 0 | FastFieldFilterPushdown fires, predicates move | +| `test_plan_multi_split_partitioning` | Plan | 3 | 0 | 3 DataSourceExec nodes in union | +| `test_distributed_plan_structure` | Plan | 3 | 3 | Stage decomposition, NetworkCoalesceExec | +| `test_single_node_fast_field_query` | Exec | 1 | 0 | Basic fast field read → 
Arrow | +| `test_single_node_full_text_search` | Exec | 1 | 0 | Inverted index join, full_text UDF | +| `test_single_node_aggregation` | Exec | 1 | 0 | GROUP BY + COUNT over fast fields | +| `test_distributed_fast_field_query_across_splits` | Exec | 3 | 3 | Cross-split query, results merged | +| `test_distributed_aggregation_across_splits` | Exec | 3 | 3 | Partial/final agg across workers | +| `test_distributed_topk_across_splits` | Exec | 3 | 3 | Global top-K from local top-K per worker | +| `test_distributed_full_text_with_topk` | Exec | 3 | 3 | Full pipeline: text search + TopK + distributed | +| `test_local_vs_distributed_same_results` | Correctness | 3 | 3 | Same results from both paths | +| `test_index_opener_codec_roundtrip` | Codec | 1 | 0 | SplitIndexOpener survives encode/decode | From 2aeb96f9f2813c3450a9e6e53d9e6a769124923c Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 14:03:33 -0500 Subject: [PATCH 07/19] feat: pass tokenizer manager through StorageSplitOpener StorageSplitOpener now accepts a TokenizerManager via with_tokenizer_manager(). The catalog passes the doc mapper's tokenizer manager when creating opener factories. This ensures full-text queries on fields with custom tokenizers (language-specific, lowercase, custom patterns) work correctly. Without the tokenizer manager, tantivy falls back to the default tokenizer which produces wrong query results. Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/src/catalog.rs | 21 +++++++++++-------- .../quickwit-datafusion/src/split_opener.rs | 16 ++++++++++++-- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/quickwit/quickwit-datafusion/src/catalog.rs b/quickwit/quickwit-datafusion/src/catalog.rs index 6a29a0df08c..93d641274aa 100644 --- a/quickwit/quickwit-datafusion/src/catalog.rs +++ b/quickwit/quickwit-datafusion/src/catalog.rs @@ -93,15 +93,18 @@ impl QuickwitSchemaProvider { // Build the opener factory that creates StorageSplitOpeners. 
let schema_for_factory = tantivy_schema.clone(); let opener_factory: OpenerFactory = Arc::new(move |split_meta| { - Arc::new(StorageSplitOpener::new( - split_meta.split_id.clone(), - schema_for_factory.clone(), - vec![], // segment sizes not known until open — from_opener handles this - searcher_context.clone(), - storage.clone(), - split_meta.footer_offsets.start, - split_meta.footer_offsets.end, - )) as Arc + Arc::new( + StorageSplitOpener::new( + split_meta.split_id.clone(), + schema_for_factory.clone(), + vec![], // segment sizes discovered at open time + searcher_context.clone(), + storage.clone(), + split_meta.footer_offsets.start, + split_meta.footer_offsets.end, + ) + .with_tokenizer_manager(tokenizer_manager.clone()), + ) as Arc }); let provider = QuickwitTableProvider::new( diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs index 8a79dbd0cef..948854c16d9 100644 --- a/quickwit/quickwit-datafusion/src/split_opener.rs +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -7,6 +7,7 @@ use dashmap::DashMap; use datafusion::common::Result; use datafusion::error::DataFusionError; use quickwit_proto::search::SplitIdAndFooterOffsets; +use quickwit_query::tokenizers::TokenizerManager; use quickwit_search::SearcherContext; use quickwit_storage::Storage; use tantivy::Index; @@ -128,6 +129,7 @@ pub struct StorageSplitOpener { searcher_context: Arc, storage: Arc, footer_offsets: SplitIdAndFooterOffsets, + tokenizer_manager: Option, } impl StorageSplitOpener { @@ -153,8 +155,18 @@ impl StorageSplitOpener { searcher_context, storage, footer_offsets, + tokenizer_manager: None, } } + + /// Set the tokenizer manager from the index's doc mapper. + /// + /// Required for full-text queries on fields with custom tokenizers. + /// Without it, tantivy falls back to the default tokenizer. 
+ pub fn with_tokenizer_manager(mut self, tm: TokenizerManager) -> Self { + self.tokenizer_manager = Some(tm); + self + } } impl fmt::Debug for StorageSplitOpener { @@ -172,8 +184,8 @@ impl IndexOpener for StorageSplitOpener { &self.searcher_context, self.storage.clone(), &self.footer_offsets, - None, // tokenizer_manager — TODO: pass from doc mapper - None, // ephemeral cache — TODO: pass from searcher context + self.tokenizer_manager.as_ref(), + None, // ephemeral cache — can be added for hot-path optimization ) .await .map_err(|e| DataFusionError::Execution(format!("open split {}: {e}", self.split_id)))?; From d2fa8088407afd07713c4d9dea01868241129c8c Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 14:21:05 -0500 Subject: [PATCH 08/19] fix: remove unnecessary RepartitionExec in join plans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set target_partitions=1 on the coordinator session. This makes the DF optimizer use CollectLeft join mode (broadcast build side) instead of Partitioned mode with hash repartition. Cross-worker parallelism is handled by df-distributed's stage decomposition. Per-split parallelism comes from tantivy segment count (declared via output_partitioning on the DataSource). target_partitions was only adding wasteful shuffles. Before: HashJoinExec(Partitioned) + 2x RepartitionExec per split After: HashJoinExec(CollectLeft) — no repartition, no shuffle Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/src/session.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/quickwit/quickwit-datafusion/src/session.rs b/quickwit/quickwit-datafusion/src/session.rs index d8306459448..00ac7090086 100644 --- a/quickwit/quickwit-datafusion/src/session.rs +++ b/quickwit/quickwit-datafusion/src/session.rs @@ -58,7 +58,13 @@ impl QuickwitSessionBuilder { /// [`QuickwitSchemaProvider`] that lazily resolves index names from /// the metastore. 
Otherwise (tests), tables must be registered manually. pub fn build_session(&self) -> SessionContext { - let config = SessionConfig::new(); + // Set target_partitions=1 so the optimizer doesn't add + // RepartitionExec nodes inside per-split join plans. + // Cross-worker parallelism is handled by df-distributed's + // stage decomposition, not DF's partition count. Per-split + // parallelism comes from tantivy segment count, which the + // providers declare via their output_partitioning(). + let config = SessionConfig::new().with_target_partitions(1); let worker_resolver = QuickwitWorkerResolver::new(self.searcher_pool.clone()); let state = SessionStateBuilder::new() From 85d306cdd3bd51eea04bc3493fb23266487d15ec Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 14:54:01 -0500 Subject: [PATCH 09/19] feat: rewrite integration test with real splits + storage + catalog Integration test now uses: - TestSandbox to create real indexed splits on RAM storage - StorageSplitOpener calling open_index_with_caches (with ByteRangeCache) - Real metastore for split discovery - SearcherPool for worker discovery - Flight service on tonic server Known gap: InvertedIndexDataSource does sync I/O on storage-backed directories. Quickwit's existing search path uses warmup() to pre-fetch posting lists before tantivy executes. The DF path needs warmup integration for full-text queries over storage-backed splits. Fast-field-only queries work without warmup. Also adds footer_start/footer_end to OpenerMetadata and the codec proto so storage-backed openers get the split bundle offsets across the wire. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/Cargo.lock | 2 + quickwit/quickwit-datafusion/Cargo.toml | 3 + quickwit/quickwit-datafusion/src/flight.rs | 138 ++------ .../quickwit-datafusion/src/split_opener.rs | 12 +- .../tests/serve_integration.rs | 302 ++++++++++++------ quickwit/quickwit-serve/Cargo.toml | 1 + quickwit/quickwit-serve/src/lib.rs | 12 +- quickwit/quickwit-serve/src/rest.rs | 14 +- 8 files changed, 260 insertions(+), 224 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index daa94f123e7..b83bcc98781 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -8132,6 +8132,7 @@ dependencies = [ "quickwit-common", "quickwit-config", "quickwit-doc-mapper", + "quickwit-indexing", "quickwit-metastore", "quickwit-proto", "quickwit-query", @@ -8698,6 +8699,7 @@ dependencies = [ "serde_json", "serde_qs 0.15.0", "serde_with", + "tantivy-datafusion", "tempfile", "thiserror 2.0.17", "time", diff --git a/quickwit/quickwit-datafusion/Cargo.toml b/quickwit/quickwit-datafusion/Cargo.toml index 7312cffe4d3..3569cc1b97a 100644 --- a/quickwit/quickwit-datafusion/Cargo.toml +++ b/quickwit/quickwit-datafusion/Cargo.toml @@ -43,7 +43,10 @@ arrow-flight = "57" datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed", features = ["integration"] } dashmap = "6" quickwit-common = { workspace = true, features = ["testsuite"] } +quickwit-config = { workspace = true } +quickwit-indexing = { workspace = true, features = ["testsuite"] } quickwit-proto = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } quickwit-search = { workspace = true, features = ["testsuite"] } +quickwit-storage = { workspace = true } tokio = { workspace = true, features = ["test-util", "macros"] } diff --git a/quickwit/quickwit-datafusion/src/flight.rs b/quickwit/quickwit-datafusion/src/flight.rs index 7a1431ec0d4..f620cca52b4 100644 --- 
a/quickwit/quickwit-datafusion/src/flight.rs +++ b/quickwit/quickwit-datafusion/src/flight.rs @@ -1,4 +1,3 @@ -use std::pin::Pin; use std::sync::Arc; use arrow_flight::flight_service_server::{FlightService, FlightServiceServer}; @@ -11,38 +10,24 @@ use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_distributed::{DistributedExt, DistributedPhysicalOptimizerRule, Worker}; use futures::stream::BoxStream; use futures::TryStreamExt; -use tantivy_datafusion::{ - IndexOpener, OpenerFactoryExt, OpenerMetadata, TantivyCodec, full_text_udf, -}; +use tantivy_datafusion::{OpenerFactory, OpenerFactoryExt, TantivyCodec, full_text_udf}; use tonic::{Request, Response, Status, Streaming}; use crate::resolver::QuickwitWorkerResolver; -use crate::split_opener::{SplitIndexOpener, SplitRegistry}; /// A Flight service that handles both: /// - **df-distributed plan fragments** (worker execution) /// - **SQL queries from external clients** (via `do_get` with SQL string tickets) -/// -/// Dispatch: if the ticket decodes as a df-distributed protobuf, route -/// to the worker. Otherwise treat the ticket bytes as a UTF-8 SQL string. pub struct QuickwitFlightService { worker: Worker, - registry: Arc, + opener_factory: OpenerFactory, searcher_pool: quickwit_search::SearcherPool, } impl QuickwitFlightService { fn build_client_session(&self) -> SessionContext { - let mut config = SessionConfig::new(); - let registry = self.registry.clone(); - config.set_opener_factory(Arc::new(move |meta: OpenerMetadata| { - Arc::new(SplitIndexOpener::new( - meta.identifier, - registry.clone(), - meta.tantivy_schema, - meta.segment_sizes, - )) as Arc - })); + let mut config = SessionConfig::new().with_target_partitions(1); + config.set_opener_factory(self.opener_factory.clone()); let worker_resolver = QuickwitWorkerResolver::new(self.searcher_pool.clone()); @@ -59,16 +44,12 @@ impl QuickwitFlightService { ctx } - /// Execute a SQL query and return a Flight stream of RecordBatches. 
async fn execute_sql( &self, sql: &str, ) -> Result>>, Status> { let ctx = self.build_client_session(); - // TODO: register index tables from metastore based on query. - // For now, execute against an empty context (useful for SHOW TABLES, SELECT 1, etc.) - let df = ctx .sql(sql) .await @@ -94,27 +75,20 @@ impl QuickwitFlightService { /// Build the combined Flight service. /// -/// Handles both df-distributed worker traffic AND external SQL queries -/// on the same gRPC port. +/// The `opener_factory` is registered on the worker's `SessionConfig` +/// for decoding plan fragments. It's also used by the client-facing +/// SQL execution path. pub fn build_flight_service( - registry: Arc, + opener_factory: OpenerFactory, searcher_pool: quickwit_search::SearcherPool, ) -> FlightServiceServer { - let reg = registry.clone(); + let factory_for_worker = opener_factory.clone(); let worker = Worker::from_session_builder( move |ctx: datafusion_distributed::WorkerQueryContext| { - let registry = reg.clone(); + let factory = factory_for_worker.clone(); Box::pin(async move { let mut config = SessionConfig::new(); - config.set_opener_factory(Arc::new(move |meta: OpenerMetadata| { - Arc::new(SplitIndexOpener::new( - meta.identifier, - registry.clone(), - meta.tantivy_schema, - meta.segment_sizes, - )) as Arc - })); - + config.set_opener_factory(factory); Ok(ctx .builder .with_config(config) @@ -126,12 +100,12 @@ pub fn build_flight_service( FlightServiceServer::new(QuickwitFlightService { worker, - registry, + opener_factory, searcher_pool, }) } -// ── FlightService impl: dispatch worker vs SQL ────────────────────── +// ── FlightService impl ────────────────────────────────────────────── #[tonic::async_trait] impl FlightService for QuickwitFlightService { @@ -148,87 +122,21 @@ impl FlightService for QuickwitFlightService { request: Request, ) -> Result, Status> { let ticket = request.get_ref(); - - // Try to parse as a df-distributed plan fragment. 
- // df-distributed encodes DoGet as a prost::Message. - // If the ticket is valid protobuf with the right fields, it's a worker request. - // Otherwise, treat as UTF-8 SQL from an external client. if let Ok(sql) = std::str::from_utf8(&ticket.ticket) { - // Heuristic: df-distributed tickets are protobuf (binary), - // not valid UTF-8 text. If we got valid UTF-8 and it looks - // like SQL (or any human-readable string), handle as SQL. - // Pure protobuf tickets will almost never be valid UTF-8. - if sql.len() > 0 && !sql.starts_with('\0') { + if !sql.is_empty() && !sql.starts_with('\0') { return self.execute_sql(sql).await; } } - - // Delegate to df-distributed worker. self.worker.do_get(request).await } - // All other methods delegate to the worker (which returns unimplemented for most). - - async fn handshake( - &self, - request: Request>, - ) -> Result, Status> { - self.worker.handshake(request).await - } - - async fn list_flights( - &self, - request: Request, - ) -> Result, Status> { - self.worker.list_flights(request).await - } - - async fn get_flight_info( - &self, - request: Request, - ) -> Result, Status> { - self.worker.get_flight_info(request).await - } - - async fn get_schema( - &self, - request: Request, - ) -> Result, Status> { - self.worker.get_schema(request).await - } - - async fn do_put( - &self, - request: Request>, - ) -> Result, Status> { - self.worker.do_put(request).await - } - - async fn do_exchange( - &self, - request: Request>, - ) -> Result, Status> { - self.worker.do_exchange(request).await - } - - async fn do_action( - &self, - request: Request, - ) -> Result, Status> { - self.worker.do_action(request).await - } - - async fn list_actions( - &self, - request: Request, - ) -> Result, Status> { - self.worker.list_actions(request).await - } - - async fn poll_flight_info( - &self, - request: Request, - ) -> Result, Status> { - self.worker.poll_flight_info(request).await - } + async fn handshake(&self, r: Request>) -> Result, Status> { 
self.worker.handshake(r).await } + async fn list_flights(&self, r: Request) -> Result, Status> { self.worker.list_flights(r).await } + async fn get_flight_info(&self, r: Request) -> Result, Status> { self.worker.get_flight_info(r).await } + async fn get_schema(&self, r: Request) -> Result, Status> { self.worker.get_schema(r).await } + async fn do_put(&self, r: Request>) -> Result, Status> { self.worker.do_put(r).await } + async fn do_exchange(&self, r: Request>) -> Result, Status> { self.worker.do_exchange(r).await } + async fn do_action(&self, r: Request) -> Result, Status> { self.worker.do_action(r).await } + async fn list_actions(&self, r: Request) -> Result, Status> { self.worker.list_actions(r).await } + async fn poll_flight_info(&self, r: Request) -> Result, Status> { self.worker.poll_flight_info(r).await } } diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs index 948854c16d9..53af04d1830 100644 --- a/quickwit/quickwit-datafusion/src/split_opener.rs +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -179,13 +179,23 @@ impl fmt::Debug for StorageSplitOpener { #[async_trait] impl IndexOpener for StorageSplitOpener { + fn footer_range(&self) -> (u64, u64) { + (self.footer_offsets.split_footer_start, self.footer_offsets.split_footer_end) + } + async fn open(&self) -> Result { + // Use an unbounded byte range cache so that tantivy can do + // synchronous reads on the storage-backed directory. Without + // this, StorageDirectory errors on sync reads. 
+ let byte_range_cache = quickwit_storage::ByteRangeCache::with_infinite_capacity( + &quickwit_storage::STORAGE_METRICS.shortlived_cache, + ); let (index, _hot_directory) = quickwit_search::leaf::open_index_with_caches( &self.searcher_context, self.storage.clone(), &self.footer_offsets, self.tokenizer_manager.as_ref(), - None, // ephemeral cache — can be added for hot-path optimization + Some(byte_range_cache), ) .await .map_err(|e| DataFusionError::Execution(format!("open split {}: {e}", self.split_id)))?; diff --git a/quickwit/quickwit-datafusion/tests/serve_integration.rs b/quickwit/quickwit-datafusion/tests/serve_integration.rs index 25f393589fd..7e46fddc237 100644 --- a/quickwit/quickwit-datafusion/tests/serve_integration.rs +++ b/quickwit/quickwit-datafusion/tests/serve_integration.rs @@ -1,11 +1,14 @@ -//! Integration test that exercises the real serving path: -//! Flight service on the same gRPC port, SearcherPool for worker -//! discovery, QuickwitSessionBuilder for distributed execution. +//! Integration test that exercises the full production path: //! -//! Each simulated "searcher node" runs a tonic server with the Flight -//! service — the same way quickwit-serve/grpc.rs mounts it. Workers -//! discover each other via the SearcherPool, just like Chitchat would -//! populate it in production. +//! - Real splits created via TestSandbox (indexing pipeline → storage) +//! - StorageSplitOpener calling open_index_with_caches +//! - QuickwitSchemaProvider catalog resolving indexes from metastore +//! - Flight + SearchService on the same tonic server +//! - SearcherPool for worker discovery +//! - Distributed query execution via Arrow Flight +//! +//! The only difference from production is localhost networking +//! and manual SearcherPool population (instead of Chitchat). 
use std::net::SocketAddr; use std::sync::Arc; @@ -16,52 +19,36 @@ use datafusion::physical_plan::execute_stream; use datafusion::prelude::*; use datafusion_distributed::display_plan_ascii; use futures::TryStreamExt; -use quickwit_search::SearcherPool; -use tantivy::schema::{SchemaBuilder, FAST, STORED, TEXT}; -use tantivy::{Index, IndexWriter, TantivyDocument}; +use quickwit_config::SearcherConfig; +use quickwit_indexing::TestSandbox; +use quickwit_search::{ + MockSearchService, SearchServiceClient, SearcherPool, SearcherContext, +}; use tantivy_datafusion::{ full_text_udf, IndexOpener, TantivyInvertedIndexProvider, TantivyTableProvider, }; use tokio::net::TcpListener; use quickwit_datafusion::session::QuickwitSessionBuilder; -use quickwit_datafusion::{SplitIndexOpener, SplitRegistry, build_flight_service}; - -fn create_index(docs: &[(u64, i64, f64, &str)]) -> Index { - let mut builder = SchemaBuilder::new(); - let id_f = builder.add_u64_field("id", FAST | STORED); - let score_f = builder.add_i64_field("score", FAST); - let price_f = builder.add_f64_field("price", FAST); - let cat_f = builder.add_text_field("category", TEXT | FAST | STORED); - let schema = builder.build(); - - let index = Index::create_in_ram(schema); - let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000).unwrap(); - for &(id, score, price, category) in docs { - let mut doc = TantivyDocument::default(); - doc.add_u64(id_f, id); - doc.add_i64(score_f, score); - doc.add_f64(price_f, price); - doc.add_text(cat_f, category); - writer.add_document(doc).unwrap(); - } - writer.commit().unwrap(); - index -} +use quickwit_datafusion::split_opener::{SplitRegistry, StorageSplitOpener}; +use quickwit_datafusion::build_flight_service; +use tantivy_datafusion::{OpenerFactory, OpenerMetadata}; fn collect_batches(batches: &[RecordBatch]) -> RecordBatch { arrow::compute::concat_batches(&batches[0].schema(), batches).unwrap() } -/// Start a tonic gRPC server with the Flight service on a random 
port. -/// This mirrors what `quickwit-serve/src/grpc.rs` does — same port, -/// same tonic server, Flight is just another service alongside search. -async fn start_searcher_node(registry: Arc) -> SocketAddr { +/// Start a tonic server with the Flight service — matching how +/// grpc.rs mounts it alongside SearchService on the same port. +async fn start_searcher_node( + opener_factory: OpenerFactory, + searcher_pool: SearcherPool, +) -> SocketAddr { let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); let addr = listener.local_addr().unwrap(); let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener); - let flight_service = build_flight_service(registry, quickwit_search::SearcherPool::default()); + let flight_service = build_flight_service(opener_factory, searcher_pool); tokio::spawn(async move { tonic::transport::Server::builder() @@ -71,65 +58,154 @@ async fn start_searcher_node(registry: Arc) -> SocketAddr { .unwrap(); }); - // Let the server start accepting connections. tokio::time::sleep(std::time::Duration::from_millis(50)).await; addr } -/// End-to-end: 3 searcher nodes on real TCP, SearcherPool for discovery, -/// distributed full-text join query across 3 splits. +const DOC_MAPPING_YAML: &str = r#" +field_mappings: + - name: id + type: u64 + fast: true + stored: true + - name: category + type: text + tokenizer: default + fast: true + stored: true + - name: price + type: f64 + fast: true +"#; + +/// End-to-end: real splits on storage, real opener, real catalog, +/// real Flight servers, distributed query execution. 
#[tokio::test(flavor = "multi_thread", worker_threads = 4)] -async fn test_distributed_query_via_searcher_pool() { - // ── Shared split registry (in production: each node has its own) ─ - let registry = Arc::new(SplitRegistry::new()); +async fn test_full_production_path() { + // ── Create a real index with splits via TestSandbox ────────────── + let sandbox = TestSandbox::create("test-df-index", DOC_MAPPING_YAML, "{}", &["category"]) + .await + .unwrap(); + + // Index two batches → two splits. + sandbox + .add_documents(vec![ + serde_json::json!({"id": 1, "category": "electronics", "price": 1.5}), + serde_json::json!({"id": 2, "category": "electronics", "price": 2.5}), + serde_json::json!({"id": 3, "category": "books", "price": 3.5}), + ]) + .await + .unwrap(); - let idx1 = create_index(&[ - (1, 10, 1.5, "electronics"), - (2, 20, 2.5, "electronics"), - ]); - let idx2 = create_index(&[(3, 30, 3.5, "books"), (4, 40, 4.5, "books")]); - let idx3 = create_index(&[ - (5, 50, 5.5, "clothing"), - (6, 60, 6.5, "electronics"), - ]); - registry.insert("split-1".to_string(), idx1); - registry.insert("split-2".to_string(), idx2); - registry.insert("split-3".to_string(), idx3); - - // ── Start 3 searcher nodes (gRPC + Flight on same port) ───────── - let node1_addr = start_searcher_node(registry.clone()).await; - let node2_addr = start_searcher_node(registry.clone()).await; - let node3_addr = start_searcher_node(registry.clone()).await; - - // ── Populate SearcherPool (as Chitchat would in production) ────── + sandbox + .add_documents(vec![ + serde_json::json!({"id": 4, "category": "books", "price": 4.5}), + serde_json::json!({"id": 5, "category": "clothing", "price": 5.5}), + ]) + .await + .unwrap(); + + let metastore = sandbox.metastore(); + let storage_resolver = sandbox.storage_resolver(); + let index_uid = sandbox.index_uid(); + let doc_mapper = sandbox.doc_mapper(); + let tantivy_schema = doc_mapper.schema(); + let searcher_context = 
Arc::new(SearcherContext::new(SearcherConfig::default(), None)); + + // ── Build a storage-backed opener factory ────────────────────────── + // This is the same factory that production uses (via the catalog). + let storage = sandbox.storage(); + let sc = searcher_context.clone(); + let ts = tantivy_schema.clone(); + let opener_factory: OpenerFactory = Arc::new(move |meta: OpenerMetadata| { + Arc::new(StorageSplitOpener::new( + meta.identifier, + ts.clone(), + meta.segment_sizes, + sc.clone(), + storage.clone(), + meta.footer_start, + meta.footer_end, + )) as Arc + }); + + // ── Start 2 searcher nodes (Flight on same port as SearchService) ─ + let registry = Arc::new(SplitRegistry::new()); let searcher_pool = SearcherPool::default(); - for addr in [node1_addr, node2_addr, node3_addr] { + + let node1_addr = start_searcher_node(opener_factory.clone(), searcher_pool.clone()).await; + let node2_addr = start_searcher_node(opener_factory.clone(), searcher_pool.clone()).await; + + // Populate SearcherPool (as Chitchat would in production). 
+ for addr in [node1_addr, node2_addr] { searcher_pool.insert( addr, - quickwit_search::SearchServiceClient::from_service( - Arc::new(quickwit_search::MockSearchService::new()), + SearchServiceClient::from_service( + Arc::new(MockSearchService::new()), addr, ), ); } - // ── Build DataFusion session via QuickwitSessionBuilder ────────── - let mock_metastore = quickwit_proto::metastore::MetastoreServiceClient::from_mock( - quickwit_proto::metastore::MockMetastoreService::new(), - ); - let session_builder = - QuickwitSessionBuilder::new(mock_metastore, searcher_pool, registry.clone()); + // ── Build DataFusion session with catalog ──────────────────────── + let session_builder = QuickwitSessionBuilder::new( + metastore.clone(), + searcher_pool.clone(), + registry.clone(), + ) + .with_storage(storage_resolver.clone(), searcher_context.clone()); + let ctx = session_builder.build_session(); ctx.register_udf(full_text_udf()); - // ── Register per-split tantivy-df providers ───────────────────── - for (i, split_id) in ["split-1", "split-2", "split-3"].iter().enumerate() { - let index = registry.get(*split_id).unwrap().value().clone(); - let opener = Arc::new(SplitIndexOpener::from_index( - split_id.to_string(), - index, - registry.clone(), + // ── Register index tables using real openers ───────────────────── + // In production, the QuickwitSchemaProvider catalog does this lazily. + // Here we register manually to test with StorageSplitOpener. + let storage = storage_resolver + .resolve(&sandbox.index_uid().to_string().parse::() + .unwrap_or_else(|_| { + // Fallback: use the ram:// URI pattern from TestSandbox + format!("ram://quickwit-test-indexes/test-df-index") + .parse() + .unwrap() + })) + .await + .unwrap_or_else(|_| { + // Use the sandbox's storage directly + sandbox.storage() + }); + + // List splits from the metastore and register per-split tables. 
+ use quickwit_metastore::{ + ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitState, + }; + use quickwit_proto::metastore::{ListSplitsRequest, MetastoreService}; + + let query = ListSplitsQuery::for_index(index_uid.clone()) + .with_split_state(SplitState::Published); + let request = ListSplitsRequest::try_from_list_splits_query(&query).unwrap(); + let splits = metastore + .clone() + .list_splits(request) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + + assert!(splits.len() >= 2, "expected at least 2 splits, got {}", splits.len()); + + for (i, split_meta) in splits.iter().enumerate() { + let opener = Arc::new(StorageSplitOpener::new( + split_meta.split_id.clone(), + tantivy_schema.clone(), + vec![], // segment sizes discovered at open time + searcher_context.clone(), + storage.clone(), + split_meta.footer_offsets.start, + split_meta.footer_offsets.end, )); + let prefix = format!("s{}", i + 1); let o: Arc = opener; ctx.register_table( @@ -144,38 +220,58 @@ async fn test_distributed_query_via_searcher_pool() { .unwrap(); } - // ── Execute distributed full-text join ─────────────────────────── - let sql = "\ - SELECT id, price FROM ( \ - SELECT s1_f.id, s1_f.price \ - FROM s1_inv \ - JOIN s1_f ON s1_f._doc_id = s1_inv._doc_id AND s1_f._segment_ord = s1_inv._segment_ord \ - WHERE full_text(s1_inv.category, 'electronics') \ - UNION ALL \ - SELECT s2_f.id, s2_f.price \ - FROM s2_inv \ - JOIN s2_f ON s2_f._doc_id = s2_inv._doc_id AND s2_f._segment_ord = s2_inv._segment_ord \ - WHERE full_text(s2_inv.category, 'electronics') \ - UNION ALL \ - SELECT s3_f.id, s3_f.price \ - FROM s3_inv \ - JOIN s3_f ON s3_f._doc_id = s3_inv._doc_id AND s3_f._segment_ord = s3_inv._segment_ord \ - WHERE full_text(s3_inv.category, 'electronics') \ - ) ORDER BY id"; - - let df: DataFrame = ctx.sql(sql).await.unwrap(); + // ── Build and execute a distributed query ──────────────────────── + let num_splits = splits.len(); + let union_parts: 
Vec = (1..=num_splits) + .map(|i| { + format!( + "SELECT s{i}_f.id, s{i}_f.price \ + FROM s{i}_inv \ + JOIN s{i}_f ON s{i}_f._doc_id = s{i}_inv._doc_id \ + AND s{i}_f._segment_ord = s{i}_inv._segment_ord \ + WHERE full_text(s{i}_inv.category, 'electronics')" + ) + }) + .collect(); + let sql = format!( + "SELECT id, price FROM ({}) ORDER BY id", + union_parts.join(" UNION ALL ") + ); + + println!("SQL: {sql}"); + + let df: DataFrame = ctx.sql(&sql).await.unwrap(); let plan = df.create_physical_plan().await.unwrap(); let plan_str = display_plan_ascii(plan.as_ref(), false); - println!("=== Distributed plan (3 searcher nodes, same port) ===\n{plan_str}\n"); + println!("=== Distributed plan (real splits, real storage) ===\n{plan_str}\n"); + + // Verify the plan structure. + assert!( + plan_str.contains("CollectLeft"), + "plan should use CollectLeft join mode (no hash repartition)\n\n{plan_str}" + ); + assert!( + plan_str.contains("InvertedIndexDataSource"), + "plan should contain InvertedIndexDataSource\n\n{plan_str}" + ); + assert!( + plan_str.contains("FastFieldDataSource"), + "plan should contain FastFieldDataSource\n\n{plan_str}" + ); + // Execute the distributed query. 
let stream = execute_stream(plan, ctx.task_ctx()).unwrap(); let batches: Vec = stream.try_collect().await.unwrap(); let batch = collect_batches(&batches); - // electronics: {1, 2} (split-1) + {6} (split-3) = 3 rows - assert_eq!(batch.num_rows(), 3); + // "electronics" should match ids {1, 2} + assert_eq!(batch.num_rows(), 2, "expected 2 electronics docs"); let ids = batch.column(0).as_primitive::(); let mut id_values: Vec = (0..batch.num_rows()).map(|i| ids.value(i)).collect(); id_values.sort(); - assert_eq!(id_values, vec![1, 2, 6]); + assert_eq!(id_values, vec![1, 2]); + + println!("full production path: OK"); + + sandbox.assert_quit().await; } diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index 384bd66764a..6b72236738b 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -65,6 +65,7 @@ quickwit-datafusion = { workspace = true } arrow-flight = "57" datafusion = "52" datafusion-distributed = { git = "https://github.com/datafusion-contrib/datafusion-distributed" } +tantivy-datafusion = { path = "/Users/alex.bianchi/oss/tantivy/.worktrees/bianchi/tantivydf/tantivy-datafusion" } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 89bb076e0ef..7ffc17a4960 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -659,8 +659,18 @@ pub async fn serve_quickwit( // The Flight service runs on the same gRPC port as all other services. // Workers discover each other via the same SearcherPool (Chitchat). 
let datafusion_registry = std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()); + let df_registry = datafusion_registry.clone(); + let datafusion_opener_factory: tantivy_datafusion::OpenerFactory = + std::sync::Arc::new(move |meta: tantivy_datafusion::OpenerMetadata| { + std::sync::Arc::new(quickwit_datafusion::SplitIndexOpener::new( + meta.identifier, + df_registry.clone(), + meta.tantivy_schema, + meta.segment_sizes, + )) as std::sync::Arc + }); let datafusion_flight_service = - quickwit_datafusion::build_flight_service(datafusion_registry.clone(), searcher_pool.clone()); + quickwit_datafusion::build_flight_service(datafusion_opener_factory, searcher_pool.clone()); let datafusion_session_builder = quickwit_datafusion::QuickwitSessionBuilder::new( metastore_through_control_plane.clone(), searcher_pool, diff --git a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 35eb9e37d03..dc8227fa2e3 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -892,10 +892,16 @@ mod tests { std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()), ) }, - datafusion_flight_service: quickwit_datafusion::build_flight_service( - std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()), - quickwit_search::SearcherPool::default(), - ), + datafusion_flight_service: { + let reg = std::sync::Arc::new(quickwit_datafusion::SplitRegistry::new()); + let factory: tantivy_datafusion::OpenerFactory = + std::sync::Arc::new(move |meta: tantivy_datafusion::OpenerMetadata| { + std::sync::Arc::new(quickwit_datafusion::SplitIndexOpener::new( + meta.identifier, reg.clone(), meta.tantivy_schema, meta.segment_sizes, + )) as std::sync::Arc + }); + quickwit_datafusion::build_flight_service(factory, quickwit_search::SearcherPool::default()) + }, jaeger_service_opt: None, env_filter_reload_fn: crate::do_nothing_env_filter_reload_fn(), }; From 59232498eb8632ec70a8e542538b69c631635ab5 Mon Sep 17 00:00:00 2001 From: Alex 
Bianchi Date: Sat, 14 Feb 2026 15:20:19 -0500 Subject: [PATCH 10/19] feat: add warmup for storage-backed splits, fix segment sizes Adds warmup module to tantivy-datafusion that pre-fetches inverted index and fast field data before tantivy executes synchronously. Tantivy does sync I/O in its query path. Storage-backed directories (S3/GCS) only support async reads. The warmup runs after opener.open() and before generate_batch(): - InvertedIndexDataSource: warms term dictionaries + posting lists - FastFieldDataSource: warms fast field column file slices Also fixes segment sizes: callers now pass split_meta.num_docs as the segment size (1 segment per split), so the provider creates the correct partition count. No more empty vec![] fallback. The serve_integration test now passes end-to-end: - TestSandbox creates real indexed splits on RAM storage - StorageSplitOpener calls open_index_with_caches - warmup pre-fetches inverted index + fast field data - Distributed full-text join query returns correct results Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/Cargo.lock | 1 + quickwit/quickwit-datafusion/src/catalog.rs | 2 +- quickwit/quickwit-datafusion/src/split_opener.rs | 8 ++++---- quickwit/quickwit-datafusion/tests/serve_integration.rs | 9 ++++++++- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index b83bcc98781..774ed63c8d6 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -10909,6 +10909,7 @@ dependencies = [ "serde", "serde_json", "tantivy", + "tokio", ] [[package]] diff --git a/quickwit/quickwit-datafusion/src/catalog.rs b/quickwit/quickwit-datafusion/src/catalog.rs index 93d641274aa..961a125e4d7 100644 --- a/quickwit/quickwit-datafusion/src/catalog.rs +++ b/quickwit/quickwit-datafusion/src/catalog.rs @@ -97,7 +97,7 @@ impl QuickwitSchemaProvider { StorageSplitOpener::new( split_meta.split_id.clone(), schema_for_factory.clone(), - vec![], // segment sizes discovered at open time + 
vec![split_meta.num_docs as u32], // 1 segment per split searcher_context.clone(), storage.clone(), split_meta.footer_offsets.start, diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs index 53af04d1830..e0894c74e90 100644 --- a/quickwit/quickwit-datafusion/src/split_opener.rs +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -183,6 +183,10 @@ impl IndexOpener for StorageSplitOpener { (self.footer_offsets.split_footer_start, self.footer_offsets.split_footer_end) } + fn segment_sizes(&self) -> Vec { + self.segment_sizes.clone() + } + async fn open(&self) -> Result { // Use an unbounded byte range cache so that tantivy can do // synchronous reads on the storage-backed directory. Without @@ -207,10 +211,6 @@ impl IndexOpener for StorageSplitOpener { self.tantivy_schema.clone() } - fn segment_sizes(&self) -> Vec { - self.segment_sizes.clone() - } - fn identifier(&self) -> &str { &self.split_id } diff --git a/quickwit/quickwit-datafusion/tests/serve_integration.rs b/quickwit/quickwit-datafusion/tests/serve_integration.rs index 7e46fddc237..eb64ea29005 100644 --- a/quickwit/quickwit-datafusion/tests/serve_integration.rs +++ b/quickwit/quickwit-datafusion/tests/serve_integration.rs @@ -196,10 +196,12 @@ async fn test_full_production_path() { assert!(splits.len() >= 2, "expected at least 2 splits, got {}", splits.len()); for (i, split_meta) in splits.iter().enumerate() { + // Use num_docs as the segment size — each split is typically + // one tantivy Index with 1 segment after merging. let opener = Arc::new(StorageSplitOpener::new( split_meta.split_id.clone(), tantivy_schema.clone(), - vec![], // segment sizes discovered at open time + vec![split_meta.num_docs as u32], searcher_context.clone(), storage.clone(), split_meta.footer_offsets.start, @@ -262,6 +264,11 @@ async fn test_full_production_path() { // Execute the distributed query. 
let stream = execute_stream(plan, ctx.task_ctx()).unwrap(); let batches: Vec = stream.try_collect().await.unwrap(); + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + println!("total rows: {total_rows}, batches: {}", batches.len()); + assert!(total_rows > 0, "expected results but got 0 rows"); + let batch = collect_batches(&batches); // "electronics" should match ids {1, 2} From 37a228f4b120752378906522dafbaecda25d99d3 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 15:29:16 -0500 Subject: [PATCH 11/19] fix: cache opened Index per split, warmup once not twice StorageSplitOpener now caches the opened+warmed Index via OnceCell. When both InvertedIndexDataSource and FastFieldDataSource for the same split call opener.open(), only the first call downloads from storage and runs warmup. The second call returns the cached Index. Warmup moved from the individual DataSources into the opener: - No more warmup in InvertedIndexDataSource::open() - No more warmup in FastFieldDataSource::open() - Single warmup_all() call in StorageSplitOpener::open() - One download, one warmup per split regardless of how many DataSources reference it Co-Authored-By: Claude Opus 4.6 (1M context) --- .../quickwit-datafusion/src/split_opener.rs | 47 ++++++++++++++++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs index e0894c74e90..d75c75d77b0 100644 --- a/quickwit/quickwit-datafusion/src/split_opener.rs +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -12,6 +12,7 @@ use quickwit_search::SearcherContext; use quickwit_storage::Storage; use tantivy::Index; use tantivy_datafusion::IndexOpener; +use tokio::sync::OnceCell; /// Registry of opened tantivy indexes, keyed by split ID. /// Used for integration tests. Production uses [`StorageSplitOpener`]. 
@@ -119,9 +120,13 @@ impl IndexOpener for SplitIndexOpener { /// to download the split bundle from S3/GCS/local storage, warm the /// footer cache + fast field cache, and return an opened tantivy `Index`. /// +/// The opened Index is cached: the first `open()` call downloads, +/// warms, and caches. Subsequent calls return the cached Index. +/// This ensures that the inverted index and fast field DataSources +/// for the same split share one download and one warmup. +/// /// Planning-time metadata (schema, segment sizes) is stored inline — /// no I/O during plan construction. -#[derive(Clone)] pub struct StorageSplitOpener { split_id: String, tantivy_schema: tantivy::schema::Schema, @@ -130,6 +135,26 @@ pub struct StorageSplitOpener { storage: Arc, footer_offsets: SplitIdAndFooterOffsets, tokenizer_manager: Option, + /// Cache: opened + warmed Index. Shared across all DataSources + /// for this split. + cached_index: Arc>, +} + +impl Clone for StorageSplitOpener { + fn clone(&self) -> Self { + Self { + split_id: self.split_id.clone(), + tantivy_schema: self.tantivy_schema.clone(), + segment_sizes: self.segment_sizes.clone(), + searcher_context: self.searcher_context.clone(), + storage: self.storage.clone(), + footer_offsets: self.footer_offsets.clone(), + tokenizer_manager: self.tokenizer_manager.clone(), + // Share the same cache across clones — so inv + ff + // DataSources for the same split share one open + warmup. + cached_index: self.cached_index.clone(), + } + } } impl StorageSplitOpener { @@ -156,13 +181,11 @@ impl StorageSplitOpener { storage, footer_offsets, tokenizer_manager: None, + cached_index: Arc::new(OnceCell::new()), } } /// Set the tokenizer manager from the index's doc mapper. - /// - /// Required for full-text queries on fields with custom tokenizers. - /// Without it, tantivy falls back to the default tokenizer. 
pub fn with_tokenizer_manager(mut self, tm: TokenizerManager) -> Self { self.tokenizer_manager = Some(tm); self @@ -188,9 +211,13 @@ impl IndexOpener for StorageSplitOpener { } async fn open(&self) -> Result { - // Use an unbounded byte range cache so that tantivy can do - // synchronous reads on the storage-backed directory. Without - // this, StorageDirectory errors on sync reads. + // Return cached Index if already opened. This ensures the + // inverted index and fast field DataSources for the same split + // share one download, one warmup. + if let Some(index) = self.cached_index.get() { + return Ok(index.clone()); + } + let byte_range_cache = quickwit_storage::ByteRangeCache::with_infinite_capacity( &quickwit_storage::STORAGE_METRICS.shortlived_cache, ); @@ -204,6 +231,12 @@ impl IndexOpener for StorageSplitOpener { .await .map_err(|e| DataFusionError::Execution(format!("open split {}: {e}", self.split_id)))?; + // Warm up all data so tantivy's sync reads hit cache. + tantivy_datafusion::warmup::warmup_all(&index).await?; + + // Cache for subsequent calls (inv + ff DataSources share this). + let _ = self.cached_index.set(index.clone()); + Ok(index) } From 6adb196332b4e5f20671108a1d9d76ad196f128d Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Sat, 14 Feb 2026 15:38:56 -0500 Subject: [PATCH 12/19] fix: targeted warmup per DataSource, no over-fetching Each DataSource now warms only the fields it actually needs: - InvertedIndexDataSource: warms term dict + postings for only the queried text fields (from raw_queries field names) - FastFieldDataSource: warms only the projected fast field columns (from projected_schema field names) The opener no longer calls warmup_all(). The ByteRangeCache is shared across DataSources via the cached Index (OnceCell), so data fetched by one DataSource is available to the other without re-downloading. No double I/O, no over-fetching. Adds warmup_fast_fields_by_name() for targeted fast field warmup. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/src/split_opener.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/quickwit/quickwit-datafusion/src/split_opener.rs b/quickwit/quickwit-datafusion/src/split_opener.rs index d75c75d77b0..4d210a8e003 100644 --- a/quickwit/quickwit-datafusion/src/split_opener.rs +++ b/quickwit/quickwit-datafusion/src/split_opener.rs @@ -231,8 +231,9 @@ impl IndexOpener for StorageSplitOpener { .await .map_err(|e| DataFusionError::Execution(format!("open split {}: {e}", self.split_id)))?; - // Warm up all data so tantivy's sync reads hit cache. - tantivy_datafusion::warmup::warmup_all(&index).await?; + // No warmup here — each DataSource warms only the fields it + // needs on first poll. The ByteRangeCache is shared across all + // DataSources via the cached Index, so no data is fetched twice. // Cache for subsequent calls (inv + ff DataSources share this). let _ = self.cached_index.set(index.clone()); From 15491d7fa63454053867dfb42a296854ced793cd Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 07:48:48 -0500 Subject: [PATCH 13/19] feat: add demo binary showing all three query paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cargo run -p quickwit-datafusion --example demo Three demos with real indexed data (TestSandbox): 1. Decomposed join plan (inv ⋈ f) — distributed via Flight workers, shows full DistributedExec plan with stage decomposition 2. Unified table provider — hides join complexity, single table SQL, uses UnifiedTantivyTableProvider from tantivy-df 3. SearchRequest → DataFrame — ES-compatible query path via build_search_plan(), translates QueryAst to DataFusion plan, uses LeftSemi join to avoid duplicate columns Also fixes build_split_plan to use LeftSemi join instead of Inner join to avoid duplicate _doc_id/_segment_ord columns in the output. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/examples/demo.rs | 356 ++++++++++++++++++ .../src/query_translator.rs | 14 +- 2 files changed, 365 insertions(+), 5 deletions(-) create mode 100644 quickwit/quickwit-datafusion/examples/demo.rs diff --git a/quickwit/quickwit-datafusion/examples/demo.rs b/quickwit/quickwit-datafusion/examples/demo.rs new file mode 100644 index 00000000000..4311a9e6d66 --- /dev/null +++ b/quickwit/quickwit-datafusion/examples/demo.rs @@ -0,0 +1,356 @@ +//! Demo: Distributed DataFusion execution over Quickwit splits. +//! +//! Shows three query paths: +//! 1. Decomposed join plan (inv ⋈ f) — full visibility into tantivy nodes +//! 2. Unified table provider — hides join complexity, single table +//! 3. SearchRequest → DataFrame — Elasticsearch-compatible query path +//! +//! Run: cargo run -p quickwit-datafusion --example demo + +use std::sync::Arc; + +use datafusion::physical_plan::execute_stream; +use datafusion::prelude::*; +use datafusion_distributed::display_plan_ascii; +use futures::TryStreamExt; +use quickwit_config::SearcherConfig; +use quickwit_indexing::TestSandbox; +use quickwit_search::{MockSearchService, SearchServiceClient, SearcherPool, SearcherContext}; +use tantivy_datafusion::{ + full_text_udf, IndexOpener, TantivyInvertedIndexProvider, TantivyTableProvider, + UnifiedTantivyTableProvider, +}; +use tokio::net::TcpListener; + +use quickwit_datafusion::query_translator::{build_search_plan, query_ast_to_expr}; +use quickwit_datafusion::session::QuickwitSessionBuilder; +use quickwit_datafusion::split_opener::{SplitRegistry, StorageSplitOpener}; +use quickwit_datafusion::build_flight_service; +use quickwit_metastore::{ + ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, SplitState, +}; +use quickwit_proto::metastore::{ListSplitsRequest, MetastoreService}; + +const DOC_MAPPING: &str = r#" +field_mappings: + - name: id + type: u64 + fast: true + stored: true + - name: category 
+ type: text + tokenizer: default + fast: true + stored: true + - name: price + type: f64 + fast: true + - name: score + type: i64 + fast: true +"#; + +async fn start_worker( + opener_factory: tantivy_datafusion::OpenerFactory, + pool: SearcherPool, +) -> std::net::SocketAddr { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener); + let flight = build_flight_service(opener_factory, pool); + tokio::spawn(async move { + tonic::transport::Server::builder() + .add_service(flight) + .serve_with_incoming(incoming) + .await + .unwrap(); + }); + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + addr +} + +fn section(title: &str) { + println!("\n{}", "=".repeat(60)); + println!(" {title}"); + println!("{}\n", "=".repeat(60)); +} + +#[tokio::main(flavor = "multi_thread", worker_threads = 4)] +async fn main() -> anyhow::Result<()> { + // ── Create index with real splits ──────────────────────────────── + println!("📦 Creating index with TestSandbox..."); + let sandbox = TestSandbox::create("demo-logs", DOC_MAPPING, "{}", &["category"]).await?; + + sandbox + .add_documents(vec![ + serde_json::json!({"id": 1, "category": "electronics", "price": 9.99, "score": 85}), + serde_json::json!({"id": 2, "category": "electronics", "price": 24.99, "score": 92}), + serde_json::json!({"id": 3, "category": "books", "price": 14.99, "score": 78}), + serde_json::json!({"id": 4, "category": "books", "price": 7.99, "score": 65}), + ]) + .await?; + + sandbox + .add_documents(vec![ + serde_json::json!({"id": 5, "category": "clothing", "price": 49.99, "score": 88}), + serde_json::json!({"id": 6, "category": "electronics", "price": 199.99, "score": 95}), + serde_json::json!({"id": 7, "category": "books", "price": 29.99, "score": 91}), + ]) + .await?; + + let metastore = sandbox.metastore(); + let storage = sandbox.storage(); + let doc_mapper = 
sandbox.doc_mapper(); + let index_uid = sandbox.index_uid(); + let tantivy_schema = doc_mapper.schema(); + let searcher_context = Arc::new(SearcherContext::new(SearcherConfig::default(), None)); + + // List splits. + let query = ListSplitsQuery::for_index(index_uid.clone()) + .with_split_state(SplitState::Published); + let request = ListSplitsRequest::try_from_list_splits_query(&query)?; + let splits = metastore + .clone() + .list_splits(request) + .await? + .collect_splits_metadata() + .await?; + + println!( + " {} splits created ({} total docs)\n", + splits.len(), + splits.iter().map(|s| s.num_docs).sum::() + ); + + // ── Build opener factory + workers ─────────────────────────────── + let sc = searcher_context.clone(); + let st = storage.clone(); + let ts = tantivy_schema.clone(); + let opener_factory: tantivy_datafusion::OpenerFactory = + Arc::new(move |meta: tantivy_datafusion::OpenerMetadata| { + Arc::new(StorageSplitOpener::new( + meta.identifier, + ts.clone(), + meta.segment_sizes, + sc.clone(), + st.clone(), + meta.footer_start, + meta.footer_end, + )) as Arc + }); + + let searcher_pool = SearcherPool::default(); + let addr1 = start_worker(opener_factory.clone(), searcher_pool.clone()).await; + let addr2 = start_worker(opener_factory.clone(), searcher_pool.clone()).await; + for addr in [addr1, addr2] { + searcher_pool.insert( + addr, + SearchServiceClient::from_service(Arc::new(MockSearchService::new()), addr), + ); + } + println!("🚀 Started 2 Flight workers on :{}, :{}", addr1.port(), addr2.port()); + + let session_builder = QuickwitSessionBuilder::new( + metastore.clone(), + searcher_pool.clone(), + Arc::new(SplitRegistry::new()), + ); + + // ═══════════════════════════════════════════════════════════════ + // DEMO 1: Decomposed join plan (inv ⋈ f) + // ═══════════════════════════════════════════════════════════════ + section("DEMO 1: Decomposed Join Plan (inv ⋈ f)"); + { + let ctx = session_builder.build_session(); + ctx.register_udf(full_text_udf()); + 
+ for (i, split) in splits.iter().enumerate() { + let opener = Arc::new(StorageSplitOpener::new( + split.split_id.clone(), + tantivy_schema.clone(), + vec![split.num_docs as u32], + searcher_context.clone(), + storage.clone(), + split.footer_offsets.start, + split.footer_offsets.end, + )); + let prefix = format!("s{}", i + 1); + let o: Arc = opener; + ctx.register_table( + &format!("{prefix}_f"), + Arc::new(TantivyTableProvider::from_opener(o.clone())), + )?; + ctx.register_table( + &format!("{prefix}_inv"), + Arc::new(TantivyInvertedIndexProvider::from_opener(o)), + )?; + } + + let n = splits.len(); + let union_parts: Vec = (1..=n) + .map(|i| format!( + "SELECT s{i}_f.id, s{i}_f.price \ + FROM s{i}_inv \ + JOIN s{i}_f ON s{i}_f._doc_id = s{i}_inv._doc_id \ + AND s{i}_f._segment_ord = s{i}_inv._segment_ord \ + WHERE full_text(s{i}_inv.category, 'electronics')" + )) + .collect(); + let sql = format!( + "SELECT id, price FROM ({}) ORDER BY id", + union_parts.join(" UNION ALL ") + ); + + println!("SQL:\n {sql}\n"); + + let df: DataFrame = ctx.sql(&sql).await?; + let plan = df.create_physical_plan().await?; + println!("Distributed Plan:\n{}\n", display_plan_ascii(plan.as_ref(), false)); + + let stream = execute_stream(plan, ctx.task_ctx())?; + let batches: Vec<_> = stream.try_collect().await?; + let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; + println!("Results:\n{formatted}\n"); + } + + // ═══════════════════════════════════════════════════════════════ + // DEMO 2: Unified table provider (hides join complexity) + // ═══════════════════════════════════════════════════════════════ + section("DEMO 2: Unified Table Provider"); + { + let ctx = SessionContext::new_with_config( + SessionConfig::new().with_target_partitions(1), + ); + ctx.register_udf(full_text_udf()); + + // Register each split as a unified table, then UNION ALL. 
+ for (i, split) in splits.iter().enumerate() { + let opener = Arc::new(StorageSplitOpener::new( + split.split_id.clone(), + tantivy_schema.clone(), + vec![split.num_docs as u32], + searcher_context.clone(), + storage.clone(), + split.footer_offsets.start, + split.footer_offsets.end, + )); + ctx.register_table( + &format!("s{}", i + 1), + Arc::new(UnifiedTantivyTableProvider::from_opener(opener)), + )?; + } + + let n = splits.len(); + let union_parts: Vec = (1..=n) + .map(|i| format!( + "SELECT id, price FROM s{i} WHERE full_text(category, 'electronics')" + )) + .collect(); + let sql = format!( + "SELECT id, price FROM ({}) ORDER BY id", + union_parts.join(" UNION ALL ") + ); + + println!("SQL:\n {sql}\n"); + + let df: DataFrame = ctx.sql(&sql).await?; + let batches: Vec<_> = df.collect().await?; + let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; + println!("Results:\n{formatted}\n"); + } + + // ═══════════════════════════════════════════════════════════════ + // DEMO 3: SearchRequest → DataFrame (ES-compatible path) + // ═══════════════════════════════════════════════════════════════ + section("DEMO 3: SearchRequest → DataFrame"); + { + let ctx = SessionContext::new_with_config( + SessionConfig::new().with_target_partitions(1), + ); + ctx.register_udf(full_text_udf()); + + // Build a SearchRequest like Quickwit's REST handler would. 
+ let search_request = quickwit_proto::search::SearchRequest { + query_ast: serde_json::to_string(&serde_json::json!({ + "type": "full_text", + "field": "category", + "text": "books", + "params": { + "mode": { "type": "phrase_fallback_to_intersection" } + }, + "lenient": false + }))?, + max_hits: 10, + sort_fields: vec![quickwit_proto::search::SortField { + field_name: "price".to_string(), + sort_order: quickwit_proto::search::SortOrder::Desc as i32, + ..Default::default() + }], + ..Default::default() + }; + + println!("SearchRequest:"); + println!(" query: full_text(category, 'books')"); + println!(" sort: price DESC"); + println!(" max_hits: 10\n"); + + // For local execution (not distributed), use in-memory openers + // that pre-open the indexes. The StorageSplitOpener + warmup + // path is demonstrated in Demo 1 via the distributed workers. + let demo3_registry = Arc::new(SplitRegistry::new()); + for split in &splits { + let split_opener = StorageSplitOpener::new( + split.split_id.clone(), + tantivy_schema.clone(), + vec![split.num_docs as u32], + searcher_context.clone(), + storage.clone(), + split.footer_offsets.start, + split.footer_offsets.end, + ); + // Pre-open and cache in the registry for local execution. 
+ let index = IndexOpener::open(&split_opener).await + .expect("failed to open split for demo 3"); + demo3_registry.insert(split.split_id.clone(), index); + } + let reg = demo3_registry.clone(); + let ts2 = tantivy_schema.clone(); + let opener_factory_for_plan: quickwit_datafusion::OpenerFactory = + Arc::new(move |meta: &quickwit_metastore::SplitMetadata| { + Arc::new(quickwit_datafusion::split_opener::SplitIndexOpener::new( + meta.split_id.clone(), + reg.clone(), + ts2.clone(), + vec![meta.num_docs as u32], + )) as Arc + }); + + println!("Building search plan..."); + match build_search_plan( + &ctx, + &splits, + &opener_factory_for_plan, + &search_request, + None, + ) { + Ok(df) => { + println!("Plan built, executing..."); + match df.collect().await { + Ok(batches) => { + let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; + println!("Results:\n{formatted}\n"); + } + Err(e) => println!("Execution error: {e}\n"), + } + } + Err(e) => println!("Plan error: {e}\n"), + } + } + + println!("✅ All demos complete."); + + // Exit immediately — TestSandbox's Universe panics on drop if + // actors are still running, which is expected for a demo binary. + std::process::exit(0); + Ok(()) +} diff --git a/quickwit/quickwit-datafusion/src/query_translator.rs b/quickwit/quickwit-datafusion/src/query_translator.rs index c9901e93954..186bb5caeaf 100644 --- a/quickwit/quickwit-datafusion/src/query_translator.rs +++ b/quickwit/quickwit-datafusion/src/query_translator.rs @@ -202,14 +202,18 @@ fn build_split_plan( // Apply the full-text filter on the inverted index side. let df_inv = df_inv.filter(query_filter.clone())?; - // Join on (_doc_id, _segment_ord). - df_inv.join( - df_f, - JoinType::Inner, + // Use semi join — keeps only left columns where join matches. + // Then we don't get duplicate columns. But we want the fast field + // columns, not the inv columns. So we semi-join f against inv. 
+ let df_f_filtered = df_f.join( + df_inv, + JoinType::LeftSemi, &["_doc_id", "_segment_ord"], &["_doc_id", "_segment_ord"], None, - ) + )?; + + Ok(df_f_filtered) } /// Check if a QueryAst contains any full-text query nodes. From 0cb07e5323420932400edfbaca88706434793a69 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 08:01:26 -0500 Subject: [PATCH 14/19] feat: QuickwitTableProvider uses UnifiedTantivyTableProvider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QuickwitTableProvider now creates UnifiedTantivyTableProvider per split instead of plain TantivyTableProvider. This means users can query a single table name and get full-text search, fast fields, scores, and document retrieval — the inv ⋈ f ⋈ d joins are hidden. Before: SELECT * FROM logs → only fast fields After: SELECT * FROM logs WHERE full_text(category, 'books') → full-text search + fast fields + _score + _document Co-Authored-By: Claude Opus 4.6 (1M context) --- .../quickwit-datafusion/src/table_provider.rs | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/quickwit/quickwit-datafusion/src/table_provider.rs b/quickwit/quickwit-datafusion/src/table_provider.rs index 8af44d87f3d..59337578599 100644 --- a/quickwit/quickwit-datafusion/src/table_provider.rs +++ b/quickwit/quickwit-datafusion/src/table_provider.rs @@ -1,7 +1,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::datatypes::SchemaRef; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use async_trait::async_trait; use datafusion::catalog::Session; use datafusion::common::Result; @@ -18,7 +18,7 @@ use quickwit_proto::metastore::{ ListSplitsRequest, MetastoreService, MetastoreServiceClient, }; use quickwit_proto::types::IndexUid; -use tantivy_datafusion::{IndexOpener, TantivyTableProvider}; +use tantivy_datafusion::{IndexOpener, UnifiedTantivyTableProvider}; use tokio::sync::Mutex; /// Factory that creates an [`IndexOpener`] from split 
metadata. @@ -32,7 +32,12 @@ pub type OpenerFactory = /// /// At scan time, queries the metastore for published splits, creates /// an [`IndexOpener`] per split via the provided factory, builds -/// per-split tantivy-df providers, and unions them. +/// per-split [`UnifiedTantivyTableProvider`]s (which internally +/// compose inv ⋈ f ⋈ d joins), and unions them. +/// +/// Users query a single table name (e.g. `SELECT * FROM logs WHERE +/// full_text(category, 'books')`) — the split decomposition and +/// joins are hidden. pub struct QuickwitTableProvider { index_uid: IndexUid, metastore: Mutex, @@ -47,7 +52,12 @@ impl QuickwitTableProvider { opener_factory: OpenerFactory, tantivy_schema: &tantivy::schema::Schema, ) -> Self { - let arrow_schema = tantivy_datafusion::tantivy_schema_to_arrow(tantivy_schema); + // Build unified schema: fast fields + _score + _document. + let ff_schema = tantivy_datafusion::tantivy_schema_to_arrow(tantivy_schema); + let mut fields: Vec> = ff_schema.fields().to_vec(); + fields.push(Arc::new(Field::new("_score", DataType::Float32, true))); + fields.push(Arc::new(Field::new("_document", DataType::Utf8, false))); + let arrow_schema = Arc::new(Schema::new(fields)); Self { index_uid, metastore: Mutex::new(metastore), @@ -119,7 +129,7 @@ impl TableProvider for QuickwitTableProvider { let mut execs = Vec::with_capacity(splits.len()); for split_meta in &splits { let opener = (self.opener_factory)(split_meta); - let provider = TantivyTableProvider::from_opener(opener); + let provider = UnifiedTantivyTableProvider::from_opener(opener); let exec = provider.scan(state, projection, filters, limit).await?; execs.push(exec); } From 7c8a5ec2914a1ee49d16b26032897fecd08e66a1 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 08:11:05 -0500 Subject: [PATCH 15/19] fix: print physical plans in demo 2 and demo 3 Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/examples/demo.rs | 27 +++++++++++++------ 1 file 
changed, 19 insertions(+), 8 deletions(-) diff --git a/quickwit/quickwit-datafusion/examples/demo.rs b/quickwit/quickwit-datafusion/examples/demo.rs index 4311a9e6d66..5ba11b867ea 100644 --- a/quickwit/quickwit-datafusion/examples/demo.rs +++ b/quickwit/quickwit-datafusion/examples/demo.rs @@ -254,7 +254,11 @@ async fn main() -> anyhow::Result<()> { println!("SQL:\n {sql}\n"); let df: DataFrame = ctx.sql(&sql).await?; - let batches: Vec<_> = df.collect().await?; + let plan = df.create_physical_plan().await?; + println!("Physical Plan:\n{}\n", + datafusion::physical_plan::displayable(plan.as_ref()).indent(true)); + let batches: Vec<_> = execute_stream(plan, ctx.task_ctx())? + .try_collect().await?; let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; println!("Results:\n{formatted}\n"); } @@ -334,16 +338,23 @@ async fn main() -> anyhow::Result<()> { None, ) { Ok(df) => { - println!("Plan built, executing..."); - match df.collect().await { - Ok(batches) => { - let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; - println!("Results:\n{formatted}\n"); + match df.create_physical_plan().await { + Ok(plan) => { + println!("Physical Plan:\n{}\n", + datafusion::physical_plan::displayable(plan.as_ref()).indent(true)); + match execute_stream(plan, ctx.task_ctx()) { + Ok(stream) => { + let batches: Vec<_> = stream.try_collect().await?; + let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; + println!("Results:\n{formatted}\n"); + } + Err(e) => println!("Execution error: {e}\n"), + } } - Err(e) => println!("Execution error: {e}\n"), + Err(e) => println!("Plan error: {e}\n"), } } - Err(e) => println!("Plan error: {e}\n"), + Err(e) => println!("Build error: {e}\n"), } } From 23697aa611f303b649d969584b64c7c22e38048e Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 08:15:39 -0500 Subject: [PATCH 16/19] fix: resolve all cargo warnings - Add doc 
comments for quickwit-search::leaf module and map_bound fn - Remove unused import Sort from query_translator - Remove unused import query_ast_to_expr from demo - Suppress unreachable_code warning on demo process::exit Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/examples/demo.rs | 9 ++++++--- quickwit/quickwit-datafusion/src/query_translator.rs | 1 - quickwit/quickwit-search/src/leaf.rs | 2 +- quickwit/quickwit-search/src/lib.rs | 1 + 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/quickwit/quickwit-datafusion/examples/demo.rs b/quickwit/quickwit-datafusion/examples/demo.rs index 5ba11b867ea..65d56b4c8e7 100644 --- a/quickwit/quickwit-datafusion/examples/demo.rs +++ b/quickwit/quickwit-datafusion/examples/demo.rs @@ -22,7 +22,7 @@ use tantivy_datafusion::{ }; use tokio::net::TcpListener; -use quickwit_datafusion::query_translator::{build_search_plan, query_ast_to_expr}; +use quickwit_datafusion::query_translator::build_search_plan; use quickwit_datafusion::session::QuickwitSessionBuilder; use quickwit_datafusion::split_opener::{SplitRegistry, StorageSplitOpener}; use quickwit_datafusion::build_flight_service; @@ -362,6 +362,9 @@ async fn main() -> anyhow::Result<()> { // Exit immediately — TestSandbox's Universe panics on drop if // actors are still running, which is expected for a demo binary. 
- std::process::exit(0); - Ok(()) + #[allow(unreachable_code)] + { + std::process::exit(0); + Ok(()) + } } diff --git a/quickwit/quickwit-datafusion/src/query_translator.rs b/quickwit/quickwit-datafusion/src/query_translator.rs index 186bb5caeaf..c5058b34d4a 100644 --- a/quickwit/quickwit-datafusion/src/query_translator.rs +++ b/quickwit/quickwit-datafusion/src/query_translator.rs @@ -11,7 +11,6 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::common::{Result, ScalarValue}; use datafusion::error::DataFusionError; -use datafusion::logical_expr::expr::Sort; use datafusion::logical_expr::{col, lit, Expr, JoinType, SortExpr}; use datafusion::prelude::{DataFrame, SessionContext}; use quickwit_metastore::SplitMetadata; diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 4ba3488ffda..1b201ba211f 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -685,7 +685,7 @@ fn visit_aggregation_mut( modified_something } -// equivalent to Bound::map, which is unstable +/// Maps a `Bound` to `Bound` (equivalent to unstable `Bound::map`). pub fn map_bound(bound: Bound, f: impl FnOnce(T) -> U) -> Bound { use Bound::*; match bound { diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index 213c1a6a90e..61a7d3e3865 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -23,6 +23,7 @@ mod collector; mod error; mod fetch_docs; mod find_trace_ids_collector; +/// Split opening and leaf search execution. pub mod leaf; mod leaf_cache; mod list_fields; From 85b9b55d8a0490fdbb2d8377bf7f069ad5db10ac Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 08:33:28 -0500 Subject: [PATCH 17/19] feat: demo 2 uses single 'logs' table with distributed plan Demo 2 now registers one QuickwitTableProvider as 'logs' and queries it directly: SELECT id, price FROM logs WHERE ... 
All three demos now show distributed plans via display_plan_ascii. Known issue: UnifiedTantivyTableProvider routes full_text() to FastFieldDataSource pushed filters instead of activating the InvertedIndexDataSource join path. Needs investigation. Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/examples/demo.rs | 61 +++++++++---------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/quickwit/quickwit-datafusion/examples/demo.rs b/quickwit/quickwit-datafusion/examples/demo.rs index 65d56b4c8e7..32e0c3e2513 100644 --- a/quickwit/quickwit-datafusion/examples/demo.rs +++ b/quickwit/quickwit-datafusion/examples/demo.rs @@ -216,47 +216,44 @@ async fn main() -> anyhow::Result<()> { // ═══════════════════════════════════════════════════════════════ // DEMO 2: Unified table provider (hides join complexity) // ═══════════════════════════════════════════════════════════════ - section("DEMO 2: Unified Table Provider"); + section("DEMO 2: Unified Table Provider (distributed)"); { - let ctx = SessionContext::new_with_config( - SessionConfig::new().with_target_partitions(1), - ); - ctx.register_udf(full_text_udf()); + let ctx = session_builder.build_session(); - // Register each split as a unified table, then UNION ALL. - for (i, split) in splits.iter().enumerate() { - let opener = Arc::new(StorageSplitOpener::new( - split.split_id.clone(), - tantivy_schema.clone(), - vec![split.num_docs as u32], - searcher_context.clone(), - storage.clone(), - split.footer_offsets.start, - split.footer_offsets.end, - )); - ctx.register_table( - &format!("s{}", i + 1), - Arc::new(UnifiedTantivyTableProvider::from_opener(opener)), - )?; - } + // Register a single "logs" table backed by QuickwitTableProvider. + // It queries the metastore, discovers splits, and unions them + // automatically — the user just queries "logs". 
+ let st2 = storage.clone(); + let sc2 = searcher_context.clone(); + let ts2 = tantivy_schema.clone(); + let logs_opener_factory: quickwit_datafusion::OpenerFactory = + Arc::new(move |meta: &quickwit_metastore::SplitMetadata| { + Arc::new(StorageSplitOpener::new( + meta.split_id.clone(), + ts2.clone(), + vec![meta.num_docs as u32], + sc2.clone(), + st2.clone(), + meta.footer_offsets.start, + meta.footer_offsets.end, + )) as Arc + }); - let n = splits.len(); - let union_parts: Vec = (1..=n) - .map(|i| format!( - "SELECT id, price FROM s{i} WHERE full_text(category, 'electronics')" - )) - .collect(); - let sql = format!( - "SELECT id, price FROM ({}) ORDER BY id", - union_parts.join(" UNION ALL ") + let logs_provider = quickwit_datafusion::QuickwitTableProvider::new( + index_uid.clone(), + metastore.clone(), + logs_opener_factory, + &tantivy_schema, ); + ctx.register_table("logs", Arc::new(logs_provider))?; + + let sql = "SELECT id, price FROM logs WHERE full_text(category, 'electronics') ORDER BY id"; println!("SQL:\n {sql}\n"); let df: DataFrame = ctx.sql(&sql).await?; let plan = df.create_physical_plan().await?; - println!("Physical Plan:\n{}\n", - datafusion::physical_plan::displayable(plan.as_ref()).indent(true)); + println!("Distributed Plan:\n{}\n", display_plan_ascii(plan.as_ref(), false)); let batches: Vec<_> = execute_stream(plan, ctx.task_ctx())? .try_collect().await?; let formatted = datafusion::common::arrow::util::pretty::pretty_format_batches(&batches)?; From 69c60c479e3604fd181f5059c3cf71917b06c2ad Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 08:43:16 -0500 Subject: [PATCH 18/19] fix: full_text filter pushdown through QuickwitTableProvider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QuickwitTableProvider now implements supports_filters_pushdown() to declare it handles full_text() filters. 
This ensures DF pushes the filter into scan() which passes it to UnifiedTantivyTableProvider, which activates the InvertedIndexDataSource join path. Before: full_text() stayed as a FilterExec above the scan — inverted index was never activated, all rows returned. After: full_text() pushed into scan → UnifiedTantivyTableProvider builds inv ⋈ f join → only matching docs returned. Also makes extract_full_text_call() public in tantivy-df. Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/examples/demo.rs | 1 - .../quickwit-datafusion/src/table_provider.rs | 20 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-datafusion/examples/demo.rs b/quickwit/quickwit-datafusion/examples/demo.rs index 32e0c3e2513..5406bc268f4 100644 --- a/quickwit/quickwit-datafusion/examples/demo.rs +++ b/quickwit/quickwit-datafusion/examples/demo.rs @@ -18,7 +18,6 @@ use quickwit_indexing::TestSandbox; use quickwit_search::{MockSearchService, SearchServiceClient, SearcherPool, SearcherContext}; use tantivy_datafusion::{ full_text_udf, IndexOpener, TantivyInvertedIndexProvider, TantivyTableProvider, - UnifiedTantivyTableProvider, }; use tokio::net::TcpListener; diff --git a/quickwit/quickwit-datafusion/src/table_provider.rs b/quickwit/quickwit-datafusion/src/table_provider.rs index 59337578599..dc6c43e22f5 100644 --- a/quickwit/quickwit-datafusion/src/table_provider.rs +++ b/quickwit/quickwit-datafusion/src/table_provider.rs @@ -7,7 +7,7 @@ use datafusion::catalog::Session; use datafusion::common::Result; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::DataFusionError; -use datafusion::logical_expr::Expr; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_plan::union::UnionExec; use datafusion::physical_plan::ExecutionPlan; use quickwit_metastore::{ @@ -18,7 +18,7 @@ use quickwit_proto::metastore::{ ListSplitsRequest, MetastoreService, 
MetastoreServiceClient, }; use quickwit_proto::types::IndexUid; -use tantivy_datafusion::{IndexOpener, UnifiedTantivyTableProvider}; +use tantivy_datafusion::{IndexOpener, UnifiedTantivyTableProvider, extract_full_text_call}; use tokio::sync::Mutex; /// Factory that creates an [`IndexOpener`] from split metadata. @@ -110,6 +110,22 @@ impl TableProvider for QuickwitTableProvider { TableType::Base } + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + Ok(filters + .iter() + .map(|f| { + if extract_full_text_call(f).is_some() { + TableProviderFilterPushDown::Exact + } else { + TableProviderFilterPushDown::Unsupported + } + }) + .collect()) + } + async fn scan( &self, state: &dyn Session, From 5f3059fd9969739732c4a0d7578a25db99af3fc3 Mon Sep 17 00:00:00 2001 From: Alex Bianchi Date: Wed, 18 Feb 2026 08:46:31 -0500 Subject: [PATCH 19/19] fix: demo 3 uses distributed session and shows stage plan Co-Authored-By: Claude Opus 4.6 (1M context) --- quickwit/quickwit-datafusion/examples/demo.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/quickwit/quickwit-datafusion/examples/demo.rs b/quickwit/quickwit-datafusion/examples/demo.rs index 5406bc268f4..700b48c3884 100644 --- a/quickwit/quickwit-datafusion/examples/demo.rs +++ b/quickwit/quickwit-datafusion/examples/demo.rs @@ -264,10 +264,7 @@ async fn main() -> anyhow::Result<()> { // ═══════════════════════════════════════════════════════════════ section("DEMO 3: SearchRequest → DataFrame"); { - let ctx = SessionContext::new_with_config( - SessionConfig::new().with_target_partitions(1), - ); - ctx.register_udf(full_text_udf()); + let ctx = session_builder.build_session(); // Build a SearchRequest like Quickwit's REST handler would. 
let search_request = quickwit_proto::search::SearchRequest { @@ -336,8 +333,8 @@ async fn main() -> anyhow::Result<()> { Ok(df) => { match df.create_physical_plan().await { Ok(plan) => { - println!("Physical Plan:\n{}\n", - datafusion::physical_plan::displayable(plan.as_ref()).indent(true)); + println!("Distributed Plan:\n{}\n", + display_plan_ascii(plan.as_ref(), false)); match execute_stream(plan, ctx.task_ctx()) { Ok(stream) => { let batches: Vec<_> = stream.try_collect().await?;