From c13273e493e39c28892f7f51bdec29643ffb72eb Mon Sep 17 00:00:00 2001 From: Gabor Szarnyas Date: Mon, 26 Jan 2026 18:08:39 +0000 Subject: [PATCH 1/2] Update DuckDB Vortex partitioned implementation --- duckdb-vortex-partitioned/benchmark.sh | 39 ++++++++------------------ duckdb-vortex-partitioned/create.sql | 6 ++++ duckdb-vortex-partitioned/queries.sql | 4 +-- duckdb-vortex-partitioned/run.sh | 4 +++ 4 files changed, 23 insertions(+), 30 deletions(-) create mode 100644 duckdb-vortex-partitioned/create.sql diff --git a/duckdb-vortex-partitioned/benchmark.sh b/duckdb-vortex-partitioned/benchmark.sh index 8bd613314..b07a88e47 100755 --- a/duckdb-vortex-partitioned/benchmark.sh +++ b/duckdb-vortex-partitioned/benchmark.sh @@ -1,45 +1,28 @@ #!/bin/bash +set -Eeuo pipefail + # Install -sudo apt-get update -y -sudo apt-get install -y ninja-build cmake build-essential make ccache pip clang pkg-config - -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable --no-modify-path - -export CC=clang -export CXX=clang++ -git clone https://github.com/vortex-data/duckdb-vortex --recursive -cd duckdb-vortex -git fetch --tags -git checkout v0.44.0 -git submodule update --init --recursive -GEN=ninja NATIVE_ARCH=1 LTO=thin make -export PATH="`pwd`/build/release/:$PATH" -cd .. +export HOME=${HOME:=~} +curl https://install.duckdb.org | sh +export PATH=$HOME'/.duckdb/cli/latest':$PATH + +duckdb -c "INSTALL vortex;" # Load the data -seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' +seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue --quiet https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet' # Convert parquet files to vortex partitioned echo -n "Load time: " seq 0 99 | command time -f '%e' xargs -P"$(nproc)" -I{} bash -c ' if [ ! -f "hits_{}.vortex" ]; then - duckdb -c " - COPY ( - SELECT * - REPLACE ( - make_date(EventDate) AS EventDate, - epoch_ms(EventTime * 1000) as EventTime - ) - FROM read_parquet('"'"'hits_{}.parquet'"'"', binary_as_string=True) - ) - TO '"'"'hits_{}.vortex'"'"' (FORMAT VORTEX) - " + duckdb -c "LOAD vortex;" -c "COPY (SELECT * REPLACE (URL::VARCHAR AS URL, Title::VARCHAR AS Title, Referer::VARCHAR AS Referer) FROM '"'"'hits_{}.parquet'"'"') TO '"'"'hits_{}.vortex'"'"' (FORMAT vortex);" fi ' +# Create view and macro echo -n "Load time: " -command time -f '%e' duckdb hits-partitioned.db -c "CREATE VIEW hits AS SELECT * FROM read_vortex('hits_*.vortex')"; +command time -f '%e' duckdb hits-partitioned.db -f create.sql # Run the queries echo 'partitioned' diff --git a/duckdb-vortex-partitioned/create.sql b/duckdb-vortex-partitioned/create.sql new file mode 100644 index 000000000..69febc742 --- /dev/null +++ b/duckdb-vortex-partitioned/create.sql @@ -0,0 +1,6 @@ +LOAD vortex; + +CREATE VIEW hits AS + SELECT * REPLACE (make_date(EventDate) AS EventDate) + FROM read_vortex('hits_*.vortex'); +CREATE MACRO toDateTime(t) AS epoch_ms(t * 1000); diff --git a/duckdb-vortex-partitioned/queries.sql b/duckdb-vortex-partitioned/queries.sql index b4115ee3a..ceebb80d7 100644 --- a/duckdb-vortex-partitioned/queries.sql +++ b/duckdb-vortex-partitioned/queries.sql @@ -16,7 +16,7 @@ SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; -SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; SELECT UserID FROM hits WHERE UserID = 435090932899640449; SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; @@ -40,4 +40,4 @@ SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate > SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; +SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000; diff --git a/duckdb-vortex-partitioned/run.sh b/duckdb-vortex-partitioned/run.sh index 71bd5c4a5..30484964b 100755 --- a/duckdb-vortex-partitioned/run.sh +++ b/duckdb-vortex-partitioned/run.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -Eeuo pipefail + TRIES=3 cat queries.sql | while read -r query; do @@ -9,6 +11,8 @@ cat queries.sql | while read -r query; do echo "$query"; cli_params=() cli_params+=("-c") + cli_params+=("LOAD vortex;") + cli_params+=("-c") cli_params+=(".timer on") for i in $(seq 1 $TRIES); do cli_params+=("-c") From 5e5c4f7653e733bd66068fd66778b382410ae7e8 Mon Sep 17 00:00:00 2001 From: Gabor Szarnyas Date: Mon, 26 Jan 2026 20:18:38 +0000 Subject: [PATCH 2/2] Update DuckDB Vortex partitioned results --- .../results/c6a.4xlarge.json | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/duckdb-vortex-partitioned/results/c6a.4xlarge.json b/duckdb-vortex-partitioned/results/c6a.4xlarge.json index c508ae522..bb892584b 100644 --- a/duckdb-vortex-partitioned/results/c6a.4xlarge.json +++ b/duckdb-vortex-partitioned/results/c6a.4xlarge.json @@ -1,6 +1,6 @@ { "system": "DuckDB (Vortex, partitioned)", - "date": "2025-08-06", + "date": "2026-01-26", "machine": "c6a.4xlarge", "cluster_size": 1, "proprietary": "no", @@ -10,52 +10,52 @@ "tags": ["Rust", "column-oriented", "embedded", "stateless", "lukewarm-cold-run"], - "load_time": 742.26, - "data_size": 15961049404, + "load_time": 306.10, + "data_size": 62297495012, "result": [ - [0.184,0.013,0.003], - [0.523,0.014,0.014], - [1.610,0.035,0.035], - [3.435,0.049,0.052], - [3.466,0.329,0.332], - [4.172,0.297,0.292], - [0.181,0.022,0.020], - [0.567,0.019,0.018], - [4.334,0.415,0.405], - [4.319,0.561,0.558], - [2.784,0.097,0.091], - [3.485,0.124,0.107], - [4.721,0.307,0.316], - [7.135,0.672,0.675], - [4.478,0.343,0.342], - [2.783,0.392,0.387], - [7.061,0.852,0.847], - [6.856,0.740,0.628], - [9.200,1.517,1.505], - [1.971,0.038,0.031], - [33.849,0.556,0.530], - [36.486,0.679,0.636], - [40.129,1.065,1.072], - [7.566,0.392,0.382], - [1.880,0.122,0.062], - [4.896,0.096,0.098], - [1.791,0.126,0.031], - [34.787,0.863,0.790], - [28.059,9.317,9.314], - [0.717,0.033,0.033], - [7.835,0.279,0.317], - [13.900,0.420,0.403], - [10.751,1.919,1.892], - [34.222,2.047,1.980], - [34.208,2.288,2.131], - [1.861,0.511,0.506], - [0.258,0.025,0.024], - [0.840,0.012,0.021], - [1.098,0.024,0.018], - [1.265,0.063,0.053], - [0.815,0.022,0.008], - [0.889,0.010,0.010], - [0.833,0.032,0.012] + [0.081,0.018,0.006], + [0.079,0.027,0.024], + [0.667,0.038,0.041], + [1.835,0.057,0.055], + [1.839,0.256,0.257], + [9.272,0.599,0.610], + [0.093,0.017,0.018], + [0.102,0.031,0.026], + [2.230,0.408,0.412], + [2.011,0.556,0.555], + [7.812,0.290,0.288], + [7.884,0.305,0.301], + [9.172,0.608,0.575], + [10.977,0.967,0.951], + [9.324,0.648,0.621], + [1.305,0.358,0.357], + [10.873,1.085,1.112], + [10.675,0.880,0.917], + [12.196,1.869,1.859], + [0.723,0.037,0.036], + [15.900,0.729,0.709], + [21.164,0.877,0.884], + [29.683,1.692,1.686], + [43.726,13.442,1.409], + [0.870,1.021,1.554], + [9.743,0.394,0.334], + [1.740,1.337,0.620], + [15.907,0.733,0.719], + [13.537,6.344,6.282], + [0.118,0.038,0.037], + [11.198,0.750,0.751], + [13.790,0.897,0.902], + [5.408,1.860,1.927], + [15.674,2.265,2.199], + [15.707,3.102,3.082], + [0.671,0.548,0.552], + [0.105,0.027,0.029], + [0.089,0.016,0.016], + [0.534,0.021,0.022], + [0.645,0.063,0.063], + [0.330,0.013,0.012], + [0.457,0.014,0.014], + [0.429,0.018,0.018] ] }