From 0d2aa4383092e7bf544a5c95426ee9dccac8ed48 Mon Sep 17 00:00:00 2001
From: Daniel Mesejo
Date: Wed, 18 Feb 2026 22:10:36 +0100
Subject: [PATCH] chore(ci): use tpchgen-cli for generating the tpch dataset

Replace the Docker-based dbgen image with tpchgen-cli, which writes the
TPC-H tables directly as CSV with header rows. This removes the
.tbl -> .csv rename workaround in tpch-gen.sh and the extra null column
that convert_data_to_parquet.py needed to absorb dbgen's trailing "|"
delimiter.

closes #1120
---
 .github/workflows/test.yml               |  1 +
 benchmarks/tpch/tpch-gen.sh              | 14 ++------------
 examples/tpch/convert_data_to_parquet.py | 14 +-------------
 3 files changed, 4 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c4bf833ec..d8a4b4d39 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -133,6 +133,7 @@ jobs:
       - name: Run dbgen to create 1 Gb dataset
         if: ${{ steps.cache-tpch-dataset.outputs.cache-hit != 'true' }}
         run: |
+          uv tool install tpchgen-cli
           cd benchmarks/tpch
           RUN_IN_CI=TRUE ./tpch-gen.sh 1
 
diff --git a/benchmarks/tpch/tpch-gen.sh b/benchmarks/tpch/tpch-gen.sh
index 139c300a2..bcba08027 100755
--- a/benchmarks/tpch/tpch-gen.sh
+++ b/benchmarks/tpch/tpch-gen.sh
@@ -34,21 +34,11 @@ fi
 
 #popd
 
 # Generate data into the ./data directory if it does not already exist
-FILE=./data/supplier.tbl
+FILE=./data/supplier.csv
 if test -f "$FILE"; then
     echo "$FILE exists."
 else
-    docker run -v `pwd`/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s $1
-
-    # workaround for https://github.com/apache/arrow-datafusion/issues/6147
-    mv data/customer.tbl data/customer.csv
-    mv data/lineitem.tbl data/lineitem.csv
-    mv data/nation.tbl data/nation.csv
-    mv data/orders.tbl data/orders.csv
-    mv data/part.tbl data/part.csv
-    mv data/partsupp.tbl data/partsupp.csv
-    mv data/region.tbl data/region.csv
-    mv data/supplier.tbl data/supplier.csv
+    tpchgen-cli -s $1 --format=csv --output-dir=./data
     ls -l data
 fi
diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py
index af554c39e..bed71db22 100644
--- a/examples/tpch/convert_data_to_parquet.py
+++ b/examples/tpch/convert_data_to_parquet.py
@@ -121,22 +121,10 @@
     # For convenience, go ahead and convert the schema column names to lowercase
     curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val]
 
-    # Pre-collect the output columns so we can ignore the null field we add
-    # in to handle the trailing | in the file
-    output_cols = [r[0] for r in curr_schema]
-
-    curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema]
-
-    # Trailing | requires extra field for in processing
-    curr_schema.append(("some_null", pa.null()))
-
     schema = pa.schema(curr_schema)
 
     source_file = (curr_dir / f"../../benchmarks/tpch/data/{filename}.csv").resolve()
     dest_file = (curr_dir / f"./data/{filename}.parquet").resolve()
 
-    df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")
-
-    df = df.select(*output_cols)
-
+    df = ctx.read_csv(source_file, schema=schema, has_header=True)
     df.write_parquet(dest_file, compression="snappy")
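
Note: the conversion step now relies on two properties of tpchgen-cli's
CSV output: each file carries a header row, and rows have no trailing
"|" delimiter, which is why the extra null column and the post-read
select() are gone. A minimal standalone sketch of the new read path for
a single table, assuming the datafusion and pyarrow packages and that
`tpchgen-cli -s 1 --format=csv --output-dir=./data` has already been run
(the column types below are illustrative, not copied from the example
script):

    # Sketch: convert one tpchgen-cli CSV to Parquet with DataFusion.
    import pyarrow as pa
    from datafusion import SessionContext

    ctx = SessionContext()

    # TPC-H `nation` columns, lowercased as in convert_data_to_parquet.py.
    schema = pa.schema(
        [
            ("n_nationkey", pa.int64()),
            ("n_name", pa.string()),
            ("n_regionkey", pa.int64()),
            ("n_comment", pa.string()),
        ]
    )

    # The header row is skipped via has_header=True; no trailing-"|"
    # null column or post-read select() is required anymore.
    df = ctx.read_csv("./data/nation.csv", schema=schema, has_header=True)
    df.write_parquet("./data/nation.parquet", compression="snappy")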
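
Note: if CI should guard against caching a partial dataset, a small check
along these lines could run after tpch-gen.sh; a sketch only, not part of
the patch (table names taken from the removed mv workaround):

    # Sketch: verify tpchgen-cli wrote all eight TPC-H tables as CSV.
    from pathlib import Path

    TABLES = [
        "customer", "lineitem", "nation", "orders",
        "part", "partsupp", "region", "supplier",
    ]

    data_dir = Path("benchmarks/tpch/data")
    missing = [t for t in TABLES if not (data_dir / f"{t}.csv").exists()]
    if missing:
        raise SystemExit(f"missing TPC-H tables: {', '.join(missing)}")
    print("all eight TPC-H CSV files present")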