Commit 0d2aa43

chore(ci): use tpchgen-cli for generating the tpch dataset
closes #1120
1 parent 675e41e commit 0d2aa43

File tree

3 files changed: +4, -25 lines

.github/workflows/test.yml

Lines changed: 1 addition & 0 deletions
@@ -133,6 +133,7 @@ jobs:
       - name: Run dbgen to create 1 Gb dataset
         if: ${{ steps.cache-tpch-dataset.outputs.cache-hit != 'true' }}
         run: |
+          uv tool install tpchgen-cli
           cd benchmarks/tpch
           RUN_IN_CI=TRUE ./tpch-gen.sh 1

benchmarks/tpch/tpch-gen.sh

Lines changed: 2 additions & 12 deletions
@@ -34,21 +34,11 @@ fi
 #popd

 # Generate data into the ./data directory if it does not already exist
-FILE=./data/supplier.tbl
+FILE=./data/supplier.csv
 if test -f "$FILE"; then
   echo "$FILE exists."
 else
-  docker run -v `pwd`/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s $1
-
-  # workaround for https://github.com/apache/arrow-datafusion/issues/6147
-  mv data/customer.tbl data/customer.csv
-  mv data/lineitem.tbl data/lineitem.csv
-  mv data/nation.tbl data/nation.csv
-  mv data/orders.tbl data/orders.csv
-  mv data/part.tbl data/part.csv
-  mv data/partsupp.tbl data/partsupp.csv
-  mv data/region.tbl data/region.csv
-  mv data/supplier.tbl data/supplier.csv
+  tpchgen-cli -s $1 --format=csv --output-dir=./data

   ls -l data
 fi
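The script above regenerates the dataset only when `./data/supplier.csv` is absent, using that file as a marker for the whole dataset. A minimal Python sketch of the same guard (the helper name `needs_generation` is hypothetical, not part of the commit):

```python
from pathlib import Path

# Hypothetical helper mirroring the shell guard in tpch-gen.sh:
# regenerate the TPC-H data only when the marker file is missing.
def needs_generation(data_dir: str = "./data") -> bool:
    marker = Path(data_dir) / "supplier.csv"
    return not marker.is_file()

# A directory that does not exist cannot contain the marker file,
# so generation is reported as needed.
print(needs_generation("./no-such-dir"))  # True
```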

examples/tpch/convert_data_to_parquet.py

Lines changed: 1 addition & 13 deletions
@@ -121,22 +121,10 @@
     # For convenience, go ahead and convert the schema column names to lowercase
     curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val]

-    # Pre-collect the output columns so we can ignore the null field we add
-    # in to handle the trailing | in the file
-    output_cols = [r[0] for r in curr_schema]
-
-    curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema]
-
-    # Trailing | requires extra field for in processing
-    curr_schema.append(("some_null", pa.null()))
-
     schema = pa.schema(curr_schema)

     source_file = (curr_dir / f"../../benchmarks/tpch/data/(unknown).csv").resolve()
     dest_file = (curr_dir / f"./data/(unknown).parquet").resolve()

-    df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")
-
-    df = df.select(*output_cols)
-
+    df = ctx.read_csv(source_file, schema=schema, has_header=True)
     df.write_parquet(dest_file, compression="snappy")
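The deleted lines existed because dbgen's `.tbl` rows end with a trailing `|`, which produces one spurious empty field per row when split on that delimiter; the old code absorbed it with an extra nullable column and dropped it afterwards. A small illustration of that parsing quirk (the sample row values are made up for the example):

```python
import csv
import io

# A dbgen-style .tbl row ends with a trailing '|'. Splitting on '|'
# therefore yields one extra empty field, which the removed code
# handled by appending a "some_null" column and selecting it away.
tbl_row = "1|Supplier#000000001|17|"
fields = next(csv.reader(io.StringIO(tbl_row), delimiter="|"))
print(fields)  # ['1', 'Supplier#000000001', '17', '']

# tpchgen-cli emits standard CSV with a header row and no trailing
# delimiter, so a plain read_csv(..., has_header=True) suffices.
```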
