Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ jobs:
- name: Run dbgen to create 1 Gb dataset
if: ${{ steps.cache-tpch-dataset.outputs.cache-hit != 'true' }}
run: |
uv tool install tpchgen-cli
cd benchmarks/tpch
RUN_IN_CI=TRUE ./tpch-gen.sh 1

Expand Down
14 changes: 2 additions & 12 deletions benchmarks/tpch/tpch-gen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,11 @@ fi
#popd

# Generate data into the ./data directory if it does not already exist
FILE=./data/supplier.tbl
FILE=./data/supplier.csv
if test -f "$FILE"; then
echo "$FILE exists."
else
docker run -v `pwd`/data:/data $TERMINAL_FLAG --rm ghcr.io/scalytics/tpch-docker:main $VERBOSE_OUTPUT -s $1

# workaround for https://github.com/apache/arrow-datafusion/issues/6147
mv data/customer.tbl data/customer.csv
mv data/lineitem.tbl data/lineitem.csv
mv data/nation.tbl data/nation.csv
mv data/orders.tbl data/orders.csv
mv data/part.tbl data/part.csv
mv data/partsupp.tbl data/partsupp.csv
mv data/region.tbl data/region.csv
mv data/supplier.tbl data/supplier.csv
tpchgen-cli -s $1 --format=csv --output-dir=./data

ls -l data
fi
Expand Down
14 changes: 1 addition & 13 deletions examples/tpch/convert_data_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,22 +121,10 @@
# For convenience, go ahead and convert the schema column names to lowercase
curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val]

# Pre-collect the output columns so we can ignore the null field we add
# in to handle the trailing | in the file
output_cols = [r[0] for r in curr_schema]

curr_schema = [pa.field(r[0], r[1], nullable=False) for r in curr_schema]

# The trailing | on each row requires an extra field during processing
curr_schema.append(("some_null", pa.null()))

schema = pa.schema(curr_schema)

source_file = (curr_dir / f"../../benchmarks/tpch/data/{filename}.csv").resolve()
dest_file = (curr_dir / f"./data/{filename}.parquet").resolve()

df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")

df = df.select(*output_cols)

df = ctx.read_csv(source_file, schema=schema, has_header=True)
df.write_parquet(dest_file, compression="snappy")
Loading