From bc43f1fd332b434735adaf7f2797c7c2534e26ef Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 30 Sep 2025 08:05:27 +0000 Subject: [PATCH] Add parquet.enable.bytestreamsplit option to ParquetOutputFormat --- .../parquet/hadoop/ParquetOutputFormat.java | 10 ++++ .../TestByteStreamSplitConfiguration.java | 52 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java index 4036668683..868ae634c1 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java @@ -80,6 +80,9 @@ * # To enable/disable dictionary encoding * parquet.enable.dictionary=true # false to disable dictionary encoding * + * # To enable/disable BYTE_STREAM_SPLIT encoding + * parquet.enable.bytestreamsplit=false # true to enable BYTE_STREAM_SPLIT encoding + * * # To enable/disable summary metadata aggregation at the end of a MR job * # The default is true (enabled) * parquet.enable.summary-metadata=true # false to disable summary aggregation @@ -137,6 +140,7 @@ public static enum JobSummaryLevel { public static final String WRITE_SUPPORT_CLASS = "parquet.write.support.class"; public static final String DICTIONARY_PAGE_SIZE = "parquet.dictionary.page.size"; public static final String ENABLE_DICTIONARY = "parquet.enable.dictionary"; + public static final String ENABLE_BYTE_STREAM_SPLIT = "parquet.enable.bytestreamsplit"; public static final String VALIDATION = "parquet.validation"; public static final String WRITER_VERSION = "parquet.writer.version"; public static final String MEMORY_POOL_RATIO = "parquet.memory.pool.ratio"; @@ -270,6 +274,11 @@ public static boolean getEnableDictionary(Configuration configuration) { return configuration.getBoolean(ENABLE_DICTIONARY, 
ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED); } + public static boolean getByteStreamSplitEnabled(Configuration configuration) { + return configuration.getBoolean( + ENABLE_BYTE_STREAM_SPLIT, ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED); + } + public static int getMinRowCountForPageSizeCheck(Configuration configuration) { return configuration.getInt( MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK); @@ -503,6 +512,7 @@ public RecordWriter getRecordWriter(Configuration conf, Path file, Comp .withPageSize(getPageSize(conf)) .withDictionaryPageSize(getDictionaryPageSize(conf)) .withDictionaryEncoding(getEnableDictionary(conf)) + .withByteStreamSplitEncoding(getByteStreamSplitEnabled(conf)) .withWriterVersion(getWriterVersion(conf)) .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf)) .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf)) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java new file mode 100644 index 0000000000..a756d167a4 --- /dev/null +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.hadoop; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.column.ParquetProperties; +import org.junit.Test; + +public class TestByteStreamSplitConfiguration { + @Test + public void testDefault() throws Exception { + Configuration conf = new Configuration(); + // with the key unset, the getter must return the ParquetProperties default + assertEquals( + ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED, + ParquetOutputFormat.getByteStreamSplitEnabled(conf)); + } + + @Test + public void testSetTrue() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, true); + assertTrue(ParquetOutputFormat.getByteStreamSplitEnabled(conf)); + } + + @Test + public void testSetFalse() throws Exception { + Configuration conf = new Configuration(); + conf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, false); + assertFalse(ParquetOutputFormat.getByteStreamSplitEnabled(conf)); + } +}