From bc43f1fd332b434735adaf7f2797c7c2534e26ef Mon Sep 17 00:00:00 2001
From: arnavb <arnavb@uber.com>
Date: Tue, 30 Sep 2025 08:05:27 +0000
Subject: [PATCH] Add parquet.enable.bytestreamsplit option to ParquetOutputFormat

---
 .../parquet/hadoop/ParquetOutputFormat.java   | 10 ++++
 .../TestByteStreamSplitConfiguration.java     | 52 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
index 4036668683..868ae634c1 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
@@ -80,6 +80,9 @@
  * # To enable/disable dictionary encoding
  * parquet.enable.dictionary=true # false to disable dictionary encoding
  *
+ * # To enable/disable BYTE_STREAM_SPLIT encoding
+ * parquet.enable.bytestreamsplit=false # true to enable BYTE_STREAM_SPLIT encoding
+ *
  * # To enable/disable summary metadata aggregation at the end of a MR job
  * # The default is true (enabled)
  * parquet.enable.summary-metadata=true # false to disable summary aggregation
@@ -137,6 +140,7 @@ public static enum JobSummaryLevel {
   public static final String WRITE_SUPPORT_CLASS = "parquet.write.support.class";
   public static final String DICTIONARY_PAGE_SIZE = "parquet.dictionary.page.size";
   public static final String ENABLE_DICTIONARY = "parquet.enable.dictionary";
+  public static final String ENABLE_BYTE_STREAM_SPLIT = "parquet.enable.bytestreamsplit";
   public static final String VALIDATION = "parquet.validation";
   public static final String WRITER_VERSION = "parquet.writer.version";
   public static final String MEMORY_POOL_RATIO = "parquet.memory.pool.ratio";
@@ -270,6 +274,11 @@ public static boolean getEnableDictionary(Configuration configuration) {
     return configuration.getBoolean(ENABLE_DICTIONARY, ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED);
   }
 
+  public static boolean getByteStreamSplitEnabled(Configuration configuration) {
+    boolean fallback = ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED; // library-wide default
+    return configuration.getBoolean(ENABLE_BYTE_STREAM_SPLIT, fallback);
+  }
+
   public static int getMinRowCountForPageSizeCheck(Configuration configuration) {
     return configuration.getInt(
         MIN_ROW_COUNT_FOR_PAGE_SIZE_CHECK, ParquetProperties.DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK);
@@ -503,6 +512,7 @@ public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, Comp
         .withPageSize(getPageSize(conf))
         .withDictionaryPageSize(getDictionaryPageSize(conf))
         .withDictionaryEncoding(getEnableDictionary(conf))
+        .withByteStreamSplitEncoding(getByteStreamSplitEnabled(conf))
         .withWriterVersion(getWriterVersion(conf))
         .estimateRowCountForPageSizeCheck(getEstimatePageSizeCheck(conf))
         .withMinRowCountForPageSizeCheck(getMinRowCountForPageSizeCheck(conf))
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java
new file mode 100644
index 0000000000..a756d167a4
--- /dev/null
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestByteStreamSplitConfiguration.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.hadoop;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.parquet.column.ParquetProperties;
+import org.junit.Test;
+
+public class TestByteStreamSplitConfiguration {
+  @Test
+  public void testDefault() throws Exception {
+    // Nothing is set on this Configuration, so the getter must report the ParquetProperties default.
+    Configuration untouchedConf = new Configuration();
+    boolean expected = ParquetProperties.DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED;
+    boolean actual = ParquetOutputFormat.getByteStreamSplitEnabled(untouchedConf);
+    assertEquals(expected, actual);
+  }
+
+  @Test
+  public void testSetTrue() throws Exception {
+    Configuration enabledConf = new Configuration();
+    enabledConf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, true); // explicit opt-in
+    assertTrue(ParquetOutputFormat.getByteStreamSplitEnabled(enabledConf));
+  }
+
+  @Test
+  public void testSetFalse() throws Exception {
+    Configuration disabledConf = new Configuration();
+    disabledConf.setBoolean(ParquetOutputFormat.ENABLE_BYTE_STREAM_SPLIT, false); // explicit opt-out
+    assertFalse(ParquetOutputFormat.getByteStreamSplitEnabled(disabledConf));
+  }
+}