From 807bce3d61eaa8cf8b2b612a89b8ef37ff00711a Mon Sep 17 00:00:00 2001 From: joeyutong Date: Tue, 13 May 2025 20:01:58 +0800 Subject: [PATCH 1/3] GH-3213: Add the configuration for ByteStreamSplit encoding --- .../main/java/org/apache/parquet/hadoop/ParquetWriter.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java index d4c4bc1040..b3a7a934b8 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java @@ -671,6 +671,11 @@ public SELF withByteStreamSplitEncoding(boolean enableByteStreamSplit) { return self(); } + public SELF withByteStreamSplitEncoding(String columnPath, boolean enableByteStreamSplit) { + encodingPropsBuilder.withByteStreamSplitEncoding(columnPath, enableByteStreamSplit); + return self(); + } + /** * Enable or disable dictionary encoding of the specified column for the constructed writer. 
* From 76b6469f6548ab197fcc924a013cc4bb9fa02051 Mon Sep 17 00:00:00 2001 From: joeyutong Date: Tue, 13 May 2025 20:01:58 +0800 Subject: [PATCH 2/3] GH-3213: Add the configuration for ByteStreamSplit encoding --- .../parquet/hadoop/TestParquetWriter.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index 739aa85d2c..c1555682a5 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -612,6 +612,46 @@ public void testSizeStatisticsAndStatisticsControl() throws Exception { } } + @Test + public void testByteStreamSplitEncodingControl() throws Exception { + MessageType schema = Types.buildMessage() + .required(FLOAT) + .named("float_field") + .required(INT32) + .named("int32_field") + .named("test_schema"); + + File file = temp.newFile(); + temp.delete(); + + Path path = new Path(file.getAbsolutePath()); + SimpleGroupFactory factory = new SimpleGroupFactory(schema); + try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path) + .withType(schema) + .withByteStreamSplitEncoding(true) + .withByteStreamSplitEncoding("int32_field", true) + .build()) { + writer.write(factory.newGroup() + .append("float_field", 0.3f) + .append("int32_field", 42)); + } + + try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { + for (BlockMetaData block : reader.getFooter().getBlocks()) { + for (ColumnChunkMetaData column : block.getColumns()) { + assertTrue(column.getEncodings().contains(Encoding.BYTE_STREAM_SPLIT)); + } + } + } + + try (ParquetReader<Group> reader = + ParquetReader.builder(new GroupReadSupport(), path).build()) { + Group group = reader.read(); + assertEquals(0.3f, group.getFloat("float_field", 0), 0.0); + assertEquals(42, 
group.getInteger("int32_field", 0)); + } + } + @Test public void testV2WriteAllNullValues() throws Exception { testV2WriteAllNullValues(null, null); From 5aaf5d0314ea27934ff08a4f16328c3cb336368c Mon Sep 17 00:00:00 2001 From: joeyutong Date: Tue, 13 May 2025 20:01:58 +0800 Subject: [PATCH 3/3] GH-3213: Add the configuration for ByteStreamSplit encoding --- .../java/org/apache/parquet/hadoop/TestParquetWriter.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java index c1555682a5..e8707fc40d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetWriter.java @@ -631,9 +631,7 @@ public void testByteStreamSplitEncodingControl() throws Exception { .withByteStreamSplitEncoding(true) .withByteStreamSplitEncoding("int32_field", true) .build()) { - writer.write(factory.newGroup() - .append("float_field", 0.3f) - .append("int32_field", 42)); + writer.write(factory.newGroup().append("float_field", 0.3f).append("int32_field", 42)); } try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) { @@ -645,7 +643,7 @@ public void testByteStreamSplitEncodingControl() throws Exception { } try (ParquetReader<Group> reader = - ParquetReader.builder(new GroupReadSupport(), path).build()) { + ParquetReader.builder(new GroupReadSupport(), path).build()) { Group group = reader.read(); assertEquals(0.3f, group.getFloat("float_field", 0), 0.0); assertEquals(42, group.getInteger("int32_field", 0));