From b1ef8ce65145fc872236cca845f4d783fff603b0 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 21 Aug 2025 09:46:50 +0000 Subject: [PATCH 1/5] update --- .../commands/ShowSizeStatisticsCommand.java | 39 +++++++++++++++++-- .../ShowSizeStatisticsCommandTest.java | 20 ++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java index 0821d260e0..272d000289 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java @@ -27,6 +27,8 @@ import com.google.common.collect.Lists; import java.io.IOException; import java.util.List; +import java.util.Set; +import java.util.HashSet; import org.apache.commons.text.TextStringBuilder; import org.apache.parquet.cli.BaseCommand; import org.apache.parquet.column.statistics.SizeStatistics; @@ -47,6 +49,16 @@ public ShowSizeStatisticsCommand(Logger console) { @Parameter(description = "") List targets; + @Parameter( + names = {"-c", "--column", "--columns"}, + description = "List of columns (dot paths) to include") + List columns; + + @Parameter( + names = {"-r", "--row-group", "--row-groups"}, + description = "List of row-group indexes to include (0-based)") + List rowGroups; + @Override @SuppressWarnings("unchecked") public int run() throws IOException { @@ -60,9 +72,13 @@ public int run() throws IOException { console.info("\nFile path: {}", source); - List rowGroups = footer.getBlocks(); - for (int index = 0, n = rowGroups.size(); index < n; index++) { - printRowGroupSizeStats(console, index, rowGroups.get(index), schema); + List blocks = footer.getBlocks(); + Set allowedRowGroups = rowGroups == null ? null : new HashSet<>(rowGroups); + for (int index = 0, n = blocks.size(); index < n; index++) { + if (allowedRowGroups != null && !allowedRowGroups.contains(index)) { + continue; + } + printRowGroupSizeStats(console, index, blocks.get(index), schema); console.info(""); } } @@ -84,7 +100,16 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row console.info( String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram")); + Set allowedColumns = null; + if (columns != null && !columns.isEmpty()) { + allowedColumns = new HashSet<>(columns); + } + for (ColumnChunkMetaData column : rowGroup.getColumns()) { + String dotPath = column.getPath().toDotString(); + if (allowedColumns != null && !allowedColumns.contains(dotPath)) { + continue; + } printColumnSizeStats(console, column, schema, maxColumnWidth); } } @@ -111,6 +136,12 @@ private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, Me @Override public List getExamples() { - return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet"); + return Lists.newArrayList( + "# Show size statistics for a Parquet file", + "sample.parquet", + "# Show size statistics for selected columns", + "sample.parquet -c name,tags", + "# Show size statistics for a specific row-group", + "sample.parquet -r 0"); } } diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java index 55d4f9d6e8..ff1733e906 100644 --- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java +++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java @@ -34,4 +34,24 @@ public void testShowSizeStatisticsCommand() throws IOException { command.setConf(new Configuration()); Assert.assertEquals(0, command.run()); } + + @Test + public void testShowSizeStatisticsWithColumnFilter() throws IOException { + File file = parquetFile(); + ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + command.columns = Arrays.asList(INT32_FIELD, INT64_FIELD); + command.setConf(new Configuration()); + Assert.assertEquals(0, command.run()); + } + + @Test + public void testShowSizeStatisticsWithRowGroupFilter() throws IOException { + File file = parquetFile(); + ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger()); + command.targets = Arrays.asList(file.getAbsolutePath()); + command.rowGroups = Arrays.asList(0); + command.setConf(new Configuration()); + Assert.assertEquals(0, command.run()); + } } From 6c98c77cd57d7a0f4bd46314e8bbdde1d67b63c5 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 21 Aug 2025 10:41:24 +0000 Subject: [PATCH 2/5] lint --- .../apache/parquet/cli/commands/ShowSizeStatisticsCommand.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java index 272d000289..3c9d7528cf 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java @@ -26,9 +26,9 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import java.io.IOException; +import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.HashSet; import org.apache.commons.text.TextStringBuilder; import org.apache.parquet.cli.BaseCommand; import org.apache.parquet.column.statistics.SizeStatistics; From 07458982e920ba1588aa0ccc6f2ec3c7140b4f41 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 28 Aug 2025 08:57:10 +0000 Subject: [PATCH 3/5] update --- .../cli/commands/ShowSizeStatisticsCommand.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java index 3c9d7528cf..0440dbb63f 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java @@ -73,9 +73,9 @@ public int run() throws IOException { console.info("\nFile path: {}", source); List blocks = footer.getBlocks(); - Set allowedRowGroups = rowGroups == null ? null : new HashSet<>(rowGroups); + Set allowedRowGroups = rowGroups == null ? new HashSet<>() : new HashSet<>(rowGroups); for (int index = 0, n = blocks.size(); index < n; index++) { - if (allowedRowGroups != null && !allowedRowGroups.contains(index)) { + if (!allowedRowGroups.isEmpty() && !allowedRowGroups.contains(index)) { continue; } printRowGroupSizeStats(console, index, blocks.get(index), schema); @@ -100,14 +100,11 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row console.info( String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram")); - Set allowedColumns = null; - if (columns != null && !columns.isEmpty()) { - allowedColumns = new HashSet<>(columns); - } + Set allowedColumns = columns == null ? new HashSet<>() : new HashSet<>(columns); for (ColumnChunkMetaData column : rowGroup.getColumns()) { String dotPath = column.getPath().toDotString(); - if (allowedColumns != null && !allowedColumns.contains(dotPath)) { + if (!allowedColumns.isEmpty() && !allowedColumns.contains(dotPath)) { continue; } printColumnSizeStats(console, column, schema, maxColumnWidth); From 5a2bf7d1359d3b48ea87f991061b6c794b6c9349 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 28 Aug 2025 10:39:08 +0000 Subject: [PATCH 4/5] update --- .../commands/ShowSizeStatisticsCommand.java | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java index 0440dbb63f..c5fa073dcd 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java @@ -26,9 +26,12 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import java.io.IOException; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.Collection; +import javax.annotation.Nullable; import org.apache.commons.text.TextStringBuilder; import org.apache.parquet.cli.BaseCommand; import org.apache.parquet.column.statistics.SizeStatistics; @@ -42,6 +45,15 @@ @Parameters(commandDescription = "Print size statistics for a Parquet file") public class ShowSizeStatisticsCommand extends BaseCommand { + private static Set filterOrNull(Collection values) { + if (values == null || values.isEmpty()) return null; + return new HashSet<>(values); + } + + private static boolean includesOrAll(@Nullable Set filter, Object value) { + return filter == null || filter.contains(value); + } + public ShowSizeStatisticsCommand(Logger console) { super(console); } @@ -73,12 +85,12 @@ public int run() throws IOException { console.info("\nFile path: {}", source); List blocks = footer.getBlocks(); - Set allowedRowGroups = rowGroups == null ? new HashSet<>() : new HashSet<>(rowGroups); - for (int index = 0, n = blocks.size(); index < n; index++) { - if (!allowedRowGroups.isEmpty() && !allowedRowGroups.contains(index)) { - continue; - } - printRowGroupSizeStats(console, index, blocks.get(index), schema); + + final Set rowGroupFilter = filterOrNull(this.rowGroups); + + for (int i = 0, n = blocks.size(); i < n; i++) { + if (!includesOrAll(rowGroupFilter, i)) continue; + printRowGroupSizeStats(console, i, blocks.get(i), schema); console.info(""); } } @@ -100,13 +112,11 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row console.info( String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram")); - Set allowedColumns = columns == null ? new HashSet<>() : new HashSet<>(columns); + final Set columnFilter = filterOrNull(this.columns); for (ColumnChunkMetaData column : rowGroup.getColumns()) { String dotPath = column.getPath().toDotString(); - if (!allowedColumns.isEmpty() && !allowedColumns.contains(dotPath)) { - continue; - } + if (!includesOrAll(columnFilter, dotPath)) continue; printColumnSizeStats(console, column, schema, maxColumnWidth); } } From 15c3e7f105ddf13df195660187ea5cfbab6a5230 Mon Sep 17 00:00:00 2001 From: arnavb Date: Thu, 28 Aug 2025 10:53:07 +0000 Subject: [PATCH 5/5] update --- .../apache/parquet/cli/commands/ShowSizeStatisticsCommand.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java index c5fa073dcd..1f50509a03 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java @@ -26,11 +26,10 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import java.io.IOException; -import java.util.Collections; +import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.Collection; import javax.annotation.Nullable; import org.apache.commons.text.TextStringBuilder; import org.apache.parquet.cli.BaseCommand;