Reconstructed patch: the one-diff-line-per-line layout of this unified diff
was restored from a whitespace-collapsed copy; hunk headers are unchanged.
NOTE(review): indentation, blank context lines, the generic type arguments,
and the "<parquet path>" description (all apparently lost as stripped
markup/whitespace) are reconstructions -- confirm against the base file
before applying (e.g. `git apply --ignore-whitespace`).

diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java
index 0821d260e0..1f50509a03 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java
@@ -26,7 +26,11 @@
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
+import javax.annotation.Nullable;
 import org.apache.commons.text.TextStringBuilder;
 import org.apache.parquet.cli.BaseCommand;
 import org.apache.parquet.column.statistics.SizeStatistics;
@@ -40,6 +44,15 @@
 @Parameters(commandDescription = "Print size statistics for a Parquet file")
 public class ShowSizeStatisticsCommand extends BaseCommand {
 
+  private static <T> Set<T> filterOrNull(Collection<T> values) {
+    if (values == null || values.isEmpty()) return null;
+    return new HashSet<>(values);
+  }
+
+  private static boolean includesOrAll(@Nullable Set<?> filter, Object value) {
+    return filter == null || filter.contains(value);
+  }
+
   public ShowSizeStatisticsCommand(Logger console) {
     super(console);
   }
@@ -47,6 +60,16 @@ public ShowSizeStatisticsCommand(Logger console) {
   @Parameter(description = "<parquet path>")
   List<String> targets;
 
+  @Parameter(
+      names = {"-c", "--column", "--columns"},
+      description = "List of columns (dot paths) to include")
+  List<String> columns;
+
+  @Parameter(
+      names = {"-r", "--row-group", "--row-groups"},
+      description = "List of row-group indexes to include (0-based)")
+  List<Integer> rowGroups;
+
   @Override
   @SuppressWarnings("unchecked")
   public int run() throws IOException {
@@ -60,9 +83,13 @@ public int run() throws IOException {
 
       console.info("\nFile path: {}", source);
 
-      List<BlockMetaData> rowGroups = footer.getBlocks();
-      for (int index = 0, n = rowGroups.size(); index < n; index++) {
-        printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
+      List<BlockMetaData> blocks = footer.getBlocks();
+
+      final Set<Integer> rowGroupFilter = filterOrNull(this.rowGroups);
+
+      for (int i = 0, n = blocks.size(); i < n; i++) {
+        if (!includesOrAll(rowGroupFilter, i)) continue;
+        printRowGroupSizeStats(console, i, blocks.get(i), schema);
         console.info("");
       }
     }
@@ -84,7 +111,11 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row
     console.info(
         String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram"));
 
+    final Set<String> columnFilter = filterOrNull(this.columns);
+
     for (ColumnChunkMetaData column : rowGroup.getColumns()) {
+      String dotPath = column.getPath().toDotString();
+      if (!includesOrAll(columnFilter, dotPath)) continue;
       printColumnSizeStats(console, column, schema, maxColumnWidth);
     }
   }
@@ -111,6 +142,12 @@ private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, Me
 
   @Override
   public List<String> getExamples() {
-    return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
+    return Lists.newArrayList(
+        "# Show size statistics for a Parquet file",
+        "sample.parquet",
+        "# Show size statistics for selected columns",
+        "sample.parquet -c name,tags",
+        "# Show size statistics for a specific row-group",
+        "sample.parquet -r 0");
   }
 }
diff --git a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java
index 55d4f9d6e8..ff1733e906 100644
--- a/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java
+++ b/parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java
@@ -34,4 +34,24 @@ public void testShowSizeStatisticsCommand() throws IOException {
     command.setConf(new Configuration());
     Assert.assertEquals(0, command.run());
   }
+
+  @Test
+  public void testShowSizeStatisticsWithColumnFilter() throws IOException {
+    File file = parquetFile();
+    ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
+    command.targets = Arrays.asList(file.getAbsolutePath());
+    command.columns = Arrays.asList(INT32_FIELD, INT64_FIELD);
+    command.setConf(new Configuration());
+    Assert.assertEquals(0, command.run());
+  }
+
+  @Test
+  public void testShowSizeStatisticsWithRowGroupFilter() throws IOException {
+    File file = parquetFile();
+    ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
+    command.targets = Arrays.asList(file.getAbsolutePath());
+    command.rowGroups = Arrays.asList(0);
+    command.setConf(new Configuration());
+    Assert.assertEquals(0, command.run());
+  }
 }