Skip to content

Commit 4b6fbc1

Browse files
authored
GH-3273: Add scoped chunk level statistics to avoid unbounded output (#3274)
1 parent 2a58300 commit 4b6fbc1

File tree

2 files changed

+61
-4
lines changed

2 files changed

+61
-4
lines changed

parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommand.java

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@
2626
import com.google.common.base.Preconditions;
2727
import com.google.common.collect.Lists;
2828
import java.io.IOException;
29+
import java.util.Collection;
30+
import java.util.HashSet;
2931
import java.util.List;
32+
import java.util.Set;
33+
import javax.annotation.Nullable;
3034
import org.apache.commons.text.TextStringBuilder;
3135
import org.apache.parquet.cli.BaseCommand;
3236
import org.apache.parquet.column.statistics.SizeStatistics;
@@ -40,13 +44,32 @@
4044
@Parameters(commandDescription = "Print size statistics for a Parquet file")
4145
public class ShowSizeStatisticsCommand extends BaseCommand {
4246

47+
private static <T> Set<T> filterOrNull(Collection<T> values) {
48+
if (values == null || values.isEmpty()) return null;
49+
return new HashSet<>(values);
50+
}
51+
52+
private static boolean includesOrAll(@Nullable Set<?> filter, Object value) {
53+
return filter == null || filter.contains(value);
54+
}
55+
4356
/**
 * Creates the command.
 *
 * @param console logger handed to the {@code BaseCommand} constructor; the command also
 *     writes its report through it
 */
public ShowSizeStatisticsCommand(Logger console) {
  super(console);
}
4659

4760
@Parameter(description = "<parquet path>")
4861
List<String> targets;
4962

63+
@Parameter(
64+
names = {"-c", "--column", "--columns"},
65+
description = "List of columns (dot paths) to include")
66+
List<String> columns;
67+
68+
@Parameter(
69+
names = {"-r", "--row-group", "--row-groups"},
70+
description = "List of row-group indexes to include (0-based)")
71+
List<Integer> rowGroups;
72+
5073
@Override
5174
@SuppressWarnings("unchecked")
5275
public int run() throws IOException {
@@ -60,9 +83,13 @@ public int run() throws IOException {
6083

6184
console.info("\nFile path: {}", source);
6285

63-
List<BlockMetaData> rowGroups = footer.getBlocks();
64-
for (int index = 0, n = rowGroups.size(); index < n; index++) {
65-
printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
86+
List<BlockMetaData> blocks = footer.getBlocks();
87+
88+
final Set<Integer> rowGroupFilter = filterOrNull(this.rowGroups);
89+
90+
for (int i = 0, n = blocks.size(); i < n; i++) {
91+
if (!includesOrAll(rowGroupFilter, i)) continue;
92+
printRowGroupSizeStats(console, i, blocks.get(i), schema);
6693
console.info("");
6794
}
6895
}
@@ -84,7 +111,11 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row
84111
console.info(
85112
String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram"));
86113

114+
final Set<String> columnFilter = filterOrNull(this.columns);
115+
87116
for (ColumnChunkMetaData column : rowGroup.getColumns()) {
117+
String dotPath = column.getPath().toDotString();
118+
if (!includesOrAll(columnFilter, dotPath)) continue;
88119
printColumnSizeStats(console, column, schema, maxColumnWidth);
89120
}
90121
}
@@ -111,6 +142,12 @@ private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, Me
111142

112143
@Override
113144
public List<String> getExamples() {
114-
return Lists.newArrayList("# Show size statistics for a Parquet file", "sample.parquet");
145+
return Lists.newArrayList(
146+
"# Show size statistics for a Parquet file",
147+
"sample.parquet",
148+
"# Show size statistics for selected columns",
149+
"sample.parquet -c name,tags",
150+
"# Show size statistics for a specific row-group",
151+
"sample.parquet -r 0");
115152
}
116153
}

parquet-cli/src/test/java/org/apache/parquet/cli/commands/ShowSizeStatisticsCommandTest.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,24 @@ public void testShowSizeStatisticsCommand() throws IOException {
3434
command.setConf(new Configuration());
3535
Assert.assertEquals(0, command.run());
3636
}
37+
38+
@Test
39+
public void testShowSizeStatisticsWithColumnFilter() throws IOException {
40+
File file = parquetFile();
41+
ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
42+
command.targets = Arrays.asList(file.getAbsolutePath());
43+
command.columns = Arrays.asList(INT32_FIELD, INT64_FIELD);
44+
command.setConf(new Configuration());
45+
Assert.assertEquals(0, command.run());
46+
}
47+
48+
@Test
49+
public void testShowSizeStatisticsWithRowGroupFilter() throws IOException {
50+
File file = parquetFile();
51+
ShowSizeStatisticsCommand command = new ShowSizeStatisticsCommand(createLogger());
52+
command.targets = Arrays.asList(file.getAbsolutePath());
53+
command.rowGroups = Arrays.asList(0);
54+
command.setConf(new Configuration());
55+
Assert.assertEquals(0, command.run());
56+
}
3757
}

0 commit comments

Comments
 (0)