Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.commons.text.TextStringBuilder;
import org.apache.parquet.cli.BaseCommand;
import org.apache.parquet.column.statistics.SizeStatistics;
Expand All @@ -40,13 +44,32 @@
@Parameters(commandDescription = "Print size statistics for a Parquet file")
public class ShowSizeStatisticsCommand extends BaseCommand {

/**
 * Converts an optional list of filter values into a lookup set.
 *
 * @param values the values to filter on; may be {@code null} or empty
 * @return a new {@link HashSet} holding a copy of {@code values}, or {@code null} when
 *     {@code values} is {@code null} or empty (callers treat {@code null} as "no filter")
 */
private static <T> Set<T> filterOrNull(Collection<T> values) {
  final boolean hasValues = values != null && !values.isEmpty();
  if (hasValues) {
    return new HashSet<>(values);
  }
  return null;
}

/**
 * Tests a value against an optional filter set.
 *
 * @param filter the set of accepted values, or {@code null} to accept everything
 * @param value the value to test
 * @return {@code true} when {@code filter} is {@code null} or contains {@code value}
 */
private static boolean includesOrAll(@Nullable Set<?> filter, Object value) {
  if (filter == null) {
    // No filter configured: every value is included.
    return true;
  }
  return filter.contains(value);
}

/**
 * Creates the command and hands the given logger to the {@code BaseCommand} constructor.
 *
 * @param console logger passed through to the superclass
 */
public ShowSizeStatisticsCommand(Logger console) {
super(console);
}

/** Main (unnamed) command-line parameter: the Parquet path(s) to inspect. */
@Parameter(description = "<parquet path>")
List<String> targets;

/**
 * Optional column filter. Each entry is compared for exact equality with a column's dot path;
 * when this list is {@code null} or empty, every column is printed.
 */
@Parameter(
names = {"-c", "--column", "--columns"},
description = "List of columns (dot paths) to include")
List<String> columns;

/**
 * Optional row-group filter holding 0-based row-group indexes. When this list is {@code null}
 * or empty, every row group is printed; an index that matches no row group in the file simply
 * selects nothing.
 */
@Parameter(
names = {"-r", "--row-group", "--row-groups"},
description = "List of row-group indexes to include (0-based)")
List<Integer> rowGroups;

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
Expand All @@ -60,9 +83,13 @@ public int run() throws IOException {

console.info("\nFile path: {}", source);

List<BlockMetaData> rowGroups = footer.getBlocks();
for (int index = 0, n = rowGroups.size(); index < n; index++) {
printRowGroupSizeStats(console, index, rowGroups.get(index), schema);
List<BlockMetaData> blocks = footer.getBlocks();

final Set<Integer> rowGroupFilter = filterOrNull(this.rowGroups);

for (int i = 0, n = blocks.size(); i < n; i++) {
if (!includesOrAll(rowGroupFilter, i)) continue;
printRowGroupSizeStats(console, i, blocks.get(i), schema);
console.info("");
}
}
Expand All @@ -84,7 +111,11 @@ private void printRowGroupSizeStats(Logger console, int index, BlockMetaData row
console.info(
String.format(formatString, "column", "unencoded bytes", "rep level histogram", "def level histogram"));

final Set<String> columnFilter = filterOrNull(this.columns);

for (ColumnChunkMetaData column : rowGroup.getColumns()) {
String dotPath = column.getPath().toDotString();
if (!includesOrAll(columnFilter, dotPath)) continue;
printColumnSizeStats(console, column, schema, maxColumnWidth);
}
}
Expand All @@ -111,6 +142,12 @@ private void printColumnSizeStats(Logger console, ColumnChunkMetaData column, Me

/**
 * Returns usage examples for this command.
 *
 * <p>The list alternates between a comment line (prefixed with {@code #}) and the argument
 * string it describes. It covers the unfiltered invocation as well as the column
 * ({@code -c}) and row-group ({@code -r}) filters.
 *
 * @return a new mutable list of example lines
 */
@Override
public List<String> getExamples() {
  // A single return statement: the former one-line return that preceded this one made the
  // filter examples unreachable, which is a compile-time error in Java.
  return Lists.newArrayList(
      "# Show size statistics for a Parquet file",
      "sample.parquet",
      "# Show size statistics for selected columns",
      "sample.parquet -c name,tags",
      "# Show size statistics for a specific row-group",
      "sample.parquet -r 0");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,24 @@ public void testShowSizeStatisticsCommand() throws IOException {
command.setConf(new Configuration());
Assert.assertEquals(0, command.run());
}

/** Runs the command restricted to two columns and expects a zero return code. */
@Test
public void testShowSizeStatisticsWithColumnFilter() throws IOException {
  final String inputPath = parquetFile().getAbsolutePath();
  final ShowSizeStatisticsCommand cmd = new ShowSizeStatisticsCommand(createLogger());
  cmd.targets = Arrays.asList(inputPath);
  cmd.columns = Arrays.asList(INT32_FIELD, INT64_FIELD);
  cmd.setConf(new Configuration());

  final int exitCode = cmd.run();
  Assert.assertEquals(0, exitCode);
}

/** Runs the command restricted to row group 0 and expects a zero return code. */
@Test
public void testShowSizeStatisticsWithRowGroupFilter() throws IOException {
  final String inputPath = parquetFile().getAbsolutePath();
  final ShowSizeStatisticsCommand cmd = new ShowSizeStatisticsCommand(createLogger());
  cmd.targets = Arrays.asList(inputPath);
  cmd.rowGroups = Arrays.asList(0);
  cmd.setConf(new Configuration());

  final int exitCode = cmd.run();
  Assert.assertEquals(0, exitCode);
}
}