Skip to content

Commit 4d2f1ee

Browse files
authored
GH-3331: Track Column index page skip statistics during file read (#3330)
1 parent 8c84dd0 commit 4d2f1ee

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
*/
1919
package org.apache.parquet.hadoop;
2020

21+
import static org.apache.parquet.hadoop.ParquetFileReaderMetrics.PagesIncluded;
22+
import static org.apache.parquet.hadoop.ParquetFileReaderMetrics.PagesSkipped;
23+
2124
import it.unimi.dsi.fastutil.ints.IntArrayList;
2225
import it.unimi.dsi.fastutil.ints.IntList;
2326
import java.util.ArrayList;
@@ -129,14 +132,30 @@ public String toString() {
129132
/*
130133
* Returns the filtered offset index containing only the pages which are overlapping with rowRanges.
131134
*/
132-
static OffsetIndex filterOffsetIndex(OffsetIndex offsetIndex, RowRanges rowRanges, long totalRowCount) {
135+
static OffsetIndex filterOffsetIndex(
136+
OffsetIndex offsetIndex,
137+
RowRanges rowRanges,
138+
long totalRowCount,
139+
org.apache.parquet.ParquetReadOptions options) {
133140
IntList indexMap = new IntArrayList();
141+
int pagesIncluded = 0;
142+
int pagesSkipped = 0;
134143
for (int i = 0, n = offsetIndex.getPageCount(); i < n; ++i) {
135144
long from = offsetIndex.getFirstRowIndex(i);
136145
if (rowRanges.isOverlapping(from, offsetIndex.getLastRowIndex(i, totalRowCount))) {
137146
indexMap.add(i);
147+
pagesIncluded++;
148+
} else {
149+
pagesSkipped++;
138150
}
139151
}
152+
153+
if (options != null && options.getMetricsCallback() != null) {
154+
final ParquetMetricsCallback metricsCallback = options.getMetricsCallback();
155+
metricsCallback.setValueInt(PagesIncluded.name(), pagesIncluded);
156+
metricsCallback.setValueInt(PagesSkipped.name(), pagesSkipped);
157+
}
158+
140159
return new FilteredOffsetIndex(offsetIndex, indexMap.toIntArray());
141160
}
142161

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1423,7 +1423,8 @@ private ColumnChunkPageReadStore internalReadFilteredRowGroup(
14231423
if (columnDescriptor != null) {
14241424
OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
14251425

1426-
OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
1426+
OffsetIndex filteredOffsetIndex =
1427+
filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount(), options);
14271428
for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
14281429
BenchmarkCounter.incrementTotalBytes(range.getLength());
14291430
long startingPos = range.getOffset();

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReaderMetrics.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ public enum ParquetFileReaderMetrics {
2727
ReadThroughput("read throughput when reading Parquet file from storage (MB/sec)"),
2828
DecompressTime("time spent in block decompression"),
2929
DecompressSize("decompressed data size (MB)"),
30-
DecompressThroughput("block decompression throughput (MB/sec)");
30+
DecompressThroughput("block decompression throughput (MB/sec)"),
31+
PagesIncluded("pages included due to column index filtering"),
32+
PagesSkipped("pages skipped due to column index filtering");
3133

3234
private final String desc;
3335

0 commit comments

Comments (0)