Skip to content

Commit 68fa731

Browse files
authored
GH-3320: Ensure parquet reader does not fail due to incorrect statistics (#3325)
1 parent 41f7359 commit 68fa731

File tree

1 file changed

+34
-1
lines changed

1 file changed

+34
-1
lines changed

parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -192,7 +192,12 @@ private RowRanges applyPredicate(
192192
return allRows();
193193
}
194194

195-
return RowRanges.create(rowCount, func.apply(ci), oi);
195+
if (!isValidIndexSize(ci, oi, columnPath)) {
196+
return allRows();
197+
}
198+
199+
PrimitiveIterator.OfInt pageIndexes = func.apply(ci);
200+
return RowRanges.create(rowCount, pageIndexes, oi);
196201
}
197202

198203
@Override
@@ -220,4 +225,32 @@ public RowRanges visit(Not not) {
220225
throw new IllegalArgumentException(
221226
"Predicates containing a NOT must be run through LogicalInverseRewriter. " + not);
222227
}
228+
229+
/**
230+
* Validates that column index and offset index metadata are consistent and can be used safely.
231+
*
232+
* @param columnIndex the column index to validate
233+
* @param offsetIndex the offset index to validate
234+
* @param columnPath the column path for error reporting
235+
* @return true if metadata is valid and safe to use, false if corrupt and should be ignored
236+
*/
237+
private static boolean isValidIndexSize(ColumnIndex columnIndex, OffsetIndex offsetIndex, ColumnPath columnPath) {
238+
239+
int columnIndexSize = columnIndex.getMinValues().size();
240+
int offsetIndexSize = offsetIndex.getPageCount();
241+
242+
if (columnIndexSize != offsetIndexSize) {
243+
LOGGER.warn(
244+
"Column index and offset index size mismatch for column {}: "
245+
+ "column index has {} entries but offset index has {} pages. "
246+
+ "This indicates corrupted metadata from the writer. "
247+
+ "Ignoring column index for filtering to avoid errors.",
248+
columnPath,
249+
columnIndexSize,
250+
offsetIndexSize);
251+
return false;
252+
}
253+
254+
return true;
255+
}
223256
}

0 commit comments

Comments
 (0)