From 3a87682c4d3ab2972dbb1d6e1af8868109816114 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Sat, 28 Mar 2026 22:26:18 +0900 Subject: [PATCH 1/5] HBASE-29039 Seek past delete markers instead of skipping one at a time When a DeleteColumn or DeleteFamily marker is encountered during a normal user scan, the matcher currently returns SKIP, forcing the scanner to advance one cell at a time. This causes read latency to degrade linearly with the number of accumulated delete markers for the same row or column. Since these are range deletes that mask all remaining versions of the column, seek past the entire column immediately via columns.getNextRowOrNextColumn(). This is safe because cells arrive in timestamp descending order, so any puts newer than the delete have already been processed. For DeleteFamily, also fix getKeyForNextColumn in ScanQueryMatcher to bypass the empty-qualifier guard (HBASE-18471) when the cell is a DeleteFamily marker. Without this, the seek barely advances past the current cell instead of jumping to the first real qualified column. The optimization is only applied with plain ScanDeleteTracker, and skipped when: - seePastDeleteMarkers is true (KEEP_DELETED_CELLS) - newVersionBehavior is enabled (sequence IDs determine visibility) - visibility labels are in use (delete/put label mismatch) --- .../NormalUserScanQueryMatcher.java | 13 ++++++++ .../querymatcher/ScanQueryMatcher.java | 11 +++---- .../TestUserScanQueryMatcher.java | 33 +++++++++++++++++++ 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java index 9ad3c792345e..9f7f3fd50117 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.hadoop.hbase.ExtendedCell; import org.apache.hadoop.hbase.KeepDeletedCells; +import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.PrivateCellUtil; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.regionserver.ScanInfo; @@ -70,6 +71,18 @@ public MatchCode match(ExtendedCell cell) throws IOException { seePastDeleteMarkers ? tr.withinTimeRange(timestamp) : tr.withinOrAfterTimeRange(timestamp); if (includeDeleteMarker) { this.deletes.add(cell); + // A DeleteColumn or DeleteFamily masks all remaining cells for this column/family. + // Seek past them instead of skipping one cell at a time. + // Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence + // IDs determine visibility), visibility labels (delete/put label mismatch), or + // seePastDeleteMarkers (KEEP_DELETED_CELLS). + if ( + !seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class + && (typeByte == KeyValue.Type.DeleteColumn.getCode() + || typeByte == KeyValue.Type.DeleteFamily.getCode()) + ) { + return columns.getNextRowOrNextColumn(cell); + } } return MatchCode.SKIP; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java index 4f6168b1e926..4478ccf4a747 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java @@ -292,12 +292,11 @@ public void setToNewRow(ExtendedCell currentRow) { public abstract boolean moreRowsMayExistAfter(ExtendedCell cell); public ExtendedCell getKeyForNextColumn(ExtendedCell cell) { - // We aren't sure whether any DeleteFamily cells exist, so we can't skip to next column. - // TODO: Current way disable us to seek to next column quickly. Is there any better solution? - // see HBASE-18471 for more details - // see TestFromClientSide3#testScanAfterDeletingSpecifiedRow - // see TestFromClientSide3#testScanAfterDeletingSpecifiedRowV2 - if (cell.getQualifierLength() == 0) { + // For cells with empty qualifier, we generally can't skip to the next column because + // DeleteFamily cells might exist that we haven't seen yet (see HBASE-18471). + // However, if the cell itself IS a DeleteFamily marker, we know we've already processed it, + // so we can safely seek to the next real column. + if (cell.getQualifierLength() == 0 && !PrivateCellUtil.isDeleteFamily(cell)) { ExtendedCell nextKey = PrivateCellUtil.createNextOnRowCol(cell); if (nextKey != cell) { return nextKey; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java index efbb4c77d475..eb97037f666d 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeepDeletedCells; import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.KeyValue.Type; import org.apache.hadoop.hbase.PrivateCellUtil; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.filter.FilterBase; @@ -396,4 +397,36 @@ scanWithFilter, new ScanInfo(this.conf, fam2, 0, 5, ttl, KeepDeletedCells.FALSE, Cell nextCell = qm.getKeyForNextColumn(lastCell); assertArrayEquals(nextCell.getQualifierArray(), col4); } + + /** + * DeleteColumn and DeleteFamily should return SEEK_NEXT_COL instead of SKIP, so the scanner seeks + * past the entire column rather than advancing one cell at a time. + */ + @Test + public void testSeekOnRangeDelete() throws IOException { + // DeleteColumn: should seek past the column + assertDeleteMatchCode(KeepDeletedCells.FALSE, Type.DeleteColumn, MatchCode.SEEK_NEXT_COL); + + // DeleteFamily: should seek past the column + assertDeleteMatchCode(KeepDeletedCells.FALSE, Type.DeleteFamily, MatchCode.SEEK_NEXT_COL); + + // Delete (version): should still SKIP (point delete, not range) + assertDeleteMatchCode(KeepDeletedCells.FALSE, Type.Delete, MatchCode.SKIP); + + // KEEP_DELETED_CELLS=TRUE: should SKIP (masked cells must remain visible) + assertDeleteMatchCode(KeepDeletedCells.TRUE, Type.DeleteColumn, MatchCode.SKIP); + } + + private void assertDeleteMatchCode(KeepDeletedCells keepDeletedCells, Type type, + MatchCode expected) throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + UserScanQueryMatcher qm = + UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, ttl, keepDeletedCells, + HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, now, null); + boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion; + byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1; + KeyValue kv = new KeyValue(row1, fam1, qual, now, type); + qm.setToNewRow(kv); + assertEquals(expected, qm.match(kv)); + } } From 59ad767fcd8052401276e256fa66b9fba65f6785 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Mon, 30 Mar 2026 11:36:50 +0900 Subject: [PATCH 2/5] Use threshold to avoid seek overhead for single delete markers Seeking is ~50% more expensive than skipping. When each row has only one DeleteFamily or DeleteColumn marker (common case), the seek overhead adds up across many rows, causing ~50% scan regression. Introduce a counter that tracks consecutive range delete markers per row. Only switch from SKIP to SEEK after seeing SEEK_ON_DELETE_MARKER_THRESHOLD (default 3) markers, indicating actual accumulation. This preserves skip performance for the common case while still optimizing the accumulation case. --- .../NormalUserScanQueryMatcher.java | 33 +++++++-- .../TestUserScanQueryMatcher.java | 69 +++++++++++++++---- 2 files changed, 82 insertions(+), 20 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java index 9f7f3fd50117..fceead3a15dc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java @@ -32,6 +32,14 @@ @InterfaceAudience.Private public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { + /** + * Number of consecutive range delete markers (DeleteColumn/DeleteFamily) to skip before switching + * to seek. Seeking is more expensive than skipping for a single marker, but much faster when + * markers accumulate. This threshold avoids the seek overhead for the common case (one delete per + * row/column) while still kicking in when markers pile up. + */ + private static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 3; + /** Keeps track of deletes */ private final DeleteTracker deletes; @@ -41,12 +49,20 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { /** whether time range queries can see rows "behind" a delete */ protected final boolean seePastDeleteMarkers; + /** Whether seek optimization for range delete markers is applicable */ + private final boolean canSeekOnDeleteMarker; + + /** Count of consecutive range delete markers seen */ + private int rangeDeleteCount; + protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, ColumnTracker columns, boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long now) { super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now); this.deletes = deletes; this.get = scan.isGetScan(); this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() != KeepDeletedCells.FALSE; + this.canSeekOnDeleteMarker = + !seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class; } @Override @@ -72,20 +88,26 @@ public MatchCode match(ExtendedCell cell) throws IOException { if (includeDeleteMarker) { this.deletes.add(cell); // A DeleteColumn or DeleteFamily masks all remaining cells for this column/family. - // Seek past them instead of skipping one cell at a time. + // Seek past them instead of skipping one cell at a time, but only after seeing + // enough consecutive markers to justify the seek overhead. // Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence // IDs determine visibility), visibility labels (delete/put label mismatch), or // seePastDeleteMarkers (KEEP_DELETED_CELLS). if ( - !seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class - && (typeByte == KeyValue.Type.DeleteColumn.getCode() - || typeByte == KeyValue.Type.DeleteFamily.getCode()) + canSeekOnDeleteMarker && (typeByte == KeyValue.Type.DeleteFamily.getCode() + || (typeByte == KeyValue.Type.DeleteColumn.getCode() && cell.getQualifierLength() > 0)) ) { - return columns.getNextRowOrNextColumn(cell); + if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) { + rangeDeleteCount = 0; + return columns.getNextRowOrNextColumn(cell); + } + } else { + rangeDeleteCount = 0; } } return MatchCode.SKIP; } + rangeDeleteCount = 0; returnCode = checkDeleted(deletes, cell); if (returnCode != null) { return returnCode; @@ -96,6 +118,7 @@ public MatchCode match(ExtendedCell cell) throws IOException { @Override protected void reset() { deletes.reset(); + rangeDeleteCount = 0; } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java index eb97037f666d..bdeb7c52a738 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java @@ -399,34 +399,73 @@ scanWithFilter, new ScanInfo(this.conf, fam2, 0, 5, ttl, KeepDeletedCells.FALSE, } /** - * DeleteColumn and DeleteFamily should return SEEK_NEXT_COL instead of SKIP, so the scanner seeks - * past the entire column rather than advancing one cell at a time. + * After enough consecutive range delete markers, the matcher should switch from SKIP to + * SEEK_NEXT_COL. Point deletes and KEEP_DELETED_CELLS always SKIP. */ @Test public void testSeekOnRangeDelete() throws IOException { - // DeleteColumn: should seek past the column - assertDeleteMatchCode(KeepDeletedCells.FALSE, Type.DeleteColumn, MatchCode.SEEK_NEXT_COL); + // DeleteColumn: first two SKIP, third triggers SEEK_NEXT_COL + assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteColumn, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); - // DeleteFamily: should seek past the column - assertDeleteMatchCode(KeepDeletedCells.FALSE, Type.DeleteFamily, MatchCode.SEEK_NEXT_COL); + // DeleteFamily: same threshold behavior + assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteFamily, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); - // Delete (version): should still SKIP (point delete, not range) - assertDeleteMatchCode(KeepDeletedCells.FALSE, Type.Delete, MatchCode.SKIP); + // Delete (version): always SKIP (point delete, not range) + assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.Delete, MatchCode.SKIP, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP); - // KEEP_DELETED_CELLS=TRUE: should SKIP (masked cells must remain visible) - assertDeleteMatchCode(KeepDeletedCells.TRUE, Type.DeleteColumn, MatchCode.SKIP); + // KEEP_DELETED_CELLS=TRUE: always SKIP + assertDeleteMatchCodes(KeepDeletedCells.TRUE, Type.DeleteColumn, MatchCode.SKIP, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP); } - private void assertDeleteMatchCode(KeepDeletedCells keepDeletedCells, Type type, - MatchCode expected) throws IOException { + /** + * DeleteColumn with empty qualifier must not cause seeking past a subsequent DeleteFamily. + * DeleteFamily masks all columns, so it must be tracked by the delete tracker. + */ + @Test + public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + byte[] e = HConstants.EMPTY_BYTE_ARRAY; + UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, + ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, + now - ttl, now, null); + + // Feed enough DCs with empty qualifier to hit the threshold, then a DF. + // The DF must NOT be seeked past -- it must be SKIP'd so the tracker picks it up. + KeyValue dc1 = new KeyValue(row1, fam1, e, now, Type.DeleteColumn); + KeyValue dc2 = new KeyValue(row1, fam1, e, now - 1, Type.DeleteColumn); + KeyValue dc3 = new KeyValue(row1, fam1, e, now - 2, Type.DeleteColumn); + KeyValue df = new KeyValue(row1, fam1, e, now - 3, Type.DeleteFamily); + KeyValue put = new KeyValue(row1, fam1, col1, now - 3, Type.Put, data); + + qm.setToNewRow(dc1); + assertEquals(MatchCode.SKIP, qm.match(dc1)); + assertEquals(MatchCode.SKIP, qm.match(dc2)); + // Third DC hits threshold -- but since it's empty qualifier, it should NOT seek + // because a DeleteFamily might follow. + assertEquals(MatchCode.SKIP, qm.match(dc3)); + // DF must be processed (SKIP), not seeked past + assertEquals(MatchCode.SKIP, qm.match(df)); + // Put in col1 at t=now-3 should be masked by DF@t=now-3 + MatchCode putCode = qm.match(put); + assertEquals(MatchCode.SEEK_NEXT_COL, putCode); + } + + private void assertDeleteMatchCodes(KeepDeletedCells keepDeletedCells, Type type, + MatchCode... expected) throws IOException { long now = EnvironmentEdgeManager.currentTime(); UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, ttl, keepDeletedCells, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, now, null); boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion; byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1; - KeyValue kv = new KeyValue(row1, fam1, qual, now, type); - qm.setToNewRow(kv); - assertEquals(expected, qm.match(kv)); + qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type)); + for (int i = 0; i < expected.length; i++) { + KeyValue kv = new KeyValue(row1, fam1, qual, now - i, type); + assertEquals("Mismatch at index " + i, expected[i], qm.match(kv)); + } } } From bea7eaad9003f95fd3367ac8a55c9a740bf6ff81 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Fri, 3 Apr 2026 02:23:59 +0900 Subject: [PATCH 3/5] Compare qualifiers to avoid false positive seeks on different columns --- .../NormalUserScanQueryMatcher.java | 17 +++++++++++-- .../TestUserScanQueryMatcher.java | 24 +++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java index fceead3a15dc..894ad0842b07 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java @@ -18,9 +18,11 @@ package org.apache.hadoop.hbase.regionserver.querymatcher; import java.io.IOException; +import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.ExtendedCell; import org.apache.hadoop.hbase.KeepDeletedCells; import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.KeyValueUtil; import org.apache.hadoop.hbase.PrivateCellUtil; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.regionserver.ScanInfo; @@ -52,9 +54,12 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { /** Whether seek optimization for range delete markers is applicable */ private final boolean canSeekOnDeleteMarker; - /** Count of consecutive range delete markers seen */ + /** Count of consecutive range delete markers seen for the same column */ private int rangeDeleteCount; + /** Last range delete cell, for qualifier comparison across consecutive markers */ + private ExtendedCell lastDelete; + protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, ColumnTracker columns, boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long now) { super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now); @@ -69,6 +74,9 @@ protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, ColumnTracker public void beforeShipped() throws IOException { super.beforeShipped(); deletes.beforeShipped(); + if (lastDelete != null) { + lastDelete = KeyValueUtil.toNewKeyCell(lastDelete); + } } @Override @@ -89,7 +97,7 @@ public MatchCode match(ExtendedCell cell) throws IOException { this.deletes.add(cell); // A DeleteColumn or DeleteFamily masks all remaining cells for this column/family. // Seek past them instead of skipping one cell at a time, but only after seeing - // enough consecutive markers to justify the seek overhead. + // enough consecutive markers for the same column to justify the seek overhead. // Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence // IDs determine visibility), visibility labels (delete/put label mismatch), or // seePastDeleteMarkers (KEEP_DELETED_CELLS). @@ -97,6 +105,10 @@ public MatchCode match(ExtendedCell cell) throws IOException { canSeekOnDeleteMarker && (typeByte == KeyValue.Type.DeleteFamily.getCode() || (typeByte == KeyValue.Type.DeleteColumn.getCode() && cell.getQualifierLength() > 0)) ) { + if (lastDelete != null && !CellUtil.matchingQualifier(cell, lastDelete)) { + rangeDeleteCount = 0; + } + lastDelete = cell; if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) { rangeDeleteCount = 0; return columns.getNextRowOrNextColumn(cell); @@ -119,6 +131,7 @@ public MatchCode match(ExtendedCell cell) throws IOException { protected void reset() { deletes.reset(); rangeDeleteCount = 0; + lastDelete = null; } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java index bdeb7c52a738..e7a8f2e5b0fb 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java @@ -454,6 +454,30 @@ public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws IOExc assertEquals(MatchCode.SEEK_NEXT_COL, putCode); } + /** + * DeleteColumn markers for different qualifiers should not accumulate the seek counter. Only + * consecutive markers for the same qualifier should trigger seeking. + */ + @Test + public void testDeleteColumnDifferentQualifiersDoNotSeek() throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, + ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, + now - ttl, now, null); + + // DCs for different qualifiers: counter resets on qualifier change, never seeks + qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn)); + assertEquals(MatchCode.SKIP, qm.match(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn))); + assertEquals(MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, col2, now - 1, Type.DeleteColumn))); + assertEquals(MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, col3, now - 2, Type.DeleteColumn))); + assertEquals(MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, col4, now - 3, Type.DeleteColumn))); + assertEquals(MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, col5, now - 4, Type.DeleteColumn))); + } + private void assertDeleteMatchCodes(KeepDeletedCells keepDeletedCells, Type type, MatchCode... expected) throws IOException { long now = EnvironmentEdgeManager.currentTime(); From 859dc18330c15f15ad1b4f791bf0871793eeb228 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Fri, 3 Apr 2026 11:09:03 +0900 Subject: [PATCH 4/5] Increase seek threshold from 3 to 10 to reduce false positive overhead --- .../NormalUserScanQueryMatcher.java | 2 +- .../TestUserScanQueryMatcher.java | 30 +++++++++---------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java index 894ad0842b07..8d80ceb70bb1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java @@ -40,7 +40,7 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { * markers accumulate. This threshold avoids the seek overhead for the common case (one delete per * row/column) while still kicking in when markers pile up. */ - private static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 3; + private static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 10; /** Keeps track of deletes */ private final DeleteTracker deletes; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java index e7a8f2e5b0fb..4c805c1dd5db 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java @@ -404,13 +404,15 @@ scanWithFilter, new ScanInfo(this.conf, fam2, 0, 5, ttl, KeepDeletedCells.FALSE, */ @Test public void testSeekOnRangeDelete() throws IOException { - // DeleteColumn: first two SKIP, third triggers SEEK_NEXT_COL + // DeleteColumn: first nine SKIP, tenth triggers SEEK_NEXT_COL assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteColumn, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); // DeleteFamily: same threshold behavior assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteFamily, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); // Delete (version): always SKIP (point delete, not range) assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.Delete, MatchCode.SKIP, MatchCode.SKIP, @@ -433,20 +435,16 @@ public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws IOExc ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, now, null); - // Feed enough DCs with empty qualifier to hit the threshold, then a DF. + // Feed DCs with empty qualifier past the threshold, then a DF. // The DF must NOT be seeked past -- it must be SKIP'd so the tracker picks it up. - KeyValue dc1 = new KeyValue(row1, fam1, e, now, Type.DeleteColumn); - KeyValue dc2 = new KeyValue(row1, fam1, e, now - 1, Type.DeleteColumn); - KeyValue dc3 = new KeyValue(row1, fam1, e, now - 2, Type.DeleteColumn); - KeyValue df = new KeyValue(row1, fam1, e, now - 3, Type.DeleteFamily); - KeyValue put = new KeyValue(row1, fam1, col1, now - 3, Type.Put, data); - - qm.setToNewRow(dc1); - assertEquals(MatchCode.SKIP, qm.match(dc1)); - assertEquals(MatchCode.SKIP, qm.match(dc2)); - // Third DC hits threshold -- but since it's empty qualifier, it should NOT seek - // because a DeleteFamily might follow. - assertEquals(MatchCode.SKIP, qm.match(dc3)); + qm.setToNewRow(new KeyValue(row1, fam1, e, now, Type.DeleteColumn)); + for (int i = 0; i < 11; i++) { + // Empty qualifier DCs should never trigger seek, regardless of threshold + assertEquals("DC at i=" + i, MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, e, now - i, Type.DeleteColumn))); + } + KeyValue df = new KeyValue(row1, fam1, e, now - 11, Type.DeleteFamily); + KeyValue put = new KeyValue(row1, fam1, col1, now - 11, Type.Put, data); // DF must be processed (SKIP), not seeked past assertEquals(MatchCode.SKIP, qm.match(df)); // Put in col1 at t=now-3 should be masked by DF@t=now-3 From 5b8afdc0fe9d096d1145ee0d63b2fa4507e5e5a8 Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Sun, 5 Apr 2026 11:14:53 +0900 Subject: [PATCH 5/5] Move redundancy counting out of 'includeDeleteMarker' condition --- .../NormalUserScanQueryMatcher.java | 41 +++++---- .../TestUserScanQueryMatcher.java | 91 ++++++++++++++----- 2 files changed, 90 insertions(+), 42 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java index 8d80ceb70bb1..2dd5594475bf 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java @@ -40,7 +40,7 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { * markers accumulate. This threshold avoids the seek overhead for the common case (one delete per * row/column) while still kicking in when markers pile up. */ - private static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 10; + static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 10; /** Keeps track of deletes */ private final DeleteTracker deletes; @@ -95,27 +95,28 @@ public MatchCode match(ExtendedCell cell) throws IOException { seePastDeleteMarkers ? tr.withinTimeRange(timestamp) : tr.withinOrAfterTimeRange(timestamp); if (includeDeleteMarker) { this.deletes.add(cell); - // A DeleteColumn or DeleteFamily masks all remaining cells for this column/family. - // Seek past them instead of skipping one cell at a time, but only after seeing - // enough consecutive markers for the same column to justify the seek overhead. - // Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence - // IDs determine visibility), visibility labels (delete/put label mismatch), or - // seePastDeleteMarkers (KEEP_DELETED_CELLS). - if ( - canSeekOnDeleteMarker && (typeByte == KeyValue.Type.DeleteFamily.getCode() - || (typeByte == KeyValue.Type.DeleteColumn.getCode() && cell.getQualifierLength() > 0)) - ) { - if (lastDelete != null && !CellUtil.matchingQualifier(cell, lastDelete)) { - rangeDeleteCount = 0; - } - lastDelete = cell; - if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) { - rangeDeleteCount = 0; - return columns.getNextRowOrNextColumn(cell); - } - } else { + } + + // A DeleteColumn or DeleteFamily masks all remaining cells for this column/family. + // Seek past them instead of skipping one cell at a time, but only after seeing + // enough consecutive markers for the same column to justify the seek overhead. + // Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence + // IDs determine visibility), visibility labels (delete/put label mismatch), or + // seePastDeleteMarkers (KEEP_DELETED_CELLS). + if ( + canSeekOnDeleteMarker && (typeByte == KeyValue.Type.DeleteFamily.getCode() + || (typeByte == KeyValue.Type.DeleteColumn.getCode() && cell.getQualifierLength() > 0)) + ) { + if (lastDelete != null && !CellUtil.matchingQualifier(cell, lastDelete)) { rangeDeleteCount = 0; } + lastDelete = cell; + if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) { + rangeDeleteCount = 0; + return columns.getNextRowOrNextColumn(cell); + } + } else { + rangeDeleteCount = 0; } return MatchCode.SKIP; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java index 4c805c1dd5db..36dbd213bbe2 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java @@ -404,23 +404,19 @@ scanWithFilter, new ScanInfo(this.conf, fam2, 0, 5, ttl, KeepDeletedCells.FALSE, */ @Test public void testSeekOnRangeDelete() throws IOException { - // DeleteColumn: first nine SKIP, tenth triggers SEEK_NEXT_COL - assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteColumn, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); + int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD; + + // DeleteColumn: first N-1 SKIP, N-th triggers SEEK_NEXT_COL + assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteColumn, n); // DeleteFamily: same threshold behavior - assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteFamily, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); + assertSeekAfterThreshold(KeepDeletedCells.FALSE, Type.DeleteFamily, n); // Delete (version): always SKIP (point delete, not range) - assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.Delete, MatchCode.SKIP, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP); + assertAllSkip(KeepDeletedCells.FALSE, Type.Delete, n + 1); // KEEP_DELETED_CELLS=TRUE: always SKIP - assertDeleteMatchCodes(KeepDeletedCells.TRUE, Type.DeleteColumn, MatchCode.SKIP, MatchCode.SKIP, - MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP); + assertAllSkip(KeepDeletedCells.TRUE, Type.DeleteColumn, n + 1); } /** @@ -435,16 +431,17 @@ public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws IOExc ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, now, null); + int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD; // Feed DCs with empty qualifier past the threshold, then a DF. // The DF must NOT be seeked past -- it must be SKIP'd so the tracker picks it up. qm.setToNewRow(new KeyValue(row1, fam1, e, now, Type.DeleteColumn)); - for (int i = 0; i < 11; i++) { + for (int i = 0; i < n + 1; i++) { // Empty qualifier DCs should never trigger seek, regardless of threshold assertEquals("DC at i=" + i, MatchCode.SKIP, qm.match(new KeyValue(row1, fam1, e, now - i, Type.DeleteColumn))); } - KeyValue df = new KeyValue(row1, fam1, e, now - 11, Type.DeleteFamily); - KeyValue put = new KeyValue(row1, fam1, col1, now - 11, Type.Put, data); + KeyValue df = new KeyValue(row1, fam1, e, now - n - 1, Type.DeleteFamily); + KeyValue put = new KeyValue(row1, fam1, col1, now - n - 1, Type.Put, data); // DF must be processed (SKIP), not seeked past assertEquals(MatchCode.SKIP, qm.match(df)); // Put in col1 at t=now-3 should be masked by DF@t=now-3 @@ -476,18 +473,68 @@ public void testDeleteColumnDifferentQualifiersDoNotSeek() throws IOException { qm.match(new KeyValue(row1, fam1, col5, now - 4, Type.DeleteColumn))); } - private void assertDeleteMatchCodes(KeepDeletedCells keepDeletedCells, Type type, - MatchCode... expected) throws IOException { + /** + * Delete markers outside the scan's time range (includeDeleteMarker=false) should still + * accumulate the seek counter and trigger SEEK_NEXT_COL after the threshold. + */ + @Test + public void testSeekOnRangeDeleteOutsideTimeRange() throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + long futureTs = now + 1_000_000; + Scan scanWithTimeRange = new Scan(scan).setTimeRange(futureTs, Long.MAX_VALUE); + + UserScanQueryMatcher qm = UserScanQueryMatcher.create(scanWithTimeRange, + new ScanInfo(this.conf, fam1, 0, 1, ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, + 0, rowComparator, false), + null, now - ttl, now, null); + + int n = NormalUserScanQueryMatcher.SEEK_ON_DELETE_MARKER_THRESHOLD; + qm.setToNewRow(new KeyValue(row1, fam1, col1, now, Type.DeleteColumn)); + // All DCs have timestamps below the time range, so includeDeleteMarker is false. + // The seek counter should still accumulate. + for (int i = 0; i < n - 1; i++) { + assertEquals("DC at i=" + i, MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, col1, now - i, Type.DeleteColumn))); + } + assertEquals(MatchCode.SEEK_NEXT_COL, + qm.match(new KeyValue(row1, fam1, col1, now - n + 1, Type.DeleteColumn))); + } + + private UserScanQueryMatcher createDeleteMatcher(KeepDeletedCells keepDeletedCells) + throws IOException { long now = EnvironmentEdgeManager.currentTime(); - UserScanQueryMatcher qm = - UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, ttl, keepDeletedCells, - HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, now, null); + return UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, ttl, + keepDeletedCells, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, + now, null); + } + + /** First n-1 markers SKIP, n-th triggers SEEK_NEXT_COL. */ + private void assertSeekAfterThreshold(KeepDeletedCells keepDeletedCells, Type type, int n) + throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells); boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion; byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1; qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type)); - for (int i = 0; i < expected.length; i++) { - KeyValue kv = new KeyValue(row1, fam1, qual, now - i, type); - assertEquals("Mismatch at index " + i, expected[i], qm.match(kv)); + for (int i = 0; i < n - 1; i++) { + assertEquals("Mismatch at index " + i, MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, qual, now - i, type))); + } + assertEquals("Expected SEEK_NEXT_COL at index " + (n - 1), MatchCode.SEEK_NEXT_COL, + qm.match(new KeyValue(row1, fam1, qual, now - n + 1, type))); + } + + /** All markers should SKIP regardless of count. */ + private void assertAllSkip(KeepDeletedCells keepDeletedCells, Type type, int count) + throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + UserScanQueryMatcher qm = createDeleteMatcher(keepDeletedCells); + boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion; + byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1; + qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type)); + for (int i = 0; i < count; i++) { + assertEquals("Mismatch at index " + i, MatchCode.SKIP, + qm.match(new KeyValue(row1, fam1, qual, now - i, type))); } } }