diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppConfigNodeConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppConfigNodeConfig.java index 4385097ce045b..cfca2502cb38a 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppConfigNodeConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/config/MppConfigNodeConfig.java @@ -73,4 +73,18 @@ public ConfigNodeConfig setLeaderDistributionPolicy(String policy) { properties.setProperty("leader_distribution_policy", policy); return this; } + + @Override + public ConfigNodeConfig setConsistencyCheckSchedulerInitialDelayInMs(long initialDelayInMs) { + properties.setProperty( + "consistency_check_scheduler_initial_delay_in_ms", String.valueOf(initialDelayInMs)); + return this; + } + + @Override + public ConfigNodeConfig setConsistencyCheckSchedulerIntervalInMs(long intervalInMs) { + properties.setProperty( + "consistency_check_scheduler_interval_in_ms", String.valueOf(intervalInMs)); + return this; + } } diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteConfigNodeConfig.java b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteConfigNodeConfig.java index c16722f4bd943..387020363cefb 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteConfigNodeConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/remote/config/RemoteConfigNodeConfig.java @@ -43,4 +43,14 @@ public ConfigNodeConfig setMetricPrometheusReporterPassword(String password) { public ConfigNodeConfig setLeaderDistributionPolicy(String policy) { return this; } + + @Override + public ConfigNodeConfig setConsistencyCheckSchedulerInitialDelayInMs(long initialDelayInMs) { + return this; + } + + @Override + public ConfigNodeConfig setConsistencyCheckSchedulerIntervalInMs(long intervalInMs) { + return this; + } } diff --git 
a/integration-test/src/main/java/org/apache/iotdb/itbase/env/ConfigNodeConfig.java b/integration-test/src/main/java/org/apache/iotdb/itbase/env/ConfigNodeConfig.java index 4af35f6f56a9d..60431ddba13f0 100644 --- a/integration-test/src/main/java/org/apache/iotdb/itbase/env/ConfigNodeConfig.java +++ b/integration-test/src/main/java/org/apache/iotdb/itbase/env/ConfigNodeConfig.java @@ -31,4 +31,8 @@ public interface ConfigNodeConfig { ConfigNodeConfig setMetricPrometheusReporterPassword(String password); ConfigNodeConfig setLeaderDistributionPolicy(String policy); + + ConfigNodeConfig setConsistencyCheckSchedulerInitialDelayInMs(long initialDelayInMs); + + ConfigNodeConfig setConsistencyCheckSchedulerIntervalInMs(long intervalInMs); } diff --git a/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/IoTDBIoTConsensusV23C3DBasicITBase.java b/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/IoTDBIoTConsensusV23C3DBasicITBase.java index 9544ac5cf2b31..711cec4ddc3ae 100644 --- a/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/IoTDBIoTConsensusV23C3DBasicITBase.java +++ b/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/IoTDBIoTConsensusV23C3DBasicITBase.java @@ -19,12 +19,38 @@ package org.apache.iotdb.db.it.iotconsensusv2; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.client.ClientPoolFactory; +import org.apache.iotdb.commons.client.IClientManager; +import org.apache.iotdb.commons.client.property.ThriftClientProperty; +import org.apache.iotdb.commons.client.sync.SyncConfigNodeIServiceClient; +import org.apache.iotdb.commons.client.sync.SyncDataNodeInternalServiceClient; import org.apache.iotdb.confignode.it.regionmigration.IoTDBRegionOperationReliabilityITFramework; +import 
org.apache.iotdb.confignode.rpc.thrift.TTriggerRegionConsistencyRepairReq; import org.apache.iotdb.consensus.ConsensusFactory; +import org.apache.iotdb.db.storageengine.dataregion.modification.ModificationFile; +import org.apache.iotdb.db.storageengine.dataregion.modification.v1.ModificationFileV1; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; import org.apache.iotdb.isession.SessionConfig; import org.apache.iotdb.it.env.EnvFactory; import org.apache.iotdb.it.env.cluster.node.DataNodeWrapper; import org.apache.iotdb.itbase.env.BaseEnv; +import org.apache.iotdb.mpp.rpc.thrift.TApplyLogicalRepairBatchReq; +import org.apache.iotdb.mpp.rpc.thrift.TDataNodeHeartbeatReq; +import org.apache.iotdb.mpp.rpc.thrift.TFinishLogicalRepairSessionReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeResp; +import org.apache.iotdb.mpp.rpc.thrift.TLogicalRepairBatch; +import org.apache.iotdb.mpp.rpc.thrift.TLogicalRepairLeafSelector; +import org.apache.iotdb.mpp.rpc.thrift.TPartitionConsistencyEligibility; +import org.apache.iotdb.mpp.rpc.thrift.TSnapshotSubtreeNode; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairReq; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairResp; import org.apache.tsfile.utils.Pair; import org.awaitility.Awaitility; @@ -33,15 +59,23 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.sql.Connection; import java.sql.ResultSet; +import java.sql.SQLException; import java.sql.Statement; +import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; +import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; 
import java.util.concurrent.TimeUnit; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.stream.Stream; import static org.apache.iotdb.util.MagicUtils.makeItCloseQuietly; @@ -65,9 +99,19 @@ public abstract class IoTDBIoTConsensusV23C3DBasicITBase protected static final int DATA_REPLICATION_FACTOR = 2; protected static final int SCHEMA_REPLICATION_FACTOR = 3; + private static final IClientManager + DATA_NODE_INTERNAL_CLIENT_MANAGER = + new IClientManager.Factory() + .createClientManager( + new ClientPoolFactory.SyncDataNodeInternalServiceClientPoolFactory()); + /** Timeout in seconds for 3C3D cluster init. */ protected static final int CLUSTER_INIT_TIMEOUT_SECONDS = 300; + protected static final long TIME_PARTITION_INTERVAL = 100L; + protected static final long CONSISTENCY_CHECK_INITIAL_DELAY_MS = 1_000L; + protected static final long CONSISTENCY_CHECK_INTERVAL_MS = 1_000L; + protected static final String INSERTION1 = "INSERT INTO root.sg.d1(timestamp,speed,temperature) values(100, 1, 2)"; protected static final String INSERTION2 = @@ -75,7 +119,12 @@ public abstract class IoTDBIoTConsensusV23C3DBasicITBase protected static final String INSERTION3 = "INSERT INTO root.sg.d1(timestamp,speed,temperature) values(102, 5, 6)"; protected static final String FLUSH_COMMAND = "flush on cluster"; + protected static final String LOCAL_FLUSH_COMMAND = "flush on local"; protected static final String COUNT_QUERY = "select count(*) from root.sg.**"; + protected static final String DELETE_SPEED_UP_TO_101 = + "DELETE FROM root.sg.d1.speed WHERE time <= 101"; + protected static final String COUNT_AFTER_DELETE_QUERY = + "select count(speed), count(temperature) from root.sg.d1"; protected static final String SELECT_ALL_QUERY = "select speed, temperature from root.sg.d1"; /** @@ -93,6 +142,7 @@ public void setUp() throws Exception { .getCommonConfig() .setDataReplicationFactor(DATA_REPLICATION_FACTOR) .setSchemaReplicationFactor(SCHEMA_REPLICATION_FACTOR) 
+ .setTimePartitionInterval(TIME_PARTITION_INTERVAL) .setIoTConsensusV2Mode(getIoTConsensusV2Mode()); EnvFactory.getEnv() @@ -100,6 +150,12 @@ public void setUp() throws Exception { .getDataNodeConfig() .setMetricReporterType(Collections.singletonList("PROMETHEUS")); + EnvFactory.getEnv() + .getConfig() + .getConfigNodeConfig() + .setConsistencyCheckSchedulerInitialDelayInMs(CONSISTENCY_CHECK_INITIAL_DELAY_MS) + .setConsistencyCheckSchedulerIntervalInMs(CONSISTENCY_CHECK_INTERVAL_MS); + EnvFactory.getEnv() .initClusterEnvironment(CONFIG_NODE_NUM, DATA_NODE_NUM, CLUSTER_INIT_TIMEOUT_SECONDS); } @@ -108,17 +164,33 @@ public void test3C3DWriteFlushAndQuery() throws Exception { try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); Statement statement = makeItCloseQuietly(connection.createStatement())) { - LOGGER.info("Writing data to 3C3D cluster (mode: {})...", getIoTConsensusV2Mode()); - statement.execute(INSERTION1); - statement.execute(INSERTION2); - statement.execute(INSERTION3); + insertAndFlushTestData(statement); + verifyDataConsistency(statement); - LOGGER.info("Executing flush on cluster..."); - statement.execute(FLUSH_COMMAND); + LOGGER.info("3C3D IoTConsensusV2 {} basic test passed", getIoTConsensusV2Mode()); + } + } + /** + * Test that a follower can observe the same logical view after the leader reports replication + * catch-up. 
+ */ + public void testFollowerCanReadConsistentDataAfterCatchUp() throws Exception { + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + + insertAndFlushTestData(statement); verifyDataConsistency(statement); - LOGGER.info("3C3D IoTConsensusV2 {} basic test passed", getIoTConsensusV2Mode()); + RegionReplicaSelection regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + + LOGGER.info( + "Verifying logical view from follower DataNode {} for region {} after catch-up", + regionReplicaSelection.followerDataNodeId, + regionReplicaSelection.regionId); + verifyDataConsistencyOnNode(regionReplicaSelection.followerNode); } } @@ -127,125 +199,601 @@ public void test3C3DWriteFlushAndQuery() throws Exception { * follower serves the same data. */ public void testReplicaConsistencyAfterLeaderStop() throws Exception { + RegionReplicaSelection regionReplicaSelection; + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + + insertAndFlushTestData(statement); + verifyDataConsistency(statement); + + regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + } + + LOGGER.info( + "Stopping leader DataNode {} (region {}) for replica consistency test", + regionReplicaSelection.leaderDataNodeId, + regionReplicaSelection.regionId); + + regionReplicaSelection.leaderNode.stopForcibly(); + Assert.assertFalse("Leader should be stopped", regionReplicaSelection.leaderNode.isAlive()); + + LOGGER.info( + "Waiting for follower DataNode {} to be elected as new leader and verifying replica consistency...", + regionReplicaSelection.followerDataNodeId); + 
Awaitility.await() + .pollDelay(2, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted(() -> verifyDataConsistencyOnNode(regionReplicaSelection.followerNode)); + + LOGGER.info( + "Replica consistency verified: follower has same data as former leader after failover"); + } + + /** + * Test replica consistency for a delete path: after deletion is replicated, stopping the leader + * must not change the surviving logical view. + */ + public void testReplicaConsistencyAfterDeleteAndLeaderStop() throws Exception { + RegionReplicaSelection regionReplicaSelection; try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); Statement statement = makeItCloseQuietly(connection.createStatement())) { - LOGGER.info("Writing data to 3C3D cluster (mode: {})...", getIoTConsensusV2Mode()); - statement.execute(INSERTION1); - statement.execute(INSERTION2); - statement.execute(INSERTION3); + insertAndFlushTestData(statement); + regionReplicaSelection = selectReplicatedDataRegion(statement); + + LOGGER.info( + "Deleting replicated data on leader DataNode {} for region {}", + regionReplicaSelection.leaderDataNodeId, + regionReplicaSelection.regionId); + statement.execute(DELETE_SPEED_UP_TO_101); statement.execute(FLUSH_COMMAND); - verifyDataConsistency(statement); + verifyPostDeleteConsistency(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + verifyPostDeleteConsistencyOnNode(regionReplicaSelection.followerNode); + } - Map>> dataRegionMap = - getDataRegionMapWithLeader(statement); - - int targetRegionId = -1; - int leaderDataNodeId = -1; - int followerDataNodeId = -1; - for (Map.Entry>> entry : dataRegionMap.entrySet()) { - Pair> leaderAndReplicas = entry.getValue(); - if (leaderAndReplicas.getRight().size() > 1 - && leaderAndReplicas.getRight().size() <= DATA_REPLICATION_FACTOR - && leaderAndReplicas.getLeft() > 0) { - targetRegionId = entry.getKey(); - leaderDataNodeId = 
leaderAndReplicas.getLeft(); - final int lambdaLeaderDataNodeId = leaderDataNodeId; - followerDataNodeId = - leaderAndReplicas.getRight().stream() - .filter(i -> i != lambdaLeaderDataNodeId) - .findAny() - .orElse(-1); - break; - } + LOGGER.info( + "Stopping leader DataNode {} after replicated delete", + regionReplicaSelection.leaderDataNodeId); + regionReplicaSelection.leaderNode.stopForcibly(); + Assert.assertFalse("Leader should be stopped", regionReplicaSelection.leaderNode.isAlive()); + + Awaitility.await() + .pollDelay(2, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> verifyPostDeleteConsistencyOnNode(regionReplicaSelection.followerNode)); + + LOGGER.info( + "Replica consistency verified after delete and failover on follower DataNode {}", + regionReplicaSelection.followerDataNodeId); + } + + /** + * Background consistency check should skip hot partitions before flush and only verify them after + * they become cold and safe. + */ + public void testBackgroundConsistencyCheckOnlyRunsOnColdPartitions() throws Exception { + RegionReplicaSelection regionReplicaSelection; + long partitionId = timePartitionId(100L); + + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertPartitionData(statement, 100L); + regionReplicaSelection = selectReplicatedDataRegion(statement); + + TimeUnit.MILLISECONDS.sleep( + CONSISTENCY_CHECK_INITIAL_DELAY_MS + CONSISTENCY_CHECK_INTERVAL_MS * 2); + assertRepairProgressEmpty(regionReplicaSelection.regionId); + + statement.execute(FLUSH_COMMAND); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + } + + RepairProgressRow row = + waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + Assert.assertEquals("IDLE", row.repairState); + } + + /** + * Restarting a follower should allow the background checker to rebuild its logical 
snapshot and + * keep the partition verified after the node rejoins. + */ + public void testBackgroundConsistencyCheckRebuildsLogicalSnapshotAfterFollowerRestart() + throws Exception { + RegionReplicaSelection regionReplicaSelection; + long partitionId = timePartitionId(100L); + + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertAndFlushTestData(statement); + regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + } + + RepairProgressRow previousRow = + getRepairProgressRow(regionReplicaSelection.regionId, partitionId); + Assert.assertNotNull(previousRow); + + regionReplicaSelection.followerNode.stopForcibly(); + Assert.assertFalse("Follower should be stopped", regionReplicaSelection.followerNode.isAlive()); + + regionReplicaSelection.followerNode.start(); + waitForNodeConnectionReady(regionReplicaSelection.followerNode); + waitForProgressRefresh(regionReplicaSelection.regionId, partitionId, previousRow.lastCheckedAt); + RepairProgressRow refreshedRow = + waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + Assert.assertEquals("READY", refreshedRow.snapshotState); + Assert.assertTrue(refreshedRow.snapshotEpoch >= previousRow.snapshotEpoch); + } + + /** + * Background check must not advance progress while the leader reports non-zero sync lag. Once the + * lagging follower catches up again, the same partition can be checked and verified in a new + * round. 
+ */ + public void testBackgroundConsistencyCheckWaitsForSyncLagToClear() throws Exception { + RegionReplicaSelection regionReplicaSelection; + long partitionId = timePartitionId(100L); + RepairProgressRow baselineRow; + + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertAndFlushTestData(statement); + regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + baselineRow = waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + } + + regionReplicaSelection.followerNode.stopForcibly(); + Assert.assertFalse("Follower should be stopped", regionReplicaSelection.followerNode.isAlive()); + + try (Connection connection = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + regionReplicaSelection.leaderNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT)); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertPartitionData(statement, 103L); + if (shouldFlushLaggingWritesBeforeFollowerRestart()) { + statement.execute(LOCAL_FLUSH_COMMAND); } + } - Assert.assertTrue( - "Should find a data region with leader for root.sg", - targetRegionId > 0 && leaderDataNodeId > 0 && followerDataNodeId > 0); + waitForReplicationLag(regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + assertDataPointCountOnNode(regionReplicaSelection.leaderNode, 12L); - DataNodeWrapper leaderNode = - EnvFactory.getEnv() - .dataNodeIdToWrapper(leaderDataNodeId) - .orElseThrow(() -> new AssertionError("DataNode not found in cluster")); + TimeUnit.MILLISECONDS.sleep(CONSISTENCY_CHECK_INTERVAL_MS * 3); + RepairProgressRow laggingRow = + getRepairProgressRow( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId, partitionId); + 
Assert.assertNotNull(laggingRow); + Assert.assertEquals( + "Background checker should keep the last verified result while syncLag > 0", + "VERIFIED", + laggingRow.checkState); + Assert.assertEquals( + "Background checker should not advance last_checked_at while syncLag > 0", + baselineRow.lastCheckedAt, + laggingRow.lastCheckedAt); + Assert.assertEquals( + "Background checker should not persist a new mutation epoch while syncLag > 0", + baselineRow.partitionMutationEpoch, + laggingRow.partitionMutationEpoch); + Assert.assertEquals( + "Background checker should not persist a new snapshot epoch while syncLag > 0", + baselineRow.snapshotEpoch, + laggingRow.snapshotEpoch); - waitForReplicationComplete(leaderNode); + regionReplicaSelection.followerNode.start(); + waitForNodeConnectionReady(regionReplicaSelection.followerNode); + waitForReplicationComplete(regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + if (!shouldFlushLaggingWritesBeforeFollowerRestart()) { + localFlushOnNode(regionReplicaSelection.leaderNode); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + } + waitForProgressRefresh(regionReplicaSelection.regionId, partitionId, baselineRow.lastCheckedAt); + RepairProgressRow refreshedRow = + waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + Assert.assertTrue( + "Expected a new mutation epoch after the lagging writes are replicated", + refreshedRow.partitionMutationEpoch > baselineRow.partitionMutationEpoch); + Assert.assertTrue( + "Expected a rebuilt snapshot epoch after sync lag is cleared", + refreshedRow.snapshotEpoch > baselineRow.snapshotEpoch); + assertDataPointCountOnNode(regionReplicaSelection.followerNode, 12L); + } - LOGGER.info( - "Stopping leader DataNode {} (region {}) for replica consistency test", - leaderDataNodeId, - targetRegionId); + /** + * Verified progress should remain queryable after the current ConfigNode leader is restarted. 
+ * This validates that progress is durably persisted rather than kept only in the live leader's + * memory. + */ + public void testRepairProgressSurvivesConfigNodeLeaderRestart() throws Exception { + RegionReplicaSelection regionReplicaSelection; + long partitionId = timePartitionId(100L); + RepairProgressRow baselineRow; + + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertAndFlushTestData(statement); + regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + baselineRow = waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + } + + int leaderConfigNodeIndex = EnvFactory.getEnv().getLeaderConfigNodeIndex(); + EnvFactory.getEnv().getConfigNodeWrapperList().get(leaderConfigNodeIndex).stopForcibly(); + + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> { + RepairProgressRow row = + getRepairProgressRow(regionReplicaSelection.regionId, partitionId); + Assert.assertNotNull(row); + Assert.assertEquals("VERIFIED", row.checkState); + Assert.assertTrue(row.lastCheckedAt >= baselineRow.lastCheckedAt); + Assert.assertTrue(row.snapshotEpoch >= baselineRow.snapshotEpoch); + }); + + EnvFactory.getEnv().getConfigNodeWrapperList().get(leaderConfigNodeIndex).start(); + + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> { + RepairProgressRow row = + getRepairProgressRow(regionReplicaSelection.regionId, partitionId); + Assert.assertNotNull(row); + Assert.assertEquals("VERIFIED", row.checkState); + Assert.assertTrue(row.lastCheckedAt >= baselineRow.lastCheckedAt); + Assert.assertTrue(row.snapshotEpoch >= baselineRow.snapshotEpoch); + }); + 
} + + /** + * Background check should only mark mismatches. Manual repair should then consume only the + * mismatched partition scope and restore the follower. + */ + public void testReplicaConsistencyRepairAfterFollowerLosesSealedTsFile() throws Exception { + RegionReplicaSelection regionReplicaSelection; + long firstPartitionId = timePartitionId(100L); + long secondPartitionId = timePartitionId(200L); + Path deletedTsFile; + + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertAndFlushPartitionData(statement, 100L); + insertAndFlushPartitionData(statement, 200L); + + regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + waitForCheckState(regionReplicaSelection.regionId, firstPartitionId, "VERIFIED"); + waitForCheckState(regionReplicaSelection.regionId, secondPartitionId, "VERIFIED"); + deletedTsFile = + findLatestSealedTsFile( + regionReplicaSelection.followerNode, + regionReplicaSelection.regionId, + secondPartitionId); + } + + regionReplicaSelection.followerNode.stopForcibly(); + Assert.assertFalse("Follower should be stopped", regionReplicaSelection.followerNode.isAlive()); + deleteTsFileArtifacts(deletedTsFile); + + regionReplicaSelection.followerNode.start(); + waitForNodeConnectionReady(regionReplicaSelection.followerNode); + waitForCheckState(regionReplicaSelection.regionId, secondPartitionId, "MISMATCH"); + assertPartitionViewMismatch(regionReplicaSelection); + assertDataPointCountOnNode(regionReplicaSelection.followerNode, 6L); + + RepairProgressRow firstPartition = + getRepairProgressRow(regionReplicaSelection.regionId, firstPartitionId); + RepairProgressRow secondPartition = + getRepairProgressRow(regionReplicaSelection.regionId, secondPartitionId); + Assert.assertNotNull(firstPartition); + 
Assert.assertNotNull(secondPartition); + Assert.assertEquals("VERIFIED", firstPartition.checkState); + Assert.assertEquals("MISMATCH", secondPartition.checkState); + + TimeUnit.MILLISECONDS.sleep(CONSISTENCY_CHECK_INTERVAL_MS * 3); + Assert.assertEquals( + "Background checker should stay check-only and not auto-repair mismatches", + "MISMATCH", + getRepairProgressRow(regionReplicaSelection.regionId, secondPartitionId).checkState); + assertDataPointCountOnNode(regionReplicaSelection.followerNode, 6L); + + triggerRegionConsistencyRepair(regionReplicaSelection.regionId); + + waitForCheckState(regionReplicaSelection.regionId, secondPartitionId, "VERIFIED"); + assertPartitionViewMatched(regionReplicaSelection); + assertDataPointCountOnNode(regionReplicaSelection.followerNode, 12L); + + LOGGER.info( + "Stopping leader DataNode {} after repair to verify repaired follower serves local data", + regionReplicaSelection.leaderDataNodeId); + regionReplicaSelection.leaderNode.stopForcibly(); + Assert.assertFalse("Leader should be stopped", regionReplicaSelection.leaderNode.isAlive()); + + Awaitility.await() + .pollDelay(2, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted(() -> assertDataPointCountOnNode(regionReplicaSelection.followerNode, 12L)); + } - leaderNode.stopForcibly(); - Assert.assertFalse("Leader should be stopped", leaderNode.isAlive()); + /** + * The follower-side staged logical repair journal should survive a DataNode restart. This covers + * the data-plane recovery path where batches are durably staged first and only applied during the + * later finish step. The test uses an idempotent logical repair stream derived from the leader's + * current logical snapshot so the assertion stays focused on repair-session durability rather + * than on mismatch injection mechanics. 
+ */ + public void testLogicalRepairSessionSurvivesFollowerRestart() throws Exception { + RegionReplicaSelection regionReplicaSelection; + long firstPartitionId = timePartitionId(100L); + long partitionId = timePartitionId(200L); + + try (Connection connection = makeItCloseQuietly(EnvFactory.getEnv().getConnection()); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + insertAndFlushPartitionData(statement, 100L); + insertAndFlushPartitionData(statement, 200L); + + regionReplicaSelection = selectReplicatedDataRegion(statement); + waitForReplicationComplete( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + waitForCheckState(regionReplicaSelection.regionId, firstPartitionId, "VERIFIED"); + waitForCheckState(regionReplicaSelection.regionId, partitionId, "VERIFIED"); + } + + TPartitionConsistencyEligibility leaderPartition = + getPartitionEligibility( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId, partitionId); + Assert.assertEquals("READY", leaderPartition.getSnapshotState()); + + List leafSelectors = + getPartitionLeafSelectors( + regionReplicaSelection.leaderNode, + regionReplicaSelection.regionId, + partitionId, + leaderPartition.getSnapshotEpoch(), + "LIVE"); + Assert.assertFalse( + "Idempotent logical repair should still stream at least one live leaf", + leafSelectors.isEmpty()); + leafSelectors = Collections.singletonList(leafSelectors.get(0)); + + String repairEpoch = + buildManualRepairEpoch( + regionReplicaSelection.leaderDataNodeId, partitionId, leaderPartition); + TStreamLogicalRepairResp repairResp = + streamLogicalRepair( + regionReplicaSelection.leaderNode, + regionReplicaSelection.regionId, + partitionId, + repairEpoch, + leafSelectors); + Assert.assertEquals( + "Streaming logical repair should succeed", 200, repairResp.getStatus().getCode()); + Assert.assertFalse("Logical repair stream should not be stale", repairResp.isStale()); + Assert.assertTrue( + "Logical repair stream 
should contain batches", + repairResp.isSetBatches() && !repairResp.getBatches().isEmpty()); + + String sessionId = + stageLogicalRepairBatches( + regionReplicaSelection.followerNode, + regionReplicaSelection.regionId, + partitionId, + repairEpoch, + repairResp.getBatches()); + Path sessionJournalPath = + logicalRepairSessionPath(regionReplicaSelection.followerNode, sessionId); + Assert.assertTrue( + "Staged logical repair session should be persisted before restart", + Files.exists(sessionJournalPath)); + + regionReplicaSelection.followerNode.stopForcibly(); + Assert.assertFalse( + "Follower should stop between stage and finish", + regionReplicaSelection.followerNode.isAlive()); + regionReplicaSelection.followerNode.start(); + waitForNodeConnectionReady(regionReplicaSelection.followerNode); + waitForReplicationComplete(regionReplicaSelection.leaderNode, regionReplicaSelection.regionId); + Assert.assertTrue( + "Staged logical repair session should survive follower restart", + Files.exists(sessionJournalPath)); + + finishLogicalRepairSession( + regionReplicaSelection.followerNode, + regionReplicaSelection.regionId, + partitionId, + repairEpoch, + sessionId); + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(1, TimeUnit.MINUTES) + .until(() -> !Files.exists(sessionJournalPath)); + + assertDataPointCountOnNode(regionReplicaSelection.followerNode, 12L); + Assert.assertEquals( + "Logical repair should not regress the already-verified partition state", + "VERIFIED", + getRepairProgressRow(regionReplicaSelection.regionId, partitionId).checkState); + } + + protected void insertAndFlushTestData(Statement statement) throws Exception { + insertAndFlushPartitionData(statement, 100L); + } + + protected void insertAndFlushPartitionData(Statement statement, long baseTimestamp) + throws Exception { + insertPartitionData(statement, baseTimestamp); + statement.execute(FLUSH_COMMAND); + } + + protected void 
insertPartitionData(Statement statement, long baseTimestamp) throws Exception { + LOGGER.info( + "Writing partition-scoped data at baseTimestamp={} to 3C3D cluster (mode: {})...", + baseTimestamp, + getIoTConsensusV2Mode()); + statement.execute(insertSql(baseTimestamp, 1L, 2L)); + statement.execute(insertSql(baseTimestamp + 1, 3L, 4L)); + statement.execute(insertSql(baseTimestamp + 2, 5L, 6L)); + } + + protected boolean shouldFlushLaggingWritesBeforeFollowerRestart() { + return ConsensusFactory.IOT_CONSENSUS_V2_BATCH_MODE.equals(getIoTConsensusV2Mode()); + } + + protected void localFlushOnNode(DataNodeWrapper targetNode) throws Exception { + try (Connection connection = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT)); + Statement statement = makeItCloseQuietly(connection.createStatement())) { + statement.execute(LOCAL_FLUSH_COMMAND); + } + } + + protected RegionReplicaSelection selectReplicatedDataRegion(Statement statement) + throws Exception { + Map>> dataRegionMap = getDataRegionMapWithLeader(statement); + + for (Map.Entry>> entry : dataRegionMap.entrySet()) { + Pair> leaderAndReplicas = entry.getValue(); + if (leaderAndReplicas.getLeft() <= 0 || leaderAndReplicas.getRight().size() <= 1) { + continue; + } + int leaderDataNodeId = leaderAndReplicas.getLeft(); + int followerDataNodeId = + leaderAndReplicas.getRight().stream() + .filter(dataNodeId -> dataNodeId != leaderDataNodeId) + .findFirst() + .orElse(-1); + if (followerDataNodeId <= 0) { + continue; + } + + DataNodeWrapper leaderNode = + EnvFactory.getEnv() + .dataNodeIdToWrapper(leaderDataNodeId) + .orElseThrow(() -> new AssertionError("Leader DataNode not found in cluster")); DataNodeWrapper followerNode = EnvFactory.getEnv() .dataNodeIdToWrapper(followerDataNodeId) .orElseThrow(() -> new AssertionError("Follower DataNode not found in cluster")); - LOGGER.info( - "Waiting for follower 
DataNode {} to be elected as new leader and verifying replica consistency...", - followerDataNodeId); - Awaitility.await() - .pollDelay(2, TimeUnit.SECONDS) - .atMost(2, TimeUnit.MINUTES) - .untilAsserted( - () -> { - try (Connection followerConn = - makeItCloseQuietly( - EnvFactory.getEnv() - .getConnection( - followerNode, - SessionConfig.DEFAULT_USER, - SessionConfig.DEFAULT_PASSWORD, - BaseEnv.TREE_SQL_DIALECT)); - Statement followerStmt = makeItCloseQuietly(followerConn.createStatement())) { - verifyDataConsistency(followerStmt); - } - }); + return new RegionReplicaSelection( + entry.getKey(), leaderDataNodeId, followerDataNodeId, leaderNode, followerNode); + } - LOGGER.info( - "Replica consistency verified: follower has same data as former leader after failover"); + Assert.fail("Should find a replicated data region with a leader for root.sg"); + throw new AssertionError("unreachable"); + } + + protected void verifyDataConsistencyOnNode(DataNodeWrapper targetNode) throws Exception { + try (Connection targetConnection = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT)); + Statement targetStatement = makeItCloseQuietly(targetConnection.createStatement())) { + verifyDataConsistency(targetStatement); } } - private static final Pattern SYNC_LAG_PATTERN = - Pattern.compile("iot_consensus_v2\\{[^}]*type=\"syncLag\"[^}]*}\\s+(\\S+)"); + protected void verifyPostDeleteConsistencyOnNode(DataNodeWrapper targetNode) throws Exception { + try (Connection targetConnection = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT)); + Statement targetStatement = makeItCloseQuietly(targetConnection.createStatement())) { + verifyPostDeleteConsistency(targetStatement); + } + } /** - * Wait until all consensus pipe syncLag metrics on the given leader 
DataNode reach 0, meaning - * replication is fully caught up. Queries the leader's Prometheus metrics endpoint periodically. + * Wait until the target region's eligibility reports syncLag == 0, matching the production + * scheduler's gating logic. */ - protected void waitForReplicationComplete(DataNodeWrapper leaderNode) { + protected void waitForReplicationComplete(DataNodeWrapper leaderNode, int regionId) { final long timeoutSeconds = 120; - final String metricsUrl = - "http://" + leaderNode.getIp() + ":" + leaderNode.getMetricPort() + "/metrics"; LOGGER.info( - "Waiting for consensus pipe syncLag to reach 0 on leader DataNode (url: {}, timeout: {}s)...", - metricsUrl, + "Waiting for region {} syncLag to reach 0 on leader DataNode {} (timeout: {}s)...", + regionId, + leaderNode.getId(), timeoutSeconds); Awaitility.await() .pollInterval(500, TimeUnit.MILLISECONDS) .atMost(timeoutSeconds, TimeUnit.SECONDS) .untilAsserted( () -> { - String metricsContent = EnvFactory.getEnv().getUrlContent(metricsUrl, null); - Assert.assertNotNull( - "Failed to fetch metrics from leader DataNode at " + metricsUrl, metricsContent); - Matcher matcher = SYNC_LAG_PATTERN.matcher(metricsContent); - boolean found = false; - while (matcher.find()) { - found = true; - double syncLag = Double.parseDouble(matcher.group(1)); - LOGGER.debug("Found syncLag metric value: {}", syncLag); - Assert.assertEquals( - "Consensus pipe syncLag should be 0.0 but was " + syncLag, 0.0, syncLag, 0.001); - } + long syncLag = getConsistencyEligibility(leaderNode, regionId).getSyncLag(); + LOGGER.debug( + "Observed region {} syncLag={} on leader DataNode {}", + regionId, + syncLag, + leaderNode.getId()); + Assert.assertEquals( + "Region " + regionId + " syncLag should be 0 but was " + syncLag, 0L, syncLag); + }); + LOGGER.info("Region {} syncLag == 0 on leader, replication is complete", regionId); + } + + protected void waitForReplicationLag(DataNodeWrapper leaderNode, int regionId) { + final long timeoutSeconds 
= 60; + LOGGER.info( + "Waiting for region {} syncLag to become positive on leader DataNode {} (timeout: {}s)...", + regionId, + leaderNode.getId(), + timeoutSeconds); + Awaitility.await() + .pollInterval(500, TimeUnit.MILLISECONDS) + .atMost(timeoutSeconds, TimeUnit.SECONDS) + .untilAsserted( + () -> { + long syncLag = getConsistencyEligibility(leaderNode, regionId).getSyncLag(); + LOGGER.debug( + "Observed region {} syncLag={} while waiting for lag on leader DataNode {}", + regionId, + syncLag, + leaderNode.getId()); Assert.assertTrue( - "No iot_consensus_v2 syncLag metric found in leader DataNode metrics", found); + "Expected region " + regionId + " syncLag > 0 while follower is lagging", + syncLag > 0L); }); - LOGGER.info("All consensus pipe syncLag == 0 on leader, replication is complete"); + LOGGER.info("Observed region {} syncLag > 0 on leader while follower is lagging", regionId); } protected void verifyDataConsistency(Statement statement) throws Exception { @@ -284,6 +832,153 @@ protected void verifyDataConsistency(Statement statement) throws Exception { Assert.assertEquals("Expected 3 rows from select *", 3, rowCount); } + protected void verifyDataPointCount(Statement statement, long expectedTotalCount) + throws Exception { + try (ResultSet countResult = statement.executeQuery(COUNT_QUERY)) { + Assert.assertTrue("Count query should return results", countResult.next()); + + int columnCount = countResult.getMetaData().getColumnCount(); + long totalCount = 0; + for (int i = 1; i <= columnCount; i++) { + totalCount += parseLongFromString(countResult.getString(i)); + } + Assert.assertEquals("Unexpected total data point count", expectedTotalCount, totalCount); + } + } + + protected void verifyPostDeleteConsistency(Statement statement) throws Exception { + LOGGER.info("Querying data to verify replicated delete success..."); + try (ResultSet countResult = statement.executeQuery(COUNT_AFTER_DELETE_QUERY)) { + Assert.assertTrue("Delete count query should return 
results", countResult.next()); + Assert.assertEquals( + "Expected only one surviving speed value after delete", + 1, + parseLongFromString(countResult.getString(1))); + Assert.assertEquals( + "Expected all temperature values to remain after delete", + 3, + parseLongFromString(countResult.getString(2))); + } + + int rowCount = 0; + try (ResultSet selectResult = statement.executeQuery(SELECT_ALL_QUERY)) { + while (selectResult.next()) { + rowCount++; + long timestamp = parseLongFromString(selectResult.getString(1)); + String speed = selectResult.getString(2); + long temperature = parseLongFromString(selectResult.getString(3)); + if (timestamp == 100) { + assertNullValue(speed); + Assert.assertEquals(2, temperature); + } else if (timestamp == 101) { + assertNullValue(speed); + Assert.assertEquals(4, temperature); + } else if (timestamp == 102) { + Assert.assertEquals(5, parseLongFromString(speed)); + Assert.assertEquals(6, temperature); + } else { + Assert.fail("Unexpected timestamp after delete: " + timestamp); + } + } + } + Assert.assertEquals("Expected 3 logical rows from select after delete", 3, rowCount); + } + + protected void assertDataInconsistentOnNode(DataNodeWrapper targetNode) throws Exception { + try (Connection targetConnection = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT)); + Statement targetStatement = makeItCloseQuietly(targetConnection.createStatement())) { + try { + verifyDataConsistency(targetStatement); + Assert.fail("Expected inconsistent data on DataNode " + targetNode.getId()); + } catch (AssertionError expected) { + LOGGER.info("Observed expected inconsistency on DataNode {}", targetNode.getId()); + } + } + } + + protected void assertDataPointCountOnNode(DataNodeWrapper targetNode, long expectedTotalCount) + throws Exception { + try (Connection targetConnection = + makeItCloseQuietly( + EnvFactory.getEnv() + 
.getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT)); + Statement targetStatement = makeItCloseQuietly(targetConnection.createStatement())) { + verifyDataPointCount(targetStatement, expectedTotalCount); + } + } + + protected void waitForNodeConnectionReady(DataNodeWrapper targetNode) { + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> { + try (Connection ignored = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TREE_SQL_DIALECT))) { + Assert.assertNotNull( + "Expected a JDBC connection for DataNode " + targetNode.getId(), ignored); + } catch (SQLException e) { + throw new AssertionError( + "DataNode " + targetNode.getId() + " is not accepting JDBC connections yet", e); + } + }); + waitForInternalRpcReady(targetNode); + } + + private void waitForInternalRpcReady(DataNodeWrapper targetNode) { + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> { + try (SyncDataNodeInternalServiceClient client = + DATA_NODE_INTERNAL_CLIENT_MANAGER.borrowClient( + new TEndPoint( + targetNode.getInternalAddress(), targetNode.getInternalPort()))) { + TDataNodeHeartbeatReq heartbeatReq = new TDataNodeHeartbeatReq(); + heartbeatReq.setHeartbeatTimestamp(System.nanoTime()); + heartbeatReq.setNeedJudgeLeader(false); + heartbeatReq.setNeedSamplingLoad(false); + heartbeatReq.setTimeSeriesQuotaRemain(0L); + heartbeatReq.setLogicalClock(0L); + Assert.assertNotNull( + "Expected an internal heartbeat response for DataNode " + targetNode.getId(), + client.getDataNodeHeartBeat(heartbeatReq)); + } catch (Exception e) { + throw new AssertionError( + "DataNode " + + targetNode.getId() + + " is not accepting internal RPC connections yet", + 
e); + } + }); + } + + protected static void assertNullValue(String value) { + Assert.assertTrue( + "Expected deleted value to be null, but was " + value, + value == null || "null".equalsIgnoreCase(value)); + } + /** Parse long from IoTDB result string (handles both "1" and "1.0" formats). */ protected static long parseLongFromString(String s) { if (s == null || s.isEmpty()) { @@ -295,4 +990,525 @@ protected static long parseLongFromString(String s) { return (long) Double.parseDouble(s); } } + + protected static final class RegionReplicaSelection { + private final int regionId; + private final int leaderDataNodeId; + private final int followerDataNodeId; + private final DataNodeWrapper leaderNode; + private final DataNodeWrapper followerNode; + + private RegionReplicaSelection( + int regionId, + int leaderDataNodeId, + int followerDataNodeId, + DataNodeWrapper leaderNode, + DataNodeWrapper followerNode) { + this.regionId = regionId; + this.leaderDataNodeId = leaderDataNodeId; + this.followerDataNodeId = followerDataNodeId; + this.leaderNode = leaderNode; + this.followerNode = followerNode; + } + } + + private Path findLatestSealedTsFile(DataNodeWrapper dataNodeWrapper, int regionId) + throws Exception { + return findLatestSealedTsFile(dataNodeWrapper, regionId, null); + } + + private Path findLatestSealedTsFile( + DataNodeWrapper dataNodeWrapper, int regionId, Long timePartitionId) throws Exception { + try (Stream tsFiles = Files.walk(Paths.get(dataNodeWrapper.getDataPath()))) { + Optional candidate = + tsFiles + .filter(Files::isRegularFile) + .filter(path -> path.getFileName().toString().endsWith(".tsfile")) + .filter(path -> path.toString().contains(File.separator + "root.sg" + File.separator)) + .filter(path -> belongsToRegion(path, regionId)) + .filter( + path -> timePartitionId == null || belongsToTimePartition(path, timePartitionId)) + .max( + (left, right) -> + Long.compare(left.toFile().lastModified(), right.toFile().lastModified())); + if 
(candidate.isPresent()) { + return candidate.get(); + } + } + throw new AssertionError( + "No sealed TsFile found for region " + + regionId + + (timePartitionId == null ? "" : (" partition " + timePartitionId))); + } + + private boolean belongsToRegion(Path tsFile, int regionId) { + Path timePartitionDir = tsFile.getParent(); + Path regionDir = timePartitionDir == null ? null : timePartitionDir.getParent(); + return regionDir != null && String.valueOf(regionId).equals(regionDir.getFileName().toString()); + } + + private boolean belongsToTimePartition(Path tsFile, long timePartitionId) { + Path timePartitionDir = tsFile.getParent(); + return timePartitionDir != null + && String.valueOf(timePartitionId).equals(timePartitionDir.getFileName().toString()); + } + + private void deleteTsFileArtifacts(Path tsFile) throws Exception { + Files.deleteIfExists(tsFile); + Files.deleteIfExists(Paths.get(tsFile.toString() + TsFileResource.RESOURCE_SUFFIX)); + Files.deleteIfExists(Paths.get(tsFile.toString() + ModificationFile.FILE_SUFFIX)); + Files.deleteIfExists(Paths.get(tsFile.toString() + ModificationFile.COMPACTION_FILE_SUFFIX)); + Files.deleteIfExists(Paths.get(tsFile.toString() + ModificationFileV1.FILE_SUFFIX)); + Files.deleteIfExists(Paths.get(tsFile.toString() + ModificationFileV1.COMPACTION_FILE_SUFFIX)); + } + + private void triggerRegionConsistencyRepair(int regionId) throws Exception { + TConsensusGroupId consensusGroupId = + new TConsensusGroupId(TConsensusGroupType.DataRegion, regionId); + try (SyncConfigNodeIServiceClient client = + (SyncConfigNodeIServiceClient) EnvFactory.getEnv().getLeaderConfigNodeConnection()) { + TSStatus status = + client.triggerRegionConsistencyRepair( + new TTriggerRegionConsistencyRepairReq(consensusGroupId)); + Assert.assertEquals("Replica consistency repair should succeed", 200, status.getCode()); + } + } + + private void assertPartitionViewMismatch(RegionReplicaSelection regionReplicaSelection) + throws Exception { + 
Assert.assertNotEquals( + "Expected leader and follower logical snapshot roots to differ before repair", + partitionSnapshotSignature( + getConsistencyEligibility( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId)), + partitionSnapshotSignature( + getConsistencyEligibility( + regionReplicaSelection.followerNode, regionReplicaSelection.regionId))); + } + + private void assertPartitionViewMatched(RegionReplicaSelection regionReplicaSelection) + throws Exception { + Assert.assertEquals( + "Expected leader and follower logical snapshot roots to match after repair", + partitionSnapshotSignature( + getConsistencyEligibility( + regionReplicaSelection.leaderNode, regionReplicaSelection.regionId)), + partitionSnapshotSignature( + getConsistencyEligibility( + regionReplicaSelection.followerNode, regionReplicaSelection.regionId))); + } + + private Path logicalRepairSessionPath(DataNodeWrapper dataNodeWrapper, String sessionId) { + return Paths.get( + dataNodeWrapper.getSystemDir(), "consistency-repair", "sessions", sessionId + ".session"); + } + + private TGetConsistencyEligibilityResp getConsistencyEligibility( + DataNodeWrapper dataNodeWrapper, int regionId) throws Exception { + try (SyncDataNodeInternalServiceClient client = + DATA_NODE_INTERNAL_CLIENT_MANAGER.borrowClient( + new TEndPoint( + dataNodeWrapper.getInternalAddress(), dataNodeWrapper.getInternalPort()))) { + TGetConsistencyEligibilityResp response = + client.getConsistencyEligibility( + new TGetConsistencyEligibilityReq( + new TConsensusGroupId(TConsensusGroupType.DataRegion, regionId))); + Assert.assertEquals( + "Consistency eligibility RPC should succeed on DataNode " + dataNodeWrapper.getId(), + 200, + response.getStatus().getCode()); + return response; + } + } + + private TStreamLogicalRepairResp streamLogicalRepair( + DataNodeWrapper dataNodeWrapper, + int regionId, + long timePartitionId, + String repairEpoch, + List leafSelectors) + throws Exception { + try 
(SyncDataNodeInternalServiceClient client = + DATA_NODE_INTERNAL_CLIENT_MANAGER.borrowClient( + new TEndPoint( + dataNodeWrapper.getInternalAddress(), dataNodeWrapper.getInternalPort()))) { + return client.streamLogicalRepair( + new TStreamLogicalRepairReq( + new TConsensusGroupId(TConsensusGroupType.DataRegion, regionId), + timePartitionId, + repairEpoch, + leafSelectors)); + } + } + + private TPartitionConsistencyEligibility getPartitionEligibility( + DataNodeWrapper dataNodeWrapper, int regionId, long timePartitionId) throws Exception { + return getConsistencyEligibility(dataNodeWrapper, regionId).getPartitions().stream() + .filter(partition -> partition.getTimePartitionId() == timePartitionId) + .findFirst() + .orElseThrow( + () -> + new AssertionError( + "Partition " + + timePartitionId + + " is missing from consistency eligibility on DataNode " + + dataNodeWrapper.getId())); + } + + private List getPartitionLeafSelectors( + DataNodeWrapper dataNodeWrapper, + int regionId, + long timePartitionId, + long snapshotEpoch, + String treeKind) + throws Exception { + List shardNodes = + getSnapshotSubtreeNodes( + dataNodeWrapper, + regionId, + timePartitionId, + snapshotEpoch, + treeKind, + Collections.singletonList("root")); + List shardHandles = new ArrayList<>(); + for (TSnapshotSubtreeNode shardNode : shardNodes) { + if (!shardNode.isLeaf()) { + shardHandles.add(shardNode.getNodeHandle()); + } + } + if (shardHandles.isEmpty()) { + return Collections.emptyList(); + } + + List leafNodes = + getSnapshotSubtreeNodes( + dataNodeWrapper, regionId, timePartitionId, snapshotEpoch, treeKind, shardHandles); + leafNodes.sort(Comparator.comparing(TSnapshotSubtreeNode::getLeafId)); + List selectors = new ArrayList<>(); + for (TSnapshotSubtreeNode leafNode : leafNodes) { + if (!leafNode.isLeaf()) { + continue; + } + selectors.add(new TLogicalRepairLeafSelector(treeKind, leafNode.getLeafId())); + } + return selectors; + } + + private List getSnapshotSubtreeNodes( + 
DataNodeWrapper dataNodeWrapper, + int regionId, + long timePartitionId, + long snapshotEpoch, + String treeKind, + List nodeHandles) + throws Exception { + try (SyncDataNodeInternalServiceClient client = + DATA_NODE_INTERNAL_CLIENT_MANAGER.borrowClient( + new TEndPoint( + dataNodeWrapper.getInternalAddress(), dataNodeWrapper.getInternalPort()))) { + TGetSnapshotSubtreeResp response = + client.getSnapshotSubtree( + new TGetSnapshotSubtreeReq( + new TConsensusGroupId(TConsensusGroupType.DataRegion, regionId), + timePartitionId, + snapshotEpoch, + treeKind, + nodeHandles)); + Assert.assertEquals( + "Snapshot subtree RPC should succeed on DataNode " + dataNodeWrapper.getId(), + 200, + response.getStatus().getCode()); + Assert.assertFalse( + "Snapshot subtree should stay valid for partition " + timePartitionId, + response.isStale()); + return response.isSetNodes() ? response.getNodes() : Collections.emptyList(); + } + } + + private String buildManualRepairEpoch( + int leaderDataNodeId, + long timePartitionId, + TPartitionConsistencyEligibility partitionEligibility) { + return leaderDataNodeId + + ":" + + timePartitionId + + ":0:" + + partitionEligibility.getSnapshotEpoch() + + ":" + + partitionEligibility.getPartitionMutationEpoch() + + ":manual-it"; + } + + private String stageLogicalRepairBatches( + DataNodeWrapper followerNode, + int regionId, + long timePartitionId, + String repairEpoch, + List batches) + throws Exception { + String sessionId = null; + for (TLogicalRepairBatch batch : batches) { + if (sessionId == null) { + sessionId = batch.getSessionId(); + } else { + Assert.assertEquals( + "All staged batches should belong to one repair session", + sessionId, + batch.getSessionId()); + } + try (SyncDataNodeInternalServiceClient client = + DATA_NODE_INTERNAL_CLIENT_MANAGER.borrowClient( + new TEndPoint(followerNode.getInternalAddress(), followerNode.getInternalPort()))) { + TSStatus status = + client.applyLogicalRepairBatch( + new TApplyLogicalRepairBatchReq( + 
new TConsensusGroupId(TConsensusGroupType.DataRegion, regionId), + timePartitionId, + repairEpoch, + batch.getSessionId(), + batch.getTreeKind(), + batch.getLeafId(), + batch.getSeqNo(), + batch.getBatchKind(), + batch.bufferForPayload())); + Assert.assertEquals("Staging logical repair batch should succeed", 200, status.getCode()); + } + } + Assert.assertNotNull("Repair stream should have a session id", sessionId); + return sessionId; + } + + private void finishLogicalRepairSession( + DataNodeWrapper followerNode, + int regionId, + long timePartitionId, + String repairEpoch, + String sessionId) + throws Exception { + TEndPoint endPoint = + new TEndPoint(followerNode.getInternalAddress(), followerNode.getInternalPort()); + SyncDataNodeInternalServiceClient client = + new SyncDataNodeInternalServiceClient( + new ThriftClientProperty.Builder() + .setConnectionTimeoutMs((int) TimeUnit.MINUTES.toMillis(5)) + .build(), + endPoint, + null); + try { + TSStatus status = + client.finishLogicalRepairSession( + new TFinishLogicalRepairSessionReq( + new TConsensusGroupId(TConsensusGroupType.DataRegion, regionId), + timePartitionId, + repairEpoch, + sessionId)); + Assert.assertEquals("Finishing logical repair session should succeed", 200, status.getCode()); + } finally { + client.getInputProtocol().getTransport().close(); + } + } + + private List partitionSnapshotSignature( + TGetConsistencyEligibilityResp eligibilityResponse) { + if (eligibilityResponse == null || !eligibilityResponse.isSetPartitions()) { + return Collections.emptyList(); + } + + List partitionSignatures = new ArrayList<>(); + for (TPartitionConsistencyEligibility partition : eligibilityResponse.getPartitions()) { + partitionSignatures.add( + partition.getTimePartitionId() + + "|mutation=" + + partition.getPartitionMutationEpoch() + + "|snapshot=" + + partition.getSnapshotEpoch() + + "|state=" + + partition.getSnapshotState() + + "|live=" + + partition.getLiveRootXorHash() + + ":" + + 
partition.getLiveRootAddHash() + + "|tombstone=" + + partition.getTombstoneRootXorHash() + + ":" + + partition.getTombstoneRootAddHash()); + } + Collections.sort(partitionSignatures); + return partitionSignatures; + } + + private void assertRepairProgressEmpty(int regionId) throws Exception { + Assert.assertTrue( + "Expected no repair progress rows yet for region " + regionId, + getRepairProgressRows(regionId).isEmpty()); + } + + private RepairProgressRow waitForCheckState(int regionId, long timePartitionId, String checkState) + throws Exception { + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> { + RepairProgressRow row = getRepairProgressRow(regionId, timePartitionId); + Assert.assertNotNull( + "Expected repair progress row for region " + + regionId + + " partition " + + timePartitionId, + row); + Assert.assertEquals(checkState, row.checkState); + }); + return getRepairProgressRow(regionId, timePartitionId); + } + + private void waitForProgressRefresh(int regionId, long timePartitionId, long previousCheckedAt) + throws Exception { + Awaitility.await() + .pollDelay(1, TimeUnit.SECONDS) + .pollInterval(1, TimeUnit.SECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> { + RepairProgressRow row = getRepairProgressRow(regionId, timePartitionId); + Assert.assertNotNull(row); + Assert.assertTrue( + "Expected last_checked_at to advance after a new background round", + row.lastCheckedAt > previousCheckedAt); + }); + } + + private RepairProgressRow getRepairProgressRow(int regionId, long timePartitionId) + throws Exception { + return getRepairProgressRows(regionId).stream() + .filter(row -> row.timePartition == timePartitionId) + .findFirst() + .orElse(null); + } + + private RepairProgressRow getRepairProgressRow( + DataNodeWrapper targetNode, int regionId, long timePartitionId) throws Exception { + return getRepairProgressRows(targetNode, regionId).stream() + 
.filter(row -> row.timePartition == timePartitionId) + .findFirst() + .orElse(null); + } + + private List getRepairProgressRows(int regionId) throws Exception { + return getRepairProgressRows(selectReadableDataNode(), regionId); + } + + private List getRepairProgressRows(DataNodeWrapper targetNode, int regionId) + throws Exception { + try (Connection connection = + makeItCloseQuietly( + EnvFactory.getEnv() + .getConnection( + targetNode, + SessionConfig.DEFAULT_USER, + SessionConfig.DEFAULT_PASSWORD, + BaseEnv.TABLE_SQL_DIALECT)); + Statement statement = makeItCloseQuietly(connection.createStatement()); + ResultSet resultSet = + statement.executeQuery( + "select region_id, time_partition, check_state, repair_state, last_checked_at, " + + "last_safe_watermark, partition_mutation_epoch, snapshot_epoch, snapshot_state, " + + "last_mismatch_at, mismatch_scope_ref, mismatch_leaf_count, repair_epoch, " + + "last_error_code, last_error_message from information_schema.repair_progress " + + "where region_id = " + + regionId + + " order by time_partition")) { + List rows = new ArrayList<>(); + while (resultSet.next()) { + rows.add( + new RepairProgressRow( + resultSet.getInt(1), + resultSet.getLong(2), + resultSet.getString(3), + resultSet.getString(4), + resultSet.getLong(5), + resultSet.getLong(6), + resultSet.getLong(7), + resultSet.getLong(8), + resultSet.getString(9), + resultSet.getLong(10), + resultSet.getString(11), + resultSet.getInt(12), + resultSet.getString(13), + resultSet.getString(14), + resultSet.getString(15))); + } + rows.sort(Comparator.comparingLong(row -> row.timePartition)); + return rows; + } + } + + private DataNodeWrapper selectReadableDataNode() { + return EnvFactory.getEnv().getDataNodeWrapperList().stream() + .filter(DataNodeWrapper::isAlive) + .findFirst() + .orElseThrow(() -> new IllegalStateException("No alive DataNode is available for query")); + } + + private long timePartitionId(long timestamp) { + return Math.floorDiv(timestamp, 
TIME_PARTITION_INTERVAL); + } + + private String insertSql(long timestamp, long speed, long temperature) { + return String.format( + "INSERT INTO root.sg.d1(timestamp,speed,temperature) values(%d, %d, %d)", + timestamp, speed, temperature); + } + + private static final class RepairProgressRow { + private final int regionId; + private final long timePartition; + private final String checkState; + private final String repairState; + private final long lastCheckedAt; + private final long lastSafeWatermark; + private final long partitionMutationEpoch; + private final long snapshotEpoch; + private final String snapshotState; + private final long lastMismatchAt; + private final String mismatchScopeRef; + private final int mismatchLeafCount; + private final String repairEpoch; + private final String lastErrorCode; + private final String lastErrorMessage; + + private RepairProgressRow( + int regionId, + long timePartition, + String checkState, + String repairState, + long lastCheckedAt, + long lastSafeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + String snapshotState, + long lastMismatchAt, + String mismatchScopeRef, + int mismatchLeafCount, + String repairEpoch, + String lastErrorCode, + String lastErrorMessage) { + this.regionId = regionId; + this.timePartition = timePartition; + this.checkState = checkState; + this.repairState = repairState; + this.lastCheckedAt = lastCheckedAt; + this.lastSafeWatermark = lastSafeWatermark; + this.partitionMutationEpoch = partitionMutationEpoch; + this.snapshotEpoch = snapshotEpoch; + this.snapshotState = snapshotState; + this.lastMismatchAt = lastMismatchAt; + this.mismatchScopeRef = mismatchScopeRef; + this.mismatchLeafCount = mismatchLeafCount; + this.repairEpoch = repairEpoch; + this.lastErrorCode = lastErrorCode; + this.lastErrorMessage = lastErrorMessage; + } + } } diff --git a/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/batch/IoTDBIoTConsensusV2Batch3C3DBasicIT.java 
b/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/batch/IoTDBIoTConsensusV2Batch3C3DBasicIT.java index f71462fa470af..6ca3391e35400 100644 --- a/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/batch/IoTDBIoTConsensusV2Batch3C3DBasicIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/batch/IoTDBIoTConsensusV2Batch3C3DBasicIT.java @@ -44,6 +44,55 @@ public void testReplicaConsistencyAfterLeaderStop() throws Exception { super.testReplicaConsistencyAfterLeaderStop(); } + @Override + @Test + public void testFollowerCanReadConsistentDataAfterCatchUp() throws Exception { + super.testFollowerCanReadConsistentDataAfterCatchUp(); + } + + @Override + @Test + public void testReplicaConsistencyAfterDeleteAndLeaderStop() throws Exception { + super.testReplicaConsistencyAfterDeleteAndLeaderStop(); + } + + @Override + @Test + public void testReplicaConsistencyRepairAfterFollowerLosesSealedTsFile() throws Exception { + super.testReplicaConsistencyRepairAfterFollowerLosesSealedTsFile(); + } + + @Override + @Test + public void testLogicalRepairSessionSurvivesFollowerRestart() throws Exception { + super.testLogicalRepairSessionSurvivesFollowerRestart(); + } + + @Override + @Test + public void testBackgroundConsistencyCheckOnlyRunsOnColdPartitions() throws Exception { + super.testBackgroundConsistencyCheckOnlyRunsOnColdPartitions(); + } + + @Override + @Test + public void testBackgroundConsistencyCheckRebuildsLogicalSnapshotAfterFollowerRestart() + throws Exception { + super.testBackgroundConsistencyCheckRebuildsLogicalSnapshotAfterFollowerRestart(); + } + + @Override + @Test + public void testBackgroundConsistencyCheckWaitsForSyncLagToClear() throws Exception { + super.testBackgroundConsistencyCheckWaitsForSyncLagToClear(); + } + + @Override + @Test + public void testRepairProgressSurvivesConfigNodeLeaderRestart() throws Exception { + super.testRepairProgressSurvivesConfigNodeLeaderRestart(); + } + @Override 
@Test public void test3C3DWriteFlushAndQuery() throws Exception { diff --git a/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/stream/IoTDBIoTConsensusV2Stream3C3DBasicIT.java b/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/stream/IoTDBIoTConsensusV2Stream3C3DBasicIT.java index 856d3624bf189..0d06c71596c77 100644 --- a/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/stream/IoTDBIoTConsensusV2Stream3C3DBasicIT.java +++ b/integration-test/src/test/java/org/apache/iotdb/db/it/iotconsensusv2/stream/IoTDBIoTConsensusV2Stream3C3DBasicIT.java @@ -44,6 +44,55 @@ public void testReplicaConsistencyAfterLeaderStop() throws Exception { super.testReplicaConsistencyAfterLeaderStop(); } + @Override + @Test + public void testFollowerCanReadConsistentDataAfterCatchUp() throws Exception { + super.testFollowerCanReadConsistentDataAfterCatchUp(); + } + + @Override + @Test + public void testReplicaConsistencyAfterDeleteAndLeaderStop() throws Exception { + super.testReplicaConsistencyAfterDeleteAndLeaderStop(); + } + + @Override + @Test + public void testReplicaConsistencyRepairAfterFollowerLosesSealedTsFile() throws Exception { + super.testReplicaConsistencyRepairAfterFollowerLosesSealedTsFile(); + } + + @Override + @Test + public void testLogicalRepairSessionSurvivesFollowerRestart() throws Exception { + super.testLogicalRepairSessionSurvivesFollowerRestart(); + } + + @Override + @Test + public void testBackgroundConsistencyCheckOnlyRunsOnColdPartitions() throws Exception { + super.testBackgroundConsistencyCheckOnlyRunsOnColdPartitions(); + } + + @Override + @Test + public void testBackgroundConsistencyCheckRebuildsLogicalSnapshotAfterFollowerRestart() + throws Exception { + super.testBackgroundConsistencyCheckRebuildsLogicalSnapshotAfterFollowerRestart(); + } + + @Override + @Test + public void testBackgroundConsistencyCheckWaitsForSyncLagToClear() throws Exception { + 
super.testBackgroundConsistencyCheckWaitsForSyncLagToClear(); + } + + @Override + @Test + public void testRepairProgressSurvivesConfigNodeLeaderRestart() throws Exception { + super.testRepairProgressSurvivesConfigNodeLeaderRestart(); + } + @Override @Test public void test3C3DWriteFlushAndQuery() throws Exception { diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java index 4055398ddb7ec..3841bb97b2f4d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java @@ -36,6 +36,13 @@ public enum CnToDnSyncRequestType { REMOVE_REGION_PEER, DELETE_OLD_REGION_PEER, RESET_PEER_LIST, + GET_CONSISTENCY_ELIGIBILITY, + GET_SNAPSHOT_SUBTREE, + ESTIMATE_LEAF_DIFF, + DECODE_LEAF_DIFF, + STREAM_LOGICAL_REPAIR, + APPLY_LOGICAL_REPAIR_BATCH, + FINISH_LOGICAL_REPAIR_SESSION, // PartitionCache INVALIDATE_PARTITION_CACHE, diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java index d63d5a74f6095..bd6a391420071 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java @@ -28,10 +28,16 @@ import org.apache.iotdb.commons.client.exception.ClientManagerException; import org.apache.iotdb.commons.client.sync.SyncDataNodeInternalServiceClient; import org.apache.iotdb.commons.exception.UncheckedStartupException; +import org.apache.iotdb.mpp.rpc.thrift.TApplyLogicalRepairBatchReq; import 
org.apache.iotdb.mpp.rpc.thrift.TCleanDataNodeCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TCreateDataRegionReq; import org.apache.iotdb.mpp.rpc.thrift.TCreatePeerReq; import org.apache.iotdb.mpp.rpc.thrift.TCreateSchemaRegionReq; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TFinishLogicalRepairSessionReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidatePermissionCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TKillQueryInstanceReq; @@ -39,6 +45,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TRegionLeaderChangeReq; import org.apache.iotdb.mpp.rpc.thrift.TRegionLeaderChangeResp; import org.apache.iotdb.mpp.rpc.thrift.TResetPeerListReq; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairReq; import org.apache.iotdb.mpp.rpc.thrift.TUpdateTableReq; import org.apache.iotdb.mpp.rpc.thrift.TUpdateTemplateReq; import org.apache.iotdb.rpc.TSStatusCode; @@ -98,6 +105,27 @@ private void buildActionMap() { actionMapBuilder.put( CnToDnSyncRequestType.DELETE_REGION, (req, client) -> client.deleteRegion((TConsensusGroupId) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.GET_CONSISTENCY_ELIGIBILITY, + (req, client) -> client.getConsistencyEligibility((TGetConsistencyEligibilityReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.GET_SNAPSHOT_SUBTREE, + (req, client) -> client.getSnapshotSubtree((TGetSnapshotSubtreeReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.ESTIMATE_LEAF_DIFF, + (req, client) -> client.estimateLeafDiff((TEstimateLeafDiffReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.DECODE_LEAF_DIFF, + (req, client) -> client.decodeLeafDiff((TDecodeLeafDiffReq) req)); + actionMapBuilder.put( + 
CnToDnSyncRequestType.STREAM_LOGICAL_REPAIR, + (req, client) -> client.streamLogicalRepair((TStreamLogicalRepairReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.APPLY_LOGICAL_REPAIR_BATCH, + (req, client) -> client.applyLogicalRepairBatch((TApplyLogicalRepairBatchReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.FINISH_LOGICAL_REPAIR_SESSION, + (req, client) -> client.finishLogicalRepairSession((TFinishLogicalRepairSessionReq) req)); actionMapBuilder.put( CnToDnSyncRequestType.INVALIDATE_PERMISSION_CACHE, (req, client) -> client.invalidatePermissionCache((TInvalidatePermissionCacheReq) req)); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index 3abb322d08472..f51033ea67cde 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -317,6 +317,9 @@ public class ConfigNodeConfig { /** The getOrCreatePartitionTable interface will log new created Partition if set true. 
*/ private boolean isEnablePrintingNewlyCreatedPartition = false; + private long consistencyCheckSchedulerInitialDelayInMs = 60_000L; + private long consistencyCheckSchedulerIntervalInMs = 900_000L; + private long forceWalPeriodForConfigNodeSimpleInMs = 100; public ConfigNodeConfig() { @@ -1174,6 +1177,23 @@ public void setEnablePrintingNewlyCreatedPartition(boolean enablePrintingNewlyCr isEnablePrintingNewlyCreatedPartition = enablePrintingNewlyCreatedPartition; } + public long getConsistencyCheckSchedulerInitialDelayInMs() { + return consistencyCheckSchedulerInitialDelayInMs; + } + + public void setConsistencyCheckSchedulerInitialDelayInMs( + long consistencyCheckSchedulerInitialDelayInMs) { + this.consistencyCheckSchedulerInitialDelayInMs = consistencyCheckSchedulerInitialDelayInMs; + } + + public long getConsistencyCheckSchedulerIntervalInMs() { + return consistencyCheckSchedulerIntervalInMs; + } + + public void setConsistencyCheckSchedulerIntervalInMs(long consistencyCheckSchedulerIntervalInMs) { + this.consistencyCheckSchedulerIntervalInMs = consistencyCheckSchedulerIntervalInMs; + } + public long getForceWalPeriodForConfigNodeSimpleInMs() { return forceWalPeriodForConfigNodeSimpleInMs; } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index 77790dae1a903..b4655a0056c69 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -348,6 +348,18 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio "enable_auto_leader_balance_for_iot_consensus", String.valueOf(conf.isEnableAutoLeaderBalanceForIoTConsensus())))); + conf.setConsistencyCheckSchedulerInitialDelayInMs( + Long.parseLong( + properties.getProperty( + 
"consistency_check_scheduler_initial_delay_in_ms", + String.valueOf(conf.getConsistencyCheckSchedulerInitialDelayInMs())))); + + conf.setConsistencyCheckSchedulerIntervalInMs( + Long.parseLong( + properties.getProperty( + "consistency_check_scheduler_interval_in_ms", + String.valueOf(conf.getConsistencyCheckSchedulerIntervalInMs())))); + String routePriorityPolicy = properties.getProperty("route_priority_policy", conf.getRoutePriorityPolicy()); if (IPriorityBalancer.GREEDY_POLICY.equals(routePriorityPolicy) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java index f455edb26b8b1..cd16b1345938a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ConfigManager.java @@ -47,6 +47,7 @@ import org.apache.iotdb.commons.conf.ConfigurationFileUtils; import org.apache.iotdb.commons.conf.IoTDBConstant; import org.apache.iotdb.commons.conf.TrimProperties; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.exception.MetadataException; import org.apache.iotdb.commons.path.MeasurementPath; @@ -109,6 +110,7 @@ import org.apache.iotdb.confignode.consensus.response.ttl.ShowTTLResp; import org.apache.iotdb.confignode.consensus.statemachine.ConfigRegionStateMachine; import org.apache.iotdb.confignode.manager.consensus.ConsensusManager; +import org.apache.iotdb.confignode.manager.consistency.ConsistencyProgressManager; import org.apache.iotdb.confignode.manager.cq.CQManager; import org.apache.iotdb.confignode.manager.externalservice.ExternalServiceInfo; import org.apache.iotdb.confignode.manager.externalservice.ExternalServiceManager; @@ -125,6 +127,7 @@ import 
org.apache.iotdb.confignode.manager.schema.ClusterSchemaQuotaStatistics; import org.apache.iotdb.confignode.manager.subscription.SubscriptionManager; import org.apache.iotdb.confignode.persistence.ClusterInfo; +import org.apache.iotdb.confignode.persistence.ConsistencyProgressInfo; import org.apache.iotdb.confignode.persistence.ProcedureInfo; import org.apache.iotdb.confignode.persistence.TTLInfo; import org.apache.iotdb.confignode.persistence.TriggerInfo; @@ -217,6 +220,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TReconstructRegionReq; import org.apache.iotdb.confignode.rpc.thrift.TRegionRouteMapResp; import org.apache.iotdb.confignode.rpc.thrift.TRemoveRegionReq; +import org.apache.iotdb.confignode.rpc.thrift.TRepairProgressInfo; import org.apache.iotdb.confignode.rpc.thrift.TSchemaNodeManagementResp; import org.apache.iotdb.confignode.rpc.thrift.TSchemaPartitionTableResp; import org.apache.iotdb.confignode.rpc.thrift.TSetDataNodeStatusReq; @@ -232,6 +236,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TShowPipePluginReq; import org.apache.iotdb.confignode.rpc.thrift.TShowPipeReq; import org.apache.iotdb.confignode.rpc.thrift.TShowPipeResp; +import org.apache.iotdb.confignode.rpc.thrift.TShowRepairProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TShowSubscriptionReq; import org.apache.iotdb.confignode.rpc.thrift.TShowSubscriptionResp; import org.apache.iotdb.confignode.rpc.thrift.TShowTable4InformationSchemaResp; @@ -246,6 +251,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq; import org.apache.iotdb.confignode.rpc.thrift.TThrottleQuotaResp; import org.apache.iotdb.confignode.rpc.thrift.TTimeSlotList; +import org.apache.iotdb.confignode.rpc.thrift.TTriggerRegionConsistencyRepairReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsetSchemaTemplateReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.consensus.common.DataSet; @@ -343,6 +349,8 @@ public class ConfigManager implements 
IManager { /** Subscription */ private final SubscriptionManager subscriptionManager; + private final ConsistencyProgressManager consistencyProgressManager; + private final ConfigRegionStateMachine stateMachine; private final RetryFailedTasksThread retryFailedTasksThread; @@ -366,6 +374,7 @@ public ConfigManager() throws IOException { QuotaInfo quotaInfo = new QuotaInfo(); TTLInfo ttlInfo = new TTLInfo(); SubscriptionInfo subscriptionInfo = new SubscriptionInfo(); + ConsistencyProgressInfo consistencyProgressInfo = new ConsistencyProgressInfo(); // Build state machine and executor ConfigPlanExecutor executor = @@ -383,7 +392,8 @@ public ConfigManager() throws IOException { pipeInfo, subscriptionInfo, quotaInfo, - ttlInfo); + ttlInfo, + consistencyProgressInfo); this.stateMachine = new ConfigRegionStateMachine(this, executor); // Build the manager module @@ -404,6 +414,7 @@ public ConfigManager() throws IOException { this.cqManager = new CQManager(this); this.pipeManager = new PipeManager(this, pipeInfo); this.subscriptionManager = new SubscriptionManager(this, subscriptionInfo); + this.consistencyProgressManager = new ConsistencyProgressManager(consistencyProgressInfo); this.auditLogger = new CNAuditLogger(this); // 1. 
keep PipeManager initialization before LoadManager initialization, because @@ -1290,6 +1301,10 @@ public SubscriptionManager getSubscriptionManager() { return subscriptionManager; } + public ConsistencyProgressManager getConsistencyProgressManager() { + return consistencyProgressManager; + } + @Override public CNAuditLogger getAuditLogger() { return auditLogger; @@ -1996,6 +2011,63 @@ public TShowConfigNodes4InformationSchemaResp showConfigNodes4InformationSchema( } } + public TShowRepairProgressResp showRepairProgress() { + final TSStatus status = confirmLeader(); + final TShowRepairProgressResp resp = new TShowRepairProgressResp(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + return resp.setStatus(status); + } + + List progressInfoList = new ArrayList<>(); + for (RepairProgressTable table : consistencyProgressManager.getAllTables()) { + int regionId = parseRegionId(table.getConsensusGroupId()); + for (RepairProgressTable.PartitionProgress progress : table.getAllPartitions()) { + TRepairProgressInfo progressInfo = + new TRepairProgressInfo( + regionId, + progress.getPartitionId(), + progress.getCheckState().name(), + progress.getRepairState().name(), + progress.getLastCheckedAt(), + progress.getLastSafeWatermark(), + progress.getPartitionMutationEpoch(), + progress.getSnapshotEpoch(), + progress.getSnapshotState().name(), + progress.getLastMismatchAt(), + progress.getMismatchLeafCount()); + if (progress.getMismatchScopeRef() != null) { + progressInfo.setMismatchScopeRef(progress.getMismatchScopeRef()); + } + if (progress.getRepairEpoch() != null) { + progressInfo.setRepairEpoch(progress.getRepairEpoch()); + } + if (progress.getLastErrorCode() != null) { + progressInfo.setLastErrorCode(progress.getLastErrorCode()); + } + if (progress.getLastErrorMessage() != null) { + progressInfo.setLastErrorMessage(progress.getLastErrorMessage()); + } + progressInfoList.add(progressInfo); + } + } + progressInfoList.sort( + 
Comparator.comparingInt(TRepairProgressInfo::getRegionId) + .thenComparingLong(TRepairProgressInfo::getTimePartition)); + return resp.setRepairProgressInfoList(progressInfoList).setStatus(StatusUtils.OK); + } + + private int parseRegionId(String consensusGroupKey) { + int separatorIndex = consensusGroupKey.lastIndexOf('-'); + if (separatorIndex < 0 || separatorIndex == consensusGroupKey.length() - 1) { + return -1; + } + try { + return Integer.parseInt(consensusGroupKey.substring(separatorIndex + 1)); + } catch (NumberFormatException e) { + return -1; + } + } + @Override public TShowDatabaseResp showDatabase(final TGetDatabaseReq req) { final TSStatus status = confirmLeader(); @@ -2601,6 +2673,13 @@ public TSStatus removeRegion(TRemoveRegionReq req) { : status; } + public TSStatus triggerRegionConsistencyRepair(TTriggerRegionConsistencyRepairReq req) { + TSStatus status = confirmLeader(); + return status.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode() + ? procedureManager.triggerRegionConsistencyRepair(req) + : status; + } + @Override public TSStatus createCQ(TCreateCQReq req) { TSStatus status = confirmLeader(); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index 646aaf66daf4f..a918fd86fbc33 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -53,6 +53,7 @@ import org.apache.iotdb.confignode.consensus.request.write.datanode.RemoveDataNodePlan; import org.apache.iotdb.confignode.consensus.request.write.procedure.UpdateProcedurePlan; import org.apache.iotdb.confignode.consensus.request.write.region.CreateRegionGroupsPlan; +import org.apache.iotdb.confignode.manager.consistency.ConsistencyCheckScheduler; import 
org.apache.iotdb.confignode.manager.partition.PartitionManager; import org.apache.iotdb.confignode.persistence.ProcedureInfo; import org.apache.iotdb.confignode.procedure.PartitionTableAutoCleaner; @@ -62,6 +63,8 @@ import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import org.apache.iotdb.confignode.procedure.env.RegionMaintainHandler; import org.apache.iotdb.confignode.procedure.env.RemoveDataNodeHandler; +import org.apache.iotdb.confignode.procedure.impl.consistency.LiveDataRegionRepairExecutionContext; +import org.apache.iotdb.confignode.procedure.impl.consistency.RepairRegionProcedure; import org.apache.iotdb.confignode.procedure.impl.cq.CreateCQProcedure; import org.apache.iotdb.confignode.procedure.impl.node.AddConfigNodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure; @@ -149,6 +152,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TReconstructRegionReq; import org.apache.iotdb.confignode.rpc.thrift.TRemoveRegionReq; import org.apache.iotdb.confignode.rpc.thrift.TSubscribeReq; +import org.apache.iotdb.confignode.rpc.thrift.TTriggerRegionConsistencyRepairReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.consensus.ConsensusFactory; import org.apache.iotdb.db.exception.BatchProcessException; @@ -171,6 +175,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -205,6 +210,7 @@ public class ProcedureManager { private ProcedureMetrics procedureMetrics; private final PartitionTableAutoCleaner partitionTableCleaner; + private final ConsistencyCheckScheduler consistencyCheckScheduler; private final ReentrantLock tableLock = new ReentrantLock(); @@ -221,6 +227,7 @@ public ProcedureManager(ConfigManager configManager, ProcedureInfo procedureInfo - IoTDBConstant.RAFT_LOG_BASIC_SIZE; this.procedureMetrics = new 
ProcedureMetrics(this); this.partitionTableCleaner = new PartitionTableAutoCleaner<>(configManager); + this.consistencyCheckScheduler = new ConsistencyCheckScheduler(configManager, this); } public void startExecutor() { @@ -232,12 +239,14 @@ public void startExecutor() { CONFIG_NODE_CONFIG.getProcedureCompletedEvictTTL()); executor.addInternalProcedure(partitionTableCleaner); store.start(); + consistencyCheckScheduler.start(); LOGGER.info("ProcedureManager is started successfully."); } } public void stopExecutor() { if (executor.isRunning()) { + consistencyCheckScheduler.stop(); executor.stop(); if (!executor.isRunning()) { executor.join(); @@ -1374,6 +1383,69 @@ public TSStatus createRegionGroups( } } + public TSStatus triggerRegionConsistencyRepair(final TTriggerRegionConsistencyRepairReq req) { + return triggerRegionConsistencyRepair( + req.getConsensusGroupId(), + req.isSetPartitionFilter() ? req.getPartitionFilter() : Collections.emptyList(), + req.isSetRepairEpoch() ? req.getRepairEpoch() : null); + } + + public TSStatus triggerRegionConsistencyRepair(final TConsensusGroupId consensusGroupId) { + return triggerRegionConsistencyRepair(consensusGroupId, Collections.emptyList(), null); + } + + public TSStatus triggerRegionConsistencyRepair( + final TConsensusGroupId consensusGroupId, + final List partitionFilter, + final String repairEpoch) { + if (consensusGroupId == null || consensusGroupId.getType() != TConsensusGroupType.DataRegion) { + return new TSStatus(TSStatusCode.ILLEGAL_PARAMETER.getStatusCode()) + .setMessage("Replica consistency repair currently only supports DataRegion"); + } + + synchronized (this) { + for (Procedure procedure : executor.getProcedures().values()) { + if (procedure instanceof RepairRegionProcedure + && !procedure.isFinished() + && consensusGroupId.equals(((RepairRegionProcedure) procedure).getConsensusGroupId())) { + return new TSStatus(TSStatusCode.OVERLAP_WITH_EXISTING_TASK.getStatusCode()) + .setMessage( + "Replica consistency 
repair is already running for region " + consensusGroupId); + } + } + + try { + RepairRegionProcedure procedure = + new RepairRegionProcedure( + consensusGroupId, + new LiveDataRegionRepairExecutionContext( + configManager, + consensusGroupId, + partitionFilter == null ? Collections.emptyList() : partitionFilter, + repairEpoch, + true)); + executor.submitProcedure(procedure); + return waitingProcedureFinished(procedure); + } catch (Exception e) { + LOGGER.warn( + "Failed to trigger replica consistency repair for region {}", consensusGroupId, e); + return new TSStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR.getStatusCode()) + .setMessage(e.getMessage()); + } + } + } + + public boolean hasRunningRepairProcedure(final TConsensusGroupId consensusGroupId) { + for (Procedure procedure : executor.getProcedures().values()) { + if (procedure instanceof RepairRegionProcedure + && !procedure.isFinished() + && consensusGroupId.equals(((RepairRegionProcedure) procedure).getConsensusGroupId())) { + return true; + } + } + return false; + } + /** * Generate {@link CreateTriggerProcedure} and wait until it finished. * diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyCheckScheduler.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyCheckScheduler.java new file mode 100644 index 0000000000000..57c272ba538b9 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyCheckScheduler.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; +import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.commons.utils.TestOnly; +import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor; +import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.ProcedureManager; +import org.apache.iotdb.confignode.procedure.impl.consistency.LiveDataRegionRepairExecutionContext; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Collections; +import java.util.Comparator; +import java.util.concurrent.Future; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +public class ConsistencyCheckScheduler { + + private static final Logger LOGGER = LoggerFactory.getLogger(ConsistencyCheckScheduler.class); + + private final ConfigManager configManager; + private final ProcedureManager procedureManager; + private final ScheduledExecutorService executorService = + IoTDBThreadPoolFactory.newSingleThreadScheduledExecutor("ConfigNode-Consistency-Check"); + private final RegionCheckExecutor regionCheckExecutor; + private final long initialDelayMs; + private final long 
intervalMs; + private final AtomicBoolean roundRunning = new AtomicBoolean(false); + private final Object scheduleMonitor = new Object(); + + private Future scheduledFuture; + + public ConsistencyCheckScheduler(ConfigManager configManager, ProcedureManager procedureManager) { + this( + configManager, + procedureManager, + ConfigNodeDescriptor.getInstance().getConf().getConsistencyCheckSchedulerInitialDelayInMs(), + ConfigNodeDescriptor.getInstance().getConf().getConsistencyCheckSchedulerIntervalInMs(), + (manager, consensusGroupId, progressTable) -> { + LiveDataRegionRepairExecutionContext executionContext = + new LiveDataRegionRepairExecutionContext( + manager, consensusGroupId, Collections.emptySet(), null, false); + executionContext.collectPendingPartitions( + executionContext.computeSafeWatermark(), progressTable); + }); + } + + @TestOnly + ConsistencyCheckScheduler( + ConfigManager configManager, + ProcedureManager procedureManager, + long initialDelayMs, + long intervalMs, + RegionCheckExecutor regionCheckExecutor) { + this.configManager = configManager; + this.procedureManager = procedureManager; + this.initialDelayMs = initialDelayMs; + this.intervalMs = intervalMs; + this.regionCheckExecutor = regionCheckExecutor; + } + + public void start() { + synchronized (scheduleMonitor) { + if (scheduledFuture != null) { + return; + } + scheduledFuture = + ScheduledExecutorUtil.safelyScheduleWithFixedDelay( + executorService, + this::runOneRound, + initialDelayMs, + intervalMs, + TimeUnit.MILLISECONDS); + LOGGER.info( + "Consistency check scheduler started with initialDelay={}ms interval={}ms", + initialDelayMs, + intervalMs); + } + } + + public void stop() { + synchronized (scheduleMonitor) { + if (scheduledFuture == null) { + return; + } + scheduledFuture.cancel(false); + scheduledFuture = null; + LOGGER.info("Consistency check scheduler stopped"); + } + } + + void runOneRound() { + if (!roundRunning.compareAndSet(false, true)) { + LOGGER.debug("Skip consistency 
check round because the previous round is still running"); + return; + } + + try { + configManager + .getPartitionManager() + .getAllReplicaSetsMap(TConsensusGroupType.DataRegion) + .keySet() + .stream() + .sorted(Comparator.comparingInt(TConsensusGroupId::getId)) + .forEach(this::checkOneRegion); + } finally { + roundRunning.set(false); + } + } + + private void checkOneRegion(TConsensusGroupId consensusGroupId) { + if (procedureManager.hasRunningRepairProcedure(consensusGroupId)) { + LOGGER.debug( + "Skip background consistency check for region {} because a repair is running", + consensusGroupId); + return; + } + + try { + ConsistencyProgressManager consistencyProgressManager = + configManager.getConsistencyProgressManager(); + if (consistencyProgressManager == null) { + LOGGER.debug( + "Skip background consistency check for region {} because progress manager is not ready", + consensusGroupId); + return; + } + RepairProgressTable progressTable = + consistencyProgressManager.loadRepairProgressTable(consensusGroupId); + regionCheckExecutor.execute(configManager, consensusGroupId, progressTable); + consistencyProgressManager.persistRepairProgressTable(progressTable); + } catch (Exception e) { + LOGGER.warn("Background consistency check failed for region {}", consensusGroupId, e); + } + } + + @FunctionalInterface + interface RegionCheckExecutor { + void execute( + ConfigManager configManager, + TConsensusGroupId consensusGroupId, + RepairProgressTable progressTable) + throws Exception; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyProgressManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyProgressManager.java new file mode 100644 index 0000000000000..ff395d7854e9d --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyProgressManager.java @@ -0,0 +1,59 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.confignode.persistence.ConsistencyProgressInfo; + +import java.util.List; + +public class ConsistencyProgressManager { + + private final ConsistencyProgressInfo consistencyProgressInfo; + + public ConsistencyProgressManager(ConsistencyProgressInfo consistencyProgressInfo) { + this.consistencyProgressInfo = consistencyProgressInfo; + } + + public RepairProgressTable loadRepairProgressTable(String consensusGroupKey) { + RepairProgressTable table = consistencyProgressInfo.getTable(consensusGroupKey); + return table == null ? 
new RepairProgressTable(consensusGroupKey) : table; + } + + public RepairProgressTable loadRepairProgressTable(TConsensusGroupId consensusGroupId) { + return loadRepairProgressTable(toConsensusGroupKey(consensusGroupId)); + } + + public void persistRepairProgressTable(RepairProgressTable repairProgressTable) { + consistencyProgressInfo.updateTable(repairProgressTable); + } + + public List getAllTables() { + return consistencyProgressInfo.getAllTables(); + } + + public static String toConsensusGroupKey(TConsensusGroupId consensusGroupId) { + if (consensusGroupId == null) { + return "unknown"; + } + return consensusGroupId.getType() + "-" + consensusGroupId.getId(); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/ConsistencyProgressInfo.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/ConsistencyProgressInfo.java new file mode 100644 index 0000000000000..80b9a117a2525 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/persistence/ConsistencyProgressInfo.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.iotdb.confignode.persistence;

import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable;
import org.apache.iotdb.commons.snapshot.SnapshotProcessor;

import org.apache.thrift.TException;
import org.apache.tsfile.utils.ReadWriteIOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;

/**
 * ConfigNode-side persistence of consistency-repair progress tables, keyed by the consensus group
 * key. Readers always receive defensive copies; snapshots are written atomically via a temporary
 * file that is fsynced and then renamed into place.
 */
public class ConsistencyProgressInfo implements SnapshotProcessor {

  private static final Logger LOGGER = LoggerFactory.getLogger(ConsistencyProgressInfo.class);
  private static final String SNAPSHOT_FILENAME = "consistency_progress.bin";

  // Live state; values are treated as immutable-by-copy: every put stores a copy and every
  // read returns a copy, so callers can never mutate the map's entries in place.
  private final ConcurrentHashMap<String, RepairProgressTable> progressTables =
      new ConcurrentHashMap<>();

  /** Returns the live table for the key, creating an empty one on first access. */
  public RepairProgressTable getOrCreateTable(String consensusGroupKey) {
    return progressTables.computeIfAbsent(consensusGroupKey, RepairProgressTable::new);
  }

  /** Returns a copy of the stored table, or {@code null} when none exists. */
  public RepairProgressTable getTable(String consensusGroupKey) {
    RepairProgressTable table = progressTables.get(consensusGroupKey);
    return table == null ? null : table.copy();
  }

  /** Stores a copy of the given table under its own consensus group id; null is a no-op. */
  public void updateTable(RepairProgressTable repairProgressTable) {
    if (repairProgressTable == null) {
      return;
    }
    progressTables.put(repairProgressTable.getConsensusGroupId(), repairProgressTable.copy());
  }

  /** Returns copies of all tables, sorted by consensus group id for deterministic output. */
  public List<RepairProgressTable> getAllTables() {
    List<RepairProgressTable> tables = new ArrayList<>(progressTables.size());
    for (RepairProgressTable table : progressTables.values()) {
      tables.add(table.copy());
    }
    tables.sort(Comparator.comparing(RepairProgressTable::getConsensusGroupId));
    return tables;
  }

  @Override
  public boolean processTakeSnapshot(File snapshotDir) throws TException, IOException {
    File snapshotFile = new File(snapshotDir, SNAPSHOT_FILENAME);
    if (snapshotFile.exists() && snapshotFile.isFile()) {
      LOGGER.error(
          "Failed to take consistency progress snapshot because [{}] already exists",
          snapshotFile.getAbsolutePath());
      return false;
    }

    File tmpFile = new File(snapshotFile.getAbsolutePath() + "-" + UUID.randomUUID());
    boolean committed = false;
    try {
      try (FileOutputStream outputStream = new FileOutputStream(tmpFile)) {
        List<RepairProgressTable> tables = getAllTables();
        ReadWriteIOUtils.write(tables.size(), outputStream);
        for (RepairProgressTable table : tables) {
          table.serialize(outputStream);
        }
        // fsync before rename so the rename never publishes a partially written snapshot
        outputStream.getFD().sync();
      }
      committed = tmpFile.renameTo(snapshotFile);
      return committed;
    } finally {
      // Bug fix: the temporary file used to be leaked when serialization threw or the
      // rename failed; clean it up on any non-committed exit.
      if (!committed && tmpFile.exists() && !tmpFile.delete()) {
        LOGGER.warn(
            "Failed to delete temporary consistency progress snapshot [{}]",
            tmpFile.getAbsolutePath());
      }
    }
  }

  @Override
  public void processLoadSnapshot(File snapshotDir) throws TException, IOException {
    File snapshotFile = new File(snapshotDir, SNAPSHOT_FILENAME);
    if (!snapshotFile.exists() || !snapshotFile.isFile()) {
      LOGGER.info(
          "Consistency progress snapshot [{}] does not exist, skip loading",
          snapshotFile.getAbsolutePath());
      return;
    }

    // Deserialize into a scratch map first so the live state is only swapped after a
    // fully successful read.
    ConcurrentHashMap<String, RepairProgressTable> recovered = new ConcurrentHashMap<>();
    try (FileInputStream inputStream = new FileInputStream(snapshotFile)) {
      int size = ReadWriteIOUtils.readInt(inputStream);
      for (int i = 0; i < size; i++) {
        RepairProgressTable table = RepairProgressTable.deserialize(inputStream);
        recovered.put(table.getConsensusGroupId(), table);
      }
    }
    progressTables.clear();
    progressTables.putAll(recovered);
  }

  /** Number of tracked consensus groups. */
  public int size() {
    return progressTables.size();
  }

  /**
   * Direct view of the internal map. NOTE(review): this exposes mutable internal state without a
   * defensive copy — presumably for read-only inspection; confirm callers do not mutate it.
   */
  public Map<String, RepairProgressTable> view() {
    return progressTables;
  }
}
this.snapshotProcessorList.add(PipeConfigNodeAgent.runtime().listener()); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/consistency/ConsistencyPartitionSelector.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/consistency/ConsistencyPartitionSelector.java new file mode 100644 index 0000000000000..64f165b8c13a1 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/consistency/ConsistencyPartitionSelector.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.iotdb.confignode.procedure.impl.consistency;

import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable;
import org.apache.iotdb.mpp.rpc.thrift.TPartitionConsistencyEligibility;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Stateless helper that decides which time partitions of a data region should be inspected (or
 * re-repaired) based on the leader's eligibility report and the persisted repair progress.
 */
final class ConsistencyPartitionSelector {

  private ConsistencyPartitionSelector() {}

  /** Overload without per-partition replica observation tokens. */
  static List<Long> selectCandidatePartitions(
      Map<Long, TPartitionConsistencyEligibility> leaderEligibilityByPartition,
      Set<Long> requestedPartitions,
      boolean repairMode,
      String requestedRepairEpoch,
      RepairProgressTable repairProgressTable) {
    return selectCandidatePartitions(
        leaderEligibilityByPartition,
        requestedPartitions,
        repairMode,
        requestedRepairEpoch,
        repairProgressTable,
        Collections.emptyMap());
  }

  /**
   * Selects the sorted list of partitions to inspect.
   *
   * <p>Partitions are first narrowed to the requested set (an empty/null request means "all").
   * In repair mode with no explicit request, previously recorded MISMATCH partitions whose repair
   * epoch matches take priority; otherwise each remaining partition is kept only when
   * {@link #shouldInspectPartition} says it needs another look.
   */
  static List<Long> selectCandidatePartitions(
      Map<Long, TPartitionConsistencyEligibility> leaderEligibilityByPartition,
      Set<Long> requestedPartitions,
      boolean repairMode,
      String requestedRepairEpoch,
      RepairProgressTable repairProgressTable,
      Map<Long, String> replicaObservationTokens) {
    boolean noExplicitRequest = requestedPartitions == null || requestedPartitions.isEmpty();

    // Narrow the leader's reported partitions to the requested set, in ascending order.
    List<Long> candidates = new ArrayList<>();
    for (TPartitionConsistencyEligibility eligibility : leaderEligibilityByPartition.values()) {
      long partitionId = eligibility.getTimePartitionId();
      if (noExplicitRequest || requestedPartitions.contains(partitionId)) {
        candidates.add(partitionId);
      }
    }
    Collections.sort(candidates);
    if (candidates.isEmpty()) {
      return Collections.emptyList();
    }

    // In repair mode without an explicit partition list, recorded mismatches (matching the
    // requested repair epoch, when one is given) take priority over fresh inspection.
    if (repairMode && noExplicitRequest) {
      List<Long> knownMismatches = new ArrayList<>();
      for (Long partitionId : candidates) {
        RepairProgressTable.PartitionProgress progress =
            repairProgressTable.getPartition(partitionId);
        if (progress == null
            || progress.getCheckState() != RepairProgressTable.CheckState.MISMATCH) {
          continue;
        }
        if (requestedRepairEpoch == null
            || requestedRepairEpoch.equals(progress.getRepairEpoch())) {
          knownMismatches.add(partitionId);
        }
      }
      if (!knownMismatches.isEmpty()) {
        return knownMismatches;
      }
    }

    // Otherwise keep only the partitions whose progress record says they need inspection.
    List<Long> toInspect = new ArrayList<>();
    for (Long partitionId : candidates) {
      boolean inspect =
          shouldInspectPartition(
              leaderEligibilityByPartition.get(partitionId),
              repairProgressTable.getPartition(partitionId),
              replicaObservationTokens.get(partitionId));
      if (inspect) {
        toInspect.add(partitionId);
      }
    }
    return toInspect;
  }

  /** Overload without a replica observation token. */
  static boolean shouldInspectPartition(
      TPartitionConsistencyEligibility eligibility,
      RepairProgressTable.PartitionProgress progress) {
    return shouldInspectPartition(eligibility, progress, null);
  }

  /**
   * Decides whether a partition needs (re-)inspection. Missing eligibility means "no"; missing
   * progress or a non-READY snapshot means "yes"; a VERIFIED partition defers to
   * {@code PartitionProgress.shouldCheck}; every other check state forces inspection.
   */
  static boolean shouldInspectPartition(
      TPartitionConsistencyEligibility eligibility,
      RepairProgressTable.PartitionProgress progress,
      String replicaObservationToken) {
    if (eligibility == null) {
      return false;
    }
    if (progress == null) {
      return true;
    }
    RepairProgressTable.SnapshotState snapshotState =
        RepairProgressTable.SnapshotState.valueOf(eligibility.getSnapshotState());
    if (snapshotState != RepairProgressTable.SnapshotState.READY) {
      return true;
    }
    if (progress.getCheckState() == RepairProgressTable.CheckState.VERIFIED) {
      return progress.shouldCheck(
          eligibility.getPartitionMutationEpoch(),
          eligibility.getSnapshotEpoch(),
          snapshotState,
          replicaObservationToken);
    }
    // PENDING, DIRTY, MISMATCH, FAILED (and any future state) always warrant inspection.
    return true;
  }
}
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; +import org.apache.iotdb.common.rpc.thrift.TEndPoint; +import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.consensus.iotv2.consistency.LogicalMismatchScope; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; +import org.apache.iotdb.confignode.client.sync.SyncDataNodeClientPool; +import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.consistency.ConsistencyProgressManager; +import org.apache.iotdb.mpp.rpc.thrift.TApplyLogicalRepairBatchReq; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffResp; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffResp; +import org.apache.iotdb.mpp.rpc.thrift.TFinishLogicalRepairSessionReq; +import 
org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeResp; +import org.apache.iotdb.mpp.rpc.thrift.TLeafDiffEntry; +import org.apache.iotdb.mpp.rpc.thrift.TLeafDiffEstimate; +import org.apache.iotdb.mpp.rpc.thrift.TLogicalRepairBatch; +import org.apache.iotdb.mpp.rpc.thrift.TLogicalRepairLeafSelector; +import org.apache.iotdb.mpp.rpc.thrift.TPartitionConsistencyEligibility; +import org.apache.iotdb.mpp.rpc.thrift.TSnapshotSubtreeNode; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairReq; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairResp; +import org.apache.iotdb.rpc.TSStatusCode; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeMap; +import java.util.UUID; +import java.util.stream.Collectors; + +/** Runtime bridge between the repair procedure and the logical snapshot RPCs. 
*/ +public class LiveDataRegionRepairExecutionContext + implements RepairRegionProcedure.RepairExecutionContext { + + private static final String TREE_KIND_LIVE = "LIVE"; + private static final String TREE_KIND_TOMBSTONE = "TOMBSTONE"; + private static final long EXACT_DIFF_DECODE_THRESHOLD = 2048L; + + private final ConfigManager configManager; + private final ConsistencyProgressManager consistencyProgressManager; + private final TConsensusGroupId consensusGroupId; + private final TDataNodeLocation leaderLocation; + private final List followerLocations; + private final String routeVersionToken; + private final Set requestedPartitions; + private final String requestedRepairEpoch; + private final boolean repairMode; + + private final Map leaderEligibilityByPartition = + new TreeMap<>(); + private final Map> + followerEligibilityByNodeId = new LinkedHashMap<>(); + private final Map partitionContexts = new TreeMap<>(); + private final Map repairOperationsById = new LinkedHashMap<>(); + + private long syncLag = Long.MAX_VALUE; + private long safeWatermark = Long.MIN_VALUE; + + public LiveDataRegionRepairExecutionContext( + ConfigManager configManager, TConsensusGroupId consensusGroupId) { + this(configManager, consensusGroupId, Collections.emptySet(), null, true); + } + + public LiveDataRegionRepairExecutionContext( + ConfigManager configManager, + TConsensusGroupId consensusGroupId, + Collection partitionFilter, + String requestedRepairEpoch, + boolean repairMode) { + this.configManager = configManager; + this.consistencyProgressManager = configManager.getConsistencyProgressManager(); + this.consensusGroupId = requireDataRegion(consensusGroupId); + this.requestedPartitions = + partitionFilter == null ? 
Collections.emptySet() : new LinkedHashSet<>(partitionFilter); + this.requestedRepairEpoch = requestedRepairEpoch; + this.repairMode = repairMode; + + TRegionReplicaSet replicaSet = + configManager + .getPartitionManager() + .getAllReplicaSetsMap(TConsensusGroupType.DataRegion) + .get(this.consensusGroupId); + if (replicaSet == null || replicaSet.getDataNodeLocations() == null) { + throw new IllegalStateException("DataRegion " + consensusGroupId + " does not exist"); + } + int leaderId = + configManager.getLoadManager().getRegionLeaderMap().getOrDefault(this.consensusGroupId, -1); + if (leaderId <= 0) { + throw new IllegalStateException("Cannot determine leader for DataRegion " + consensusGroupId); + } + this.leaderLocation = + replicaSet.getDataNodeLocations().stream() + .filter(location -> location.getDataNodeId() == leaderId) + .findFirst() + .orElseThrow( + () -> + new IllegalStateException( + "Leader " + + leaderId + + " is not part of replica set for " + + consensusGroupId)); + this.followerLocations = + replicaSet.getDataNodeLocations().stream() + .filter(location -> location.getDataNodeId() != leaderId) + .sorted(Comparator.comparingInt(TDataNodeLocation::getDataNodeId)) + .collect(Collectors.toList()); + this.routeVersionToken = buildRouteBindingToken(leaderId); + refreshLeaderEligibility(); + } + + @Override + public boolean isReplicationComplete() { + return syncLag == 0L; + } + + @Override + public long computeSafeWatermark() { + return safeWatermark; + } + + @Override + public List collectPendingPartitions( + long currentSafeWatermark, RepairProgressTable repairProgressTable) { + if (!isRouteBindingStillValid()) { + return Collections.emptyList(); + } + refreshLeaderEligibility(); + partitionContexts.clear(); + repairOperationsById.clear(); + + if (!isReplicationComplete() || safeWatermark == Long.MIN_VALUE) { + return Collections.emptyList(); + } + + followerEligibilityByNodeId.clear(); + for (TDataNodeLocation followerLocation : followerLocations) { 
+ EligibilitySnapshot snapshot = fetchEligibility(followerLocation, !repairMode); + if (!snapshot.isAvailable()) { + return Collections.emptyList(); + } + followerEligibilityByNodeId.put( + followerLocation.getDataNodeId(), snapshot.getPartitionsMap()); + } + + List candidatePartitions = + ConsistencyPartitionSelector.selectCandidatePartitions( + leaderEligibilityByPartition, + requestedPartitions, + repairMode, + requestedRepairEpoch, + repairProgressTable, + buildReplicaObservationTokens()); + if (candidatePartitions.isEmpty()) { + return Collections.emptyList(); + } + + long now = System.currentTimeMillis(); + List repairablePartitions = new ArrayList<>(); + for (Long partitionId : candidatePartitions) { + TPartitionConsistencyEligibility leaderEligibility = + leaderEligibilityByPartition.get(partitionId); + RepairProgressTable.PartitionProgress progress = + repairProgressTable.getPartition(partitionId); + if (leaderEligibility == null) { + continue; + } + if (!isSnapshotComparable(leaderEligibility)) { + repairProgressTable.markDirty(partitionId); + continue; + } + + boolean allFollowersReady = true; + boolean rootMatched = true; + Set unionScopes = new LinkedHashSet<>(); + try { + for (TDataNodeLocation followerLocation : followerLocations) { + TPartitionConsistencyEligibility followerEligibility = + followerEligibilityByNodeId + .getOrDefault(followerLocation.getDataNodeId(), Collections.emptyMap()) + .get(partitionId); + if (!isSnapshotComparable(followerEligibility)) { + allFollowersReady = false; + break; + } + boolean liveMatched = sameRootDigest(leaderEligibility, followerEligibility); + boolean tombstoneMatched = sameTombstoneDigest(leaderEligibility, followerEligibility); + if (!liveMatched || !tombstoneMatched) { + rootMatched = false; + if (repairMode + && progress != null + && progress.getCheckState() == RepairProgressTable.CheckState.MISMATCH + && progress.getMismatchScopeRef() != null + && Objects.equals( + buildRepairEpoch(partitionId, 
leaderEligibility), progress.getRepairEpoch())) { + unionScopes.addAll(LogicalMismatchScope.deserialize(progress.getMismatchScopeRef())); + } else { + if (!liveMatched) { + unionScopes.addAll(comparePartition(followerLocation, partitionId, TREE_KIND_LIVE)); + } + if (!tombstoneMatched) { + unionScopes.addAll( + comparePartition(followerLocation, partitionId, TREE_KIND_TOMBSTONE)); + } + } + } + } + } catch (StaleSnapshotCompareException e) { + repairProgressTable.markDirty(partitionId); + continue; + } + + if (!allFollowersReady) { + repairProgressTable.markDirty(partitionId); + continue; + } + + if (rootMatched) { + String replicaObservationToken = buildReplicaObservationToken(partitionId); + repairProgressTable.markVerified( + partitionId, + now, + safeWatermark, + leaderEligibility.getPartitionMutationEpoch(), + leaderEligibility.getSnapshotEpoch(), + RepairProgressTable.SnapshotState.READY, + replicaObservationToken); + continue; + } + + String repairEpochRef = buildRepairEpoch(partitionId, leaderEligibility); + if (repairMode + && progress != null + && progress.getCheckState() == RepairProgressTable.CheckState.MISMATCH + && progress.getMismatchScopeRef() != null + && !Objects.equals(repairEpochRef, progress.getRepairEpoch())) { + repairProgressTable.markDirty(partitionId); + continue; + } + if (unionScopes.isEmpty()) { + String replicaObservationToken = buildReplicaObservationToken(partitionId); + repairProgressTable.markVerified( + partitionId, + now, + safeWatermark, + leaderEligibility.getPartitionMutationEpoch(), + leaderEligibility.getSnapshotEpoch(), + RepairProgressTable.SnapshotState.READY, + replicaObservationToken); + continue; + } + + String mismatchScopeRef = LogicalMismatchScope.serialize(unionScopes); + String replicaObservationToken = buildReplicaObservationToken(partitionId); + repairProgressTable.markMismatch( + partitionId, + now, + safeWatermark, + leaderEligibility.getPartitionMutationEpoch(), + leaderEligibility.getSnapshotEpoch(), + 
RepairProgressTable.SnapshotState.READY, + mismatchScopeRef, + unionScopes.size(), + repairEpochRef, + replicaObservationToken); + + if (!repairMode) { + continue; + } + + String blockingReason = buildBlockingReason(partitionId, unionScopes); + if (blockingReason != null) { + partitionContexts.put( + partitionId, + new LivePartitionRepairContext( + partitionId, false, Collections.emptyList(), repairEpochRef, blockingReason)); + repairablePartitions.add(partitionId); + continue; + } + + LivePartitionRepairContext context = + new LivePartitionRepairContext( + partitionId, false, new ArrayList<>(), repairEpochRef, null); + Set scheduledLeaves = new HashSet<>(); + for (TDataNodeLocation followerLocation : followerLocations) { + TPartitionConsistencyEligibility followerEligibility = + followerEligibilityByNodeId + .getOrDefault(followerLocation.getDataNodeId(), Collections.emptyMap()) + .get(partitionId); + if (followerEligibility == null + || (sameRootDigest(leaderEligibility, followerEligibility) + && sameTombstoneDigest(leaderEligibility, followerEligibility))) { + continue; + } + for (LogicalMismatchScope.Scope scope : unionScopes) { + RepairLeafKey repairLeafKey = + new RepairLeafKey( + followerLocation.getDataNodeId(), + scope.getTreeKind(), + scope.getLeafId(), + scope.toPersistentString()); + if (!scheduledLeaves.add(repairLeafKey)) { + continue; + } + String operationId = + partitionId + + "-" + + followerLocation.getDataNodeId() + + "-" + + scope.getTreeKind() + + "-" + + scope.getLeafId() + + "-" + + UUID.randomUUID(); + repairOperationsById.put( + operationId, + new RepairOperation(partitionId, followerLocation, scope, repairEpochRef)); + context.repairOperationIds.add(operationId); + } + } + if (!context.repairOperationIds.isEmpty()) { + partitionContexts.put(partitionId, context); + repairablePartitions.add(partitionId); + } + } + + return repairablePartitions; + } + + @Override + public RepairRegionProcedure.PartitionRepairContext getPartitionContext(long 
partitionId) { + return partitionContexts.get(partitionId); + } + + @Override + public void executeRepairOperation(String operationId) { + RepairOperation operation = repairOperationsById.get(operationId); + if (operation == null) { + throw new IllegalStateException("Unknown logical repair operation " + operationId); + } + if (!isRouteBindingStillValid()) { + throw new IllegalStateException( + "Repair route drift detected for partition " + operation.partitionId); + } + + TPartitionConsistencyEligibility leaderEligibility = + leaderEligibilityByPartition.get(operation.partitionId); + if (leaderEligibility == null + || !Objects.equals( + buildRepairEpoch(operation.partitionId, leaderEligibility), operation.repairEpoch)) { + throw new IllegalStateException( + "Repair epoch drift detected for partition " + operation.partitionId); + } + + TStreamLogicalRepairResp streamResp = + sendRequestExpect( + leaderLocation.getInternalEndPoint(), + new TStreamLogicalRepairReq( + consensusGroupId, + operation.partitionId, + operation.repairEpoch, + Collections.singletonList( + new TLogicalRepairLeafSelector( + operation.scope.getTreeKind(), encodeRepairSelector(operation.scope)))), + CnToDnSyncRequestType.STREAM_LOGICAL_REPAIR, + TStreamLogicalRepairResp.class); + requireSuccess(streamResp.getStatus(), "streamLogicalRepair"); + if (Boolean.TRUE.equals(streamResp.isStale())) { + throw new IllegalStateException("Leader logical repair stream is stale"); + } + + String sessionId = null; + if (streamResp.isSetBatches()) { + for (TLogicalRepairBatch batch : streamResp.getBatches()) { + sessionId = batch.getSessionId(); + TSStatus applyStatus = + sendRequestExpect( + operation.followerLocation.getInternalEndPoint(), + new TApplyLogicalRepairBatchReq( + consensusGroupId, + operation.partitionId, + operation.repairEpoch, + batch.getSessionId(), + batch.getTreeKind(), + batch.getLeafId(), + batch.getSeqNo(), + batch.getBatchKind(), + batch.bufferForPayload()), + 
CnToDnSyncRequestType.APPLY_LOGICAL_REPAIR_BATCH, + TSStatus.class); + requireSuccess(applyStatus, "applyLogicalRepairBatch"); + } + } + if (sessionId != null) { + TSStatus finishStatus = + sendRequestExpect( + operation.followerLocation.getInternalEndPoint(), + new TFinishLogicalRepairSessionReq( + consensusGroupId, operation.partitionId, operation.repairEpoch, sessionId), + CnToDnSyncRequestType.FINISH_LOGICAL_REPAIR_SESSION, + TSStatus.class); + requireSuccess(finishStatus, "finishLogicalRepairSession"); + } + } + + @Override + public RepairProgressTable loadRepairProgressTable(String consensusGroupKey) { + return consistencyProgressManager.loadRepairProgressTable(consensusGroupKey); + } + + @Override + public void persistRepairProgressTable(RepairProgressTable repairProgressTable) { + consistencyProgressManager.persistRepairProgressTable(repairProgressTable); + } + + @Override + public void onPartitionCommitted( + long partitionId, long committedAt, RepairProgressTable repairProgressTable) { + TPartitionConsistencyEligibility leaderEligibility = + leaderEligibilityByPartition.get(partitionId); + if (leaderEligibility != null) { + RepairProgressTable.PartitionProgress progress = + repairProgressTable.getOrCreatePartition(partitionId); + repairProgressTable.markRepairSucceeded( + partitionId, + committedAt, + safeWatermark, + leaderEligibility.getPartitionMutationEpoch(), + leaderEligibility.getSnapshotEpoch(), + RepairProgressTable.SnapshotState.READY, + progress.getRepairEpoch(), + progress.getReplicaObservationToken()); + } + } + + private Set comparePartition( + TDataNodeLocation followerLocation, long partitionId, String treeKind) { + Set scopes = new LinkedHashSet<>(); + + Map leaderShards = + fetchSubtreeNodes(leaderLocation, partitionId, treeKind, Collections.singletonList("root")); + Map followerShards = + fetchSubtreeNodes( + followerLocation, partitionId, treeKind, Collections.singletonList("root")); + + Set shardHandles = new LinkedHashSet<>(); + 
shardHandles.addAll(leaderShards.keySet()); + shardHandles.addAll(followerShards.keySet()); + for (String shardHandle : shardHandles) { + TSnapshotSubtreeNode leaderShard = leaderShards.get(shardHandle); + TSnapshotSubtreeNode followerShard = followerShards.get(shardHandle); + if (sameDigest(leaderShard, followerShard)) { + continue; + } + Map leaderLeaves = + fetchSubtreeNodes( + leaderLocation, partitionId, treeKind, Collections.singletonList(shardHandle)); + Map followerLeaves = + fetchSubtreeNodes( + followerLocation, partitionId, treeKind, Collections.singletonList(shardHandle)); + Set leafHandles = new LinkedHashSet<>(); + leafHandles.addAll(leaderLeaves.keySet()); + leafHandles.addAll(followerLeaves.keySet()); + for (String leafHandle : leafHandles) { + TSnapshotSubtreeNode leaderLeaf = leaderLeaves.get(leafHandle); + TSnapshotSubtreeNode followerLeaf = followerLeaves.get(leafHandle); + if (!sameDigest(leaderLeaf, followerLeaf)) { + scopes.addAll( + compareLeaf( + followerLocation, partitionId, treeKind, leafHandle, leaderLeaf, followerLeaf)); + } + } + } + return scopes; + } + + private Set compareLeaf( + TDataNodeLocation followerLocation, + long partitionId, + String treeKind, + String leafId, + TSnapshotSubtreeNode leaderLeaf, + TSnapshotSubtreeNode followerLeaf) { + long leaderItemCount = leaderLeaf == null ? 0L : leaderLeaf.getItemCount(); + long followerItemCount = followerLeaf == null ? 
0L : followerLeaf.getItemCount(); + if (Math.max(leaderItemCount, followerItemCount) > EXACT_DIFF_DECODE_THRESHOLD) { + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + chooseKeyRangeStart(leaderLeaf, followerLeaf), + chooseKeyRangeEnd(leaderLeaf, followerLeaf))); + } + + TLeafDiffEstimate leaderEstimate = + fetchLeafEstimate(leaderLocation, partitionId, treeKind, leafId); + TLeafDiffEstimate followerEstimate = + fetchLeafEstimate(followerLocation, partitionId, treeKind, leafId); + if (leaderEstimate == null || followerEstimate == null) { + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + chooseKeyRangeStart(leaderLeaf, followerLeaf), + chooseKeyRangeEnd(leaderLeaf, followerLeaf))); + } + if (Math.max(leaderEstimate.getStrataEstimate(), followerEstimate.getStrataEstimate()) + > EXACT_DIFF_DECODE_THRESHOLD) { + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + chooseKeyRangeStart(leaderLeaf, followerLeaf), + chooseKeyRangeEnd(leaderLeaf, followerLeaf))); + } + + Set leaderKeys = fetchLeafDiffKeys(leaderLocation, partitionId, treeKind, leafId); + Set followerKeys = fetchLeafDiffKeys(followerLocation, partitionId, treeKind, leafId); + if (leaderKeys == null || followerKeys == null) { + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + chooseKeyRangeStart(leaderLeaf, followerLeaf), + chooseKeyRangeEnd(leaderLeaf, followerLeaf))); + } + + Set symmetricDiff = new LinkedHashSet<>(leaderKeys); + symmetricDiff.addAll(followerKeys); + Set intersection = new LinkedHashSet<>(leaderKeys); + intersection.retainAll(followerKeys); + symmetricDiff.removeAll(intersection); + if (symmetricDiff.isEmpty()) { + return Collections.emptySet(); + } + + List sortedDiffKeys = new ArrayList<>(symmetricDiff); + sortedDiffKeys.sort(String::compareTo); + if (TREE_KIND_TOMBSTONE.equalsIgnoreCase(treeKind)) { + Set leaderOnly = new 
LinkedHashSet<>(leaderKeys); + leaderOnly.removeAll(followerKeys); + List sortedLeaderOnlyKeys = new ArrayList<>(leaderOnly); + sortedLeaderOnlyKeys.sort(String::compareTo); + + Set followerOnly = new LinkedHashSet<>(followerKeys); + followerOnly.removeAll(leaderKeys); + if (!followerOnly.isEmpty()) { + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + sortedDiffKeys.get(0), + sortedDiffKeys.get(sortedDiffKeys.size() - 1), + sortedLeaderOnlyKeys, + LogicalMismatchScope.RepairDirective.FOLLOWER_EXTRA_TOMBSTONE)); + } + + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + sortedDiffKeys.get(0), + sortedDiffKeys.get(sortedDiffKeys.size() - 1), + sortedLeaderOnlyKeys)); + } + return Collections.singleton( + new LogicalMismatchScope.Scope( + treeKind, + leafId, + sortedDiffKeys.get(0), + sortedDiffKeys.get(sortedDiffKeys.size() - 1), + sortedDiffKeys)); + } + + private String buildBlockingReason(long partitionId, Set scopes) { + List unsupportedScopes = + scopes.stream() + .filter(scope -> scope != null && !scope.isRepairable()) + .map(scope -> scope.getTreeKind() + "@" + scope.getLeafId()) + .sorted() + .collect(Collectors.toList()); + if (unsupportedScopes.isEmpty()) { + return null; + } + return "Partition " + + partitionId + + " contains follower-only tombstone mismatches that cannot be rolled back safely yet: " + + String.join(", ", unsupportedScopes); + } + + private Map fetchSubtreeNodes( + TDataNodeLocation location, long partitionId, String treeKind, List nodeHandles) { + TPartitionConsistencyEligibility localEligibility = + getEligibilityForLocation(location, partitionId); + if (localEligibility == null) { + throw new StaleSnapshotCompareException( + "Missing snapshot eligibility while comparing partition " + partitionId); + } + TGetSnapshotSubtreeResp resp = + sendRequestExpect( + location.getInternalEndPoint(), + new TGetSnapshotSubtreeReq( + consensusGroupId, + partitionId, + 
localEligibility.getSnapshotEpoch(), + treeKind, + nodeHandles), + CnToDnSyncRequestType.GET_SNAPSHOT_SUBTREE, + TGetSnapshotSubtreeResp.class); + requireSuccess(resp.getStatus(), "getSnapshotSubtree"); + if (Boolean.TRUE.equals(resp.isStale())) { + throw new StaleSnapshotCompareException( + "Snapshot subtree became stale while comparing partition " + partitionId); + } + Map nodes = new LinkedHashMap<>(); + if (resp.isSetNodes()) { + for (TSnapshotSubtreeNode node : resp.getNodes()) { + nodes.put(node.getNodeHandle(), node); + } + } + return nodes; + } + + private TLeafDiffEstimate fetchLeafEstimate( + TDataNodeLocation location, long partitionId, String treeKind, String leafId) { + TPartitionConsistencyEligibility localEligibility = + getEligibilityForLocation(location, partitionId); + if (localEligibility == null) { + return null; + } + TEstimateLeafDiffResp resp = + sendRequestExpect( + location.getInternalEndPoint(), + new TEstimateLeafDiffReq( + consensusGroupId, + partitionId, + localEligibility.getSnapshotEpoch(), + treeKind, + leafId), + CnToDnSyncRequestType.ESTIMATE_LEAF_DIFF, + TEstimateLeafDiffResp.class); + requireSuccess(resp.getStatus(), "estimateLeafDiff"); + if (Boolean.TRUE.equals(resp.isStale()) || !resp.isSetLeafDiff()) { + return null; + } + return resp.getLeafDiff(); + } + + private Set fetchLeafDiffKeys( + TDataNodeLocation location, long partitionId, String treeKind, String leafId) { + TPartitionConsistencyEligibility localEligibility = + getEligibilityForLocation(location, partitionId); + if (localEligibility == null) { + return null; + } + TDecodeLeafDiffResp resp = + sendRequestExpect( + location.getInternalEndPoint(), + new TDecodeLeafDiffReq( + consensusGroupId, + partitionId, + localEligibility.getSnapshotEpoch(), + treeKind, + leafId), + CnToDnSyncRequestType.DECODE_LEAF_DIFF, + TDecodeLeafDiffResp.class); + requireSuccess(resp.getStatus(), "decodeLeafDiff"); + if (Boolean.TRUE.equals(resp.isStale())) { + return null; + } + Set 
logicalKeys = new LinkedHashSet<>(); + if (resp.isSetDiffEntries()) { + for (TLeafDiffEntry entry : resp.getDiffEntries()) { + logicalKeys.add(entry.getLogicalKey()); + } + } + return logicalKeys; + } + + private EligibilitySnapshot fetchEligibility( + TDataNodeLocation location, boolean tolerateUnavailable) { + Object response = + sendRequest( + location.getInternalEndPoint(), + new TGetConsistencyEligibilityReq(consensusGroupId), + CnToDnSyncRequestType.GET_CONSISTENCY_ELIGIBILITY); + if (response == null) { + if (tolerateUnavailable) { + return EligibilitySnapshot.unavailable(); + } + throw new IllegalStateException("getConsistencyEligibility failed: null response"); + } + if (response instanceof TSStatus) { + TSStatus status = (TSStatus) response; + if (tolerateUnavailable) { + return EligibilitySnapshot.unavailable(); + } + throw new IllegalStateException("getConsistencyEligibility failed: " + status.getMessage()); + } + if (!(response instanceof TGetConsistencyEligibilityResp)) { + throw new IllegalStateException( + "GET_CONSISTENCY_ELIGIBILITY returned unexpected response type " + + response.getClass().getSimpleName()); + } + TGetConsistencyEligibilityResp resp = (TGetConsistencyEligibilityResp) response; + requireSuccess(resp.getStatus(), "getConsistencyEligibility"); + return new EligibilitySnapshot(resp); + } + + private void refreshLeaderEligibility() { + if (!isRouteBindingStillValid()) { + syncLag = Long.MAX_VALUE; + safeWatermark = Long.MIN_VALUE; + leaderEligibilityByPartition.clear(); + followerEligibilityByNodeId.clear(); + return; + } + EligibilitySnapshot snapshot = fetchEligibility(leaderLocation, !repairMode); + if (!snapshot.isAvailable()) { + syncLag = Long.MAX_VALUE; + safeWatermark = Long.MIN_VALUE; + leaderEligibilityByPartition.clear(); + followerEligibilityByNodeId.clear(); + return; + } + syncLag = snapshot.syncLag; + safeWatermark = snapshot.safeWatermark; + leaderEligibilityByPartition.clear(); + 
leaderEligibilityByPartition.putAll(snapshot.partitionsById); + } + + private Map buildReplicaObservationTokens() { + Map tokens = new LinkedHashMap<>(); + for (Long partitionId : leaderEligibilityByPartition.keySet()) { + tokens.put(partitionId, buildReplicaObservationToken(partitionId)); + } + return tokens; + } + + private String buildReplicaObservationToken(long partitionId) { + StringBuilder builder = new StringBuilder(); + appendEligibilityToken( + builder, leaderLocation.getDataNodeId(), leaderEligibilityByPartition.get(partitionId)); + for (TDataNodeLocation followerLocation : followerLocations) { + builder.append('|'); + appendEligibilityToken( + builder, + followerLocation.getDataNodeId(), + followerEligibilityByNodeId + .getOrDefault(followerLocation.getDataNodeId(), Collections.emptyMap()) + .get(partitionId)); + } + return builder.toString(); + } + + private void appendEligibilityToken( + StringBuilder builder, int dataNodeId, TPartitionConsistencyEligibility eligibility) { + builder.append(dataNodeId).append(':'); + if (eligibility == null) { + builder.append("MISSING"); + return; + } + builder + .append(eligibility.getPartitionMutationEpoch()) + .append(':') + .append(eligibility.getSnapshotEpoch()) + .append(':') + .append(eligibility.getSnapshotState()) + .append(':') + .append(eligibility.getLiveRootXorHash()) + .append(':') + .append(eligibility.getLiveRootAddHash()) + .append(':') + .append(eligibility.getTombstoneRootXorHash()) + .append(':') + .append(eligibility.getTombstoneRootAddHash()); + } + + private TPartitionConsistencyEligibility getEligibilityForLocation( + TDataNodeLocation location, long partitionId) { + if (location == null) { + return null; + } + if (location.getDataNodeId() == leaderLocation.getDataNodeId()) { + return leaderEligibilityByPartition.get(partitionId); + } + return followerEligibilityByNodeId + .getOrDefault(location.getDataNodeId(), Collections.emptyMap()) + .get(partitionId); + } + + private boolean 
isSnapshotComparable(TPartitionConsistencyEligibility eligibility) { + return eligibility != null + && "READY".equalsIgnoreCase(eligibility.getSnapshotState()) + && eligibility.getSnapshotEpoch() == eligibility.getPartitionMutationEpoch(); + } + + private boolean sameRootDigest( + TPartitionConsistencyEligibility left, TPartitionConsistencyEligibility right) { + return left.getLiveRootXorHash() == right.getLiveRootXorHash() + && left.getLiveRootAddHash() == right.getLiveRootAddHash(); + } + + private boolean sameTombstoneDigest( + TPartitionConsistencyEligibility left, TPartitionConsistencyEligibility right) { + return left.getTombstoneRootXorHash() == right.getTombstoneRootXorHash() + && left.getTombstoneRootAddHash() == right.getTombstoneRootAddHash(); + } + + private boolean sameDigest(TSnapshotSubtreeNode left, TSnapshotSubtreeNode right) { + if (left == null || right == null) { + return false; + } + return left.getXorHash() == right.getXorHash() && left.getAddHash() == right.getAddHash(); + } + + private String buildRepairEpoch( + long partitionId, TPartitionConsistencyEligibility leaderEligibility) { + return leaderLocation.getDataNodeId() + + ":" + + partitionId + + ":" + + safeWatermark + + ":" + + leaderEligibility.getSnapshotEpoch() + + ":" + + leaderEligibility.getPartitionMutationEpoch() + + ":" + + routeVersionToken; + } + + private boolean isRouteBindingStillValid() { + int currentLeaderId = + configManager.getLoadManager().getRegionLeaderMap().getOrDefault(consensusGroupId, -1); + return currentLeaderId == leaderLocation.getDataNodeId() + && Objects.equals(routeVersionToken, buildRouteBindingToken(currentLeaderId)); + } + + private String buildRouteBindingToken(int leaderId) { + TRegionReplicaSet routeReplicaSet = + configManager.getLoadManager().getRegionPriorityMap().get(consensusGroupId); + if (routeReplicaSet == null || routeReplicaSet.getDataNodeLocations() == null) { + routeReplicaSet = + configManager + .getPartitionManager() + 
.getAllReplicaSetsMap(TConsensusGroupType.DataRegion) + .get(consensusGroupId); + } + List routeNodeIds = + routeReplicaSet == null || routeReplicaSet.getDataNodeLocations() == null + ? Collections.emptyList() + : routeReplicaSet.getDataNodeLocations().stream() + .map(TDataNodeLocation::getDataNodeId) + .collect(Collectors.toList()); + return leaderId + + "-" + + routeNodeIds.stream().map(String::valueOf).collect(Collectors.joining("_")); + } + + private String encodeRepairSelector(LogicalMismatchScope.Scope scope) { + if ((scope.getExactKeys() == null || scope.getExactKeys().isEmpty()) + && scope.getKeyRangeStart() == null + && scope.getKeyRangeEnd() == null) { + return scope.getLeafId(); + } + List exactKeys = + scope.getExactKeys() == null ? Collections.emptyList() : scope.getExactKeys(); + return scope.getLeafId() + + "@" + + encodeNullable(scope.getKeyRangeStart()) + + "@" + + encodeNullable(scope.getKeyRangeEnd()) + + "@" + + encodeNullable(String.join("\n", exactKeys)); + } + + private String encodeNullable(String value) { + if (value == null) { + return ""; + } + return java.util.Base64.getUrlEncoder() + .withoutPadding() + .encodeToString(value.getBytes(java.nio.charset.StandardCharsets.UTF_8)); + } + + private String chooseKeyRangeStart( + TSnapshotSubtreeNode leaderLeaf, TSnapshotSubtreeNode followerLeaf) { + if (leaderLeaf != null && leaderLeaf.isSetKeyRangeStart()) { + return leaderLeaf.getKeyRangeStart(); + } + return followerLeaf != null && followerLeaf.isSetKeyRangeStart() + ? followerLeaf.getKeyRangeStart() + : null; + } + + private String chooseKeyRangeEnd( + TSnapshotSubtreeNode leaderLeaf, TSnapshotSubtreeNode followerLeaf) { + if (leaderLeaf != null && leaderLeaf.isSetKeyRangeEnd()) { + return leaderLeaf.getKeyRangeEnd(); + } + return followerLeaf != null && followerLeaf.isSetKeyRangeEnd() + ? 
followerLeaf.getKeyRangeEnd() + : null; + } + + private Object sendRequest(TEndPoint endPoint, Object req, CnToDnSyncRequestType requestType) { + return SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithRetry(endPoint, req, requestType); + } + + private T sendRequestExpect( + TEndPoint endPoint, Object req, CnToDnSyncRequestType requestType, Class responseType) { + Object response = sendRequest(endPoint, req, requestType); + if (!responseType.isInstance(response)) { + throw new IllegalStateException( + requestType + + " returned unexpected response type " + + response.getClass().getSimpleName()); + } + return responseType.cast(response); + } + + private void requireSuccess(TSStatus status, String action) { + if (status == null || status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + throw new IllegalStateException( + action + " failed: " + (status == null ? "null status" : status.getMessage())); + } + } + + private static TConsensusGroupId requireDataRegion(TConsensusGroupId consensusGroupId) { + if (consensusGroupId == null || consensusGroupId.getType() != TConsensusGroupType.DataRegion) { + throw new IllegalArgumentException("Only DataRegion consistency repair is supported"); + } + return consensusGroupId; + } + + private static class EligibilitySnapshot { + private final boolean available; + private final long syncLag; + private final long safeWatermark; + private final Map partitionsById = + new LinkedHashMap<>(); + + private EligibilitySnapshot() { + this.available = false; + this.syncLag = Long.MAX_VALUE; + this.safeWatermark = Long.MIN_VALUE; + } + + private EligibilitySnapshot(TGetConsistencyEligibilityResp resp) { + this.available = true; + this.syncLag = resp.getSyncLag(); + this.safeWatermark = resp.getSafeWatermark(); + if (resp.isSetPartitions()) { + for (TPartitionConsistencyEligibility partition : resp.getPartitions()) { + partitionsById.put(partition.getTimePartitionId(), partition); + } + } + } + + private static 
EligibilitySnapshot unavailable() { + return new EligibilitySnapshot(); + } + + private boolean isAvailable() { + return available; + } + + private Map getPartitionsMap() { + return partitionsById; + } + } + + private static final class StaleSnapshotCompareException extends IllegalStateException { + private StaleSnapshotCompareException(String message) { + super(message); + } + } + + private static class RepairLeafKey { + private final int followerDataNodeId; + private final String treeKind; + private final String leafId; + private final String scopeRef; + + private RepairLeafKey(int followerDataNodeId, String treeKind, String leafId, String scopeRef) { + this.followerDataNodeId = followerDataNodeId; + this.treeKind = treeKind; + this.leafId = leafId; + this.scopeRef = scopeRef; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (!(obj instanceof RepairLeafKey)) { + return false; + } + RepairLeafKey that = (RepairLeafKey) obj; + return followerDataNodeId == that.followerDataNodeId + && Objects.equals(treeKind, that.treeKind) + && Objects.equals(leafId, that.leafId) + && Objects.equals(scopeRef, that.scopeRef); + } + + @Override + public int hashCode() { + return Objects.hash(followerDataNodeId, treeKind, leafId, scopeRef); + } + } + + private static class RepairOperation { + private final long partitionId; + private final TDataNodeLocation followerLocation; + private final LogicalMismatchScope.Scope scope; + private final String repairEpoch; + + private RepairOperation( + long partitionId, + TDataNodeLocation followerLocation, + LogicalMismatchScope.Scope scope, + String repairEpoch) { + this.partitionId = partitionId; + this.followerLocation = followerLocation; + this.scope = scope; + this.repairEpoch = repairEpoch; + } + } + + private final class LivePartitionRepairContext + implements RepairRegionProcedure.PartitionRepairContext { + private final long partitionId; + private final boolean rootHashMatched; + 
private final List repairOperationIds; + private final String repairEpoch; + private final String blockingReason; + + private LivePartitionRepairContext( + long partitionId, + boolean rootHashMatched, + List repairOperationIds, + String repairEpoch, + String blockingReason) { + this.partitionId = partitionId; + this.rootHashMatched = rootHashMatched; + this.repairOperationIds = repairOperationIds; + this.repairEpoch = repairEpoch; + this.blockingReason = blockingReason; + } + + @Override + public long getPartitionId() { + return partitionId; + } + + @Override + public boolean isRootHashMatched() { + return rootHashMatched; + } + + @Override + public List getRepairOperationIds() { + return repairOperationIds; + } + + @Override + public String getRepairEpoch() { + return repairEpoch; + } + + @Override + public String getBlockingReason() { + return blockingReason; + } + + @Override + public boolean verify() { + if (!isRouteBindingStillValid()) { + return false; + } + refreshLeaderEligibility(); + TPartitionConsistencyEligibility leaderEligibility = + leaderEligibilityByPartition.get(partitionId); + if (!isSnapshotComparable(leaderEligibility) + || !Objects.equals(repairEpoch, buildRepairEpoch(partitionId, leaderEligibility))) { + return false; + } + for (TDataNodeLocation followerLocation : followerLocations) { + TPartitionConsistencyEligibility followerEligibility = + fetchEligibility(followerLocation, false).getPartitionsMap().get(partitionId); + if (!isSnapshotComparable(followerEligibility) + || !sameRootDigest(leaderEligibility, followerEligibility) + || !sameTombstoneDigest(leaderEligibility, followerEligibility)) { + return false; + } + } + return true; + } + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/consistency/RepairRegionProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/consistency/RepairRegionProcedure.java new file mode 100644 index 
0000000000000..019af5dd4c98b --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/consistency/RepairRegionProcedure.java @@ -0,0 +1,667 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; +import org.apache.iotdb.confignode.procedure.state.consistency.RepairState; +import org.apache.iotdb.confignode.procedure.store.ProcedureType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; + +/** + * 
RepairRegionProcedure orchestrates partition-scoped replica repair for a single consensus group. + * + *

The current repair path is logical-snapshot driven: ConfigNode first identifies the exact + * partition/leaf mismatch scope, then the procedure executes the corresponding logical repair + * operations and verifies the partition after repair. + */ +public class RepairRegionProcedure + extends StateMachineProcedure { + + private static final Logger LOGGER = LoggerFactory.getLogger(RepairRegionProcedure.class); + + private static final ConcurrentHashMap EXECUTION_CONTEXTS = + new ConcurrentHashMap<>(); + + private TConsensusGroupId consensusGroupId; + private long tSafe; + private List pendingPartitions; + private int currentPartitionIndex; + private boolean hashMatched; + private String executionContextId; + private String lastFailureReason; + + private transient RepairExecutionContext executionContext; + private transient RepairProgressTable repairProgressTable; + private transient PartitionRepairContext currentPartitionContext; + private transient List repairOperationIds; + private transient Set executedRepairOperations; + + /** Required for deserialization. 
*/ + public RepairRegionProcedure() { + this(null, (String) null); + } + + public RepairRegionProcedure(TConsensusGroupId consensusGroupId) { + this(consensusGroupId, (String) null); + } + + public RepairRegionProcedure( + TConsensusGroupId consensusGroupId, RepairExecutionContext executionContext) { + this(consensusGroupId, registerExecutionContext(executionContext)); + this.executionContext = executionContext; + } + + public RepairRegionProcedure(TConsensusGroupId consensusGroupId, String executionContextId) { + this.consensusGroupId = consensusGroupId; + this.executionContextId = executionContextId; + this.pendingPartitions = new ArrayList<>(); + this.currentPartitionIndex = 0; + this.hashMatched = false; + initializeTransientState(); + } + + @Override + protected Flow executeFromState(ConfigNodeProcedureEnv env, RepairState state) + throws InterruptedException { + try { + switch (state) { + case INIT: + LOGGER.info("RepairRegionProcedure: INIT for group {}", consensusGroupId); + setNextState(RepairState.CHECK_SYNC_LAG); + break; + + case CHECK_SYNC_LAG: + LOGGER.info("RepairRegionProcedure: CHECK_SYNC_LAG for group {}", consensusGroupId); + if (!requireExecutionContext().isReplicationComplete()) { + LOGGER.info( + "RepairRegionProcedure: skipping group {} because replication is not complete", + consensusGroupId); + finishWithoutRepair(); + return Flow.NO_MORE_STATE; + } + setNextState(RepairState.COMPUTE_WATERMARK); + break; + + case COMPUTE_WATERMARK: + LOGGER.info("RepairRegionProcedure: COMPUTE_WATERMARK for group {}", consensusGroupId); + computeWatermarkAndPartitions(); + if (pendingPartitions.isEmpty()) { + LOGGER.info("No pending partitions to repair for group {}", consensusGroupId); + finishWithoutRepair(); + return Flow.NO_MORE_STATE; + } + setNextState(RepairState.PREPARE_LOGICAL_SNAPSHOT); + break; + + case PREPARE_LOGICAL_SNAPSHOT: + LOGGER.info( + "RepairRegionProcedure: PREPARE_LOGICAL_SNAPSHOT for partition {} of group {}", + 
getCurrentPartitionId(), + consensusGroupId); + buildPartitionContext(); + setNextState(RepairState.COMPARE_ROOT_HASH); + break; + + case COMPARE_ROOT_HASH: + hashMatched = requireCurrentPartitionContext().isRootHashMatched(); + if (hashMatched) { + LOGGER.info("Partition {} is already matched", getCurrentPartitionId()); + setNextState(RepairState.COMMIT_PARTITION); + } else { + LOGGER.info("Partition {} is mismatched", getCurrentPartitionId()); + setNextState(RepairState.DRILL_DOWN); + } + break; + + case DRILL_DOWN: + prepareRepairOperations(); + setNextState(RepairState.EXECUTE_REPAIR_OPERATIONS); + break; + + case EXECUTE_REPAIR_OPERATIONS: + executeRepairOperations(); + setNextState(RepairState.VERIFY_REPAIR); + break; + + case VERIFY_REPAIR: + if (hashMatched || requireCurrentPartitionContext().verify()) { + setNextState(RepairState.COMMIT_PARTITION); + } else { + lastFailureReason = + "Verification failed for partition " + + getCurrentPartitionId() + + " in group " + + consensusGroupId; + LOGGER.warn(lastFailureReason); + setNextState(RepairState.ROLLBACK); + } + break; + + case COMMIT_PARTITION: + commitPartition(); + currentPartitionIndex++; + if (currentPartitionIndex < pendingPartitions.size()) { + setNextState(RepairState.PREPARE_LOGICAL_SNAPSHOT); + } else { + setNextState(RepairState.ADVANCE_WATERMARK); + } + break; + + case ADVANCE_WATERMARK: + advanceWatermark(); + return Flow.NO_MORE_STATE; + + case ROLLBACK: + LOGGER.warn("RepairRegionProcedure: ROLLBACK for group {}", consensusGroupId); + rollback(); + return Flow.NO_MORE_STATE; + + case DONE: + finishWithoutRepair(); + return Flow.NO_MORE_STATE; + + default: + LOGGER.error("Unknown state: {}", state); + finishWithoutRepair(); + return Flow.NO_MORE_STATE; + } + } catch (Exception e) { + lastFailureReason = e.getMessage(); + LOGGER.error( + "Error in RepairRegionProcedure state {} for group {}: {}", + state, + consensusGroupId, + e.getMessage(), + e); + setNextState(RepairState.ROLLBACK); + } + 
return Flow.HAS_MORE_STATE; + } + + private void computeWatermarkAndPartitions() { + RepairExecutionContext context = requireExecutionContext(); + RepairProgressTable progressTable = getOrCreateRepairProgressTable(); + tSafe = context.computeSafeWatermark(); + + pendingPartitions.clear(); + if (tSafe == Long.MIN_VALUE) { + return; + } + + List candidatePartitions = + safeList(context.collectPendingPartitions(tSafe, progressTable)); + candidatePartitions.stream() + .filter(Objects::nonNull) + .filter(partitionId -> partitionId <= tSafe) + .distinct() + .sorted(Comparator.naturalOrder()) + .forEach( + partitionId -> { + pendingPartitions.add(partitionId); + progressTable.getOrCreatePartition(partitionId); + }); + persistRepairProgressTable(progressTable); + } + + private void buildPartitionContext() { + resetCurrentPartitionState(); + long partitionId = getCurrentPartitionId(); + currentPartitionContext = requireExecutionContext().getPartitionContext(partitionId); + if (currentPartitionContext == null) { + throw new IllegalStateException( + "Missing partition context for partition " + + partitionId + + " in group " + + consensusGroupId); + } + if (currentPartitionContext.getPartitionId() != partitionId) { + throw new IllegalStateException( + "Partition context mismatch: expected " + + partitionId + + ", actual " + + currentPartitionContext.getPartitionId()); + } + } + + private void prepareRepairOperations() { + PartitionRepairContext partitionContext = requireCurrentPartitionContext(); + LOGGER.info( + "Partition {} has {} logical repair operations in the cached mismatch scope", + partitionContext.getPartitionId(), + safeList(partitionContext.getRepairOperationIds()).size()); + + List operationIds = safeList(partitionContext.getRepairOperationIds()); + if (operationIds.isEmpty()) { + String blockingReason = partitionContext.getBlockingReason(); + throw new IllegalStateException( + blockingReason != null + ? 
blockingReason + : ("Partition " + + partitionContext.getPartitionId() + + " is mismatched but no repair operations were provided")); + } + + for (String operationId : operationIds) { + repairOperationIds.add(operationId); + } + + RepairProgressTable progressTable = getOrCreateRepairProgressTable(); + String repairEpoch = + partitionContext.getRepairEpoch() != null + ? partitionContext.getRepairEpoch() + : progressTable + .getOrCreatePartition(partitionContext.getPartitionId()) + .getRepairEpoch(); + progressTable.markRepairRunning(partitionContext.getPartitionId(), repairEpoch); + persistRepairProgressTable(progressTable); + } + + private void executeRepairOperations() throws Exception { + for (String operationId : repairOperationIds) { + if (!executedRepairOperations.add(operationId)) { + continue; + } + requireExecutionContext().executeRepairOperation(operationId); + } + } + + private void commitPartition() { + long partitionId = getCurrentPartitionId(); + RepairProgressTable progressTable = getOrCreateRepairProgressTable(); + long checkedAt = System.currentTimeMillis(); + requireExecutionContext().onPartitionCommitted(partitionId, checkedAt, progressTable); + RepairProgressTable.PartitionProgress progress = + progressTable.getOrCreatePartition(partitionId); + if (progress.getCheckState() != RepairProgressTable.CheckState.VERIFIED) { + progressTable.markRepairSucceeded( + partitionId, + checkedAt, + tSafe, + progress.getPartitionMutationEpoch(), + progress.getSnapshotEpoch(), + progress.getSnapshotState(), + progress.getRepairEpoch()); + } + persistRepairProgressTable(progressTable); + LOGGER.info("Committed partition {} for group {}", partitionId, consensusGroupId); + resetCurrentPartitionState(); + } + + private void advanceWatermark() { + persistRepairProgressTable(getOrCreateRepairProgressTable()); + LOGGER.info( + "Finished repair procedure for group {} at safe watermark {}", consensusGroupId, tSafe); + cleanupExecutionContext(); + } + + protected void 
rollback() { + long partitionId = getCurrentPartitionId(); + RepairProgressTable progressTable = getOrCreateRepairProgressTable(); + if (partitionId >= 0) { + RepairProgressTable.PartitionProgress partitionProgress = + progressTable.getOrCreatePartition(partitionId); + progressTable.markRepairFailed( + partitionId, + partitionProgress.getRepairEpoch(), + "REPAIR_FAILED", + lastFailureReason == null ? "Unknown repair failure" : lastFailureReason); + progressTable.markCheckFailed( + partitionId, + System.currentTimeMillis(), + tSafe, + partitionProgress.getPartitionMutationEpoch(), + partitionProgress.getSnapshotEpoch(), + partitionProgress.getSnapshotState(), + "REPAIR_FAILED", + lastFailureReason == null ? "Unknown repair failure" : lastFailureReason); + } + RepairExecutionContext context = getExecutionContextIfPresent(); + if (context != null && partitionId >= 0) { + context.rollbackPartition(partitionId, progressTable); + } + persistRepairProgressTable(progressTable); + LOGGER.warn("Rolled back repair for group {}", consensusGroupId); + cleanupExecutionContext(); + } + + private long getCurrentPartitionId() { + if (pendingPartitions.isEmpty() || currentPartitionIndex >= pendingPartitions.size()) { + return -1; + } + return pendingPartitions.get(currentPartitionIndex); + } + + @Override + protected RepairState getState(int stateId) { + return RepairState.values()[stateId]; + } + + @Override + protected int getStateId(RepairState repairState) { + return repairState.ordinal(); + } + + @Override + protected RepairState getInitialState() { + return RepairState.INIT; + } + + @Override + protected void rollbackState(ConfigNodeProcedureEnv env, RepairState state) { + LOGGER.warn("Rollback requested for state {} in group {}", state, consensusGroupId); + } + + @Override + public void serialize(DataOutputStream stream) throws IOException { + stream.writeShort(ProcedureType.REPAIR_REGION_PROCEDURE.getTypeCode()); + super.serialize(stream); + + 
stream.writeBoolean(consensusGroupId != null); + if (consensusGroupId != null) { + stream.writeInt(consensusGroupId.getId()); + stream.writeInt(consensusGroupId.getType().getValue()); + } + stream.writeLong(tSafe); + stream.writeInt(currentPartitionIndex); + stream.writeBoolean(hashMatched); + writeString(stream, executionContextId); + writeString(stream, lastFailureReason); + + stream.writeInt(pendingPartitions.size()); + for (long partitionId : pendingPartitions) { + stream.writeLong(partitionId); + } + } + + @Override + public void deserialize(ByteBuffer byteBuffer) { + super.deserialize(byteBuffer); + + if (byteBuffer.get() != 0) { + int groupId = byteBuffer.getInt(); + int groupType = byteBuffer.getInt(); + this.consensusGroupId = new TConsensusGroupId(); + this.consensusGroupId.setId(groupId); + this.consensusGroupId.setType( + org.apache.iotdb.common.rpc.thrift.TConsensusGroupType.findByValue(groupType)); + } else { + this.consensusGroupId = null; + } + + this.tSafe = byteBuffer.getLong(); + this.currentPartitionIndex = byteBuffer.getInt(); + this.hashMatched = byteBuffer.get() != 0; + this.executionContextId = readString(byteBuffer); + this.lastFailureReason = readString(byteBuffer); + + int partitionCount = byteBuffer.getInt(); + this.pendingPartitions = new ArrayList<>(partitionCount); + for (int i = 0; i < partitionCount; i++) { + pendingPartitions.add(byteBuffer.getLong()); + } + initializeTransientState(); + } + + public TConsensusGroupId getConsensusGroupId() { + return consensusGroupId; + } + + public String getExecutionContextId() { + return executionContextId; + } + + public static String registerExecutionContext(RepairExecutionContext executionContext) { + if (executionContext == null) { + return null; + } + String contextId = UUID.randomUUID().toString(); + EXECUTION_CONTEXTS.put(contextId, executionContext); + return contextId; + } + + public static void unregisterExecutionContext(String contextId) { + if (contextId == null) { + return; + } + 
EXECUTION_CONTEXTS.remove(contextId); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof RepairRegionProcedure)) { + return false; + } + RepairRegionProcedure that = (RepairRegionProcedure) o; + return tSafe == that.tSafe + && currentPartitionIndex == that.currentPartitionIndex + && hashMatched == that.hashMatched + && Objects.equals(consensusGroupId, that.consensusGroupId) + && Objects.equals(pendingPartitions, that.pendingPartitions) + && Objects.equals(executionContextId, that.executionContextId) + && Objects.equals(lastFailureReason, that.lastFailureReason); + } + + @Override + public int hashCode() { + return Objects.hash( + consensusGroupId, + tSafe, + pendingPartitions, + currentPartitionIndex, + hashMatched, + executionContextId, + lastFailureReason); + } + + private void initializeTransientState() { + this.executionContext = getExecutionContextIfPresent(); + this.currentPartitionContext = null; + this.repairProgressTable = null; + this.repairOperationIds = new ArrayList<>(); + this.executedRepairOperations = new LinkedHashSet<>(); + } + + private void resetCurrentPartitionState() { + this.currentPartitionContext = null; + this.repairOperationIds.clear(); + this.executedRepairOperations.clear(); + this.hashMatched = false; + this.lastFailureReason = null; + } + + private RepairExecutionContext requireExecutionContext() { + RepairExecutionContext context = getExecutionContextIfPresent(); + if (context == null) { + throw new IllegalStateException( + "No execution context registered for repair procedure of group " + + consensusGroupId + + ". 
Expected context id: " + + executionContextId); + } + return context; + } + + private RepairExecutionContext getExecutionContextIfPresent() { + if (executionContext == null && executionContextId != null) { + executionContext = EXECUTION_CONTEXTS.get(executionContextId); + } + return executionContext; + } + + private PartitionRepairContext requireCurrentPartitionContext() { + if (currentPartitionContext == null) { + throw new IllegalStateException( + "Partition context has not been prepared for partition " + getCurrentPartitionId()); + } + return currentPartitionContext; + } + + private RepairProgressTable getOrCreateRepairProgressTable() { + if (repairProgressTable == null) { + RepairExecutionContext context = getExecutionContextIfPresent(); + if (context != null) { + repairProgressTable = + context.loadRepairProgressTable(toConsensusGroupKey(consensusGroupId)); + } + if (repairProgressTable == null) { + repairProgressTable = new RepairProgressTable(toConsensusGroupKey(consensusGroupId)); + } + } + return repairProgressTable; + } + + private void finishWithoutRepair() { + if (repairProgressTable != null) { + persistRepairProgressTable(repairProgressTable); + } + cleanupExecutionContext(); + } + + private void persistRepairProgressTable(RepairProgressTable progressTable) { + RepairExecutionContext context = getExecutionContextIfPresent(); + if (context != null && progressTable != null) { + context.persistRepairProgressTable(progressTable); + } + } + + private void cleanupExecutionContext() { + RepairExecutionContext context = getExecutionContextIfPresent(); + if (context != null) { + try { + context.close(); + } catch (Exception e) { + LOGGER.warn( + "Failed to close repair execution context {}: {}", + executionContextId, + e.getMessage(), + e); + } + } + unregisterExecutionContext(executionContextId); + executionContext = null; + } + + private static List safeList(List list) { + return list == null ? 
Collections.emptyList() : list; + } + + private static String toConsensusGroupKey(TConsensusGroupId consensusGroupId) { + if (consensusGroupId == null) { + return "unknown"; + } + return consensusGroupId.getType() + "-" + consensusGroupId.getId(); + } + + private static void writeString(DataOutputStream stream, String value) throws IOException { + if (value == null) { + stream.writeInt(-1); + return; + } + byte[] bytes = value.getBytes(StandardCharsets.UTF_8); + stream.writeInt(bytes.length); + stream.write(bytes); + } + + private static String readString(ByteBuffer byteBuffer) { + int length = byteBuffer.getInt(); + if (length < 0) { + return null; + } + byte[] bytes = new byte[length]; + byteBuffer.get(bytes); + return new String(bytes, StandardCharsets.UTF_8); + } + + /** + * Bridge between the state machine and the transport/runtime-specific implementation that + * provides logical-snapshot mismatch repair operations. + */ + public interface RepairExecutionContext extends AutoCloseable { + + boolean isReplicationComplete(); + + long computeSafeWatermark(); + + List collectPendingPartitions( + long safeWatermark, RepairProgressTable repairProgressTable); + + PartitionRepairContext getPartitionContext(long partitionId); + + void executeRepairOperation(String operationId) throws Exception; + + default RepairProgressTable loadRepairProgressTable(String consensusGroupKey) { + return null; + } + + default void persistRepairProgressTable(RepairProgressTable repairProgressTable) {} + + default void onPartitionCommitted( + long partitionId, long committedAt, RepairProgressTable repairProgressTable) {} + + default void rollbackPartition(long partitionId, RepairProgressTable repairProgressTable) {} + + @Override + default void close() {} + } + + /** Immutable repair inputs for a single time partition. 
*/ + public interface PartitionRepairContext { + + long getPartitionId(); + + boolean isRootHashMatched(); + + List getRepairOperationIds(); + + default String getRepairEpoch() { + return null; + } + + default String getBlockingReason() { + return null; + } + + default boolean verify() { + return true; + } + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/consistency/RepairState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/consistency/RepairState.java new file mode 100644 index 0000000000000..f0d263380de51 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/consistency/RepairState.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.state.consistency; + +/** State machine states for the RepairRegionProcedure. 
*/ +public enum RepairState { + INIT, + CHECK_SYNC_LAG, + COMPUTE_WATERMARK, + PREPARE_LOGICAL_SNAPSHOT, + COMPARE_ROOT_HASH, + DRILL_DOWN, + EXECUTE_REPAIR_OPERATIONS, + VERIFY_REPAIR, + COMMIT_PARTITION, + ADVANCE_WATERMARK, + ROLLBACK, + DONE +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java index dd15558608718..474ca6f618171 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java @@ -21,6 +21,7 @@ import org.apache.iotdb.commons.exception.runtime.ThriftSerDeException; import org.apache.iotdb.confignode.procedure.Procedure; +import org.apache.iotdb.confignode.procedure.impl.consistency.RepairRegionProcedure; import org.apache.iotdb.confignode.procedure.impl.cq.CreateCQProcedure; import org.apache.iotdb.confignode.procedure.impl.node.AddConfigNodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure; @@ -395,6 +396,9 @@ public Procedure create(ByteBuffer buffer) throws IOException { case CONSUMER_GROUP_META_SYNC_PROCEDURE: procedure = new ConsumerGroupMetaSyncProcedure(); break; + case REPAIR_REGION_PROCEDURE: + procedure = new RepairRegionProcedure(); + break; case CREATE_MANY_DATABASES_PROCEDURE: procedure = new CreateManyDatabasesProcedure(); break; @@ -540,6 +544,8 @@ public static ProcedureType getProcedureType(final Procedure procedure) { return ProcedureType.ALTER_CONSUMER_GROUP_PROCEDURE; } else if (procedure instanceof ConsumerGroupMetaSyncProcedure) { return ProcedureType.CONSUMER_GROUP_META_SYNC_PROCEDURE; + } else if (procedure instanceof RepairRegionProcedure) { + return ProcedureType.REPAIR_REGION_PROCEDURE; } else if (procedure instanceof DeleteLogicalViewProcedure) { return 
ProcedureType.DELETE_LOGICAL_VIEW_PROCEDURE; } else if (procedure instanceof AlterLogicalViewProcedure) { diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java index 820a90f7ebfb9..c693c110a676f 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java @@ -168,6 +168,9 @@ public enum ProcedureType { TOPIC_META_SYNC_PROCEDURE((short) 1508), CONSUMER_GROUP_META_SYNC_PROCEDURE((short) 1509), + /** Consistency */ + REPAIR_REGION_PROCEDURE((short) 1600), + /** Other */ @TestOnly NEVER_FINISH_PROCEDURE((short) 30000), diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java index 5d6aa8da9f5df..c9a08f0b331c5 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/thrift/ConfigNodeRPCServiceProcessor.java @@ -207,6 +207,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TShowPipeResp; import org.apache.iotdb.confignode.rpc.thrift.TShowRegionReq; import org.apache.iotdb.confignode.rpc.thrift.TShowRegionResp; +import org.apache.iotdb.confignode.rpc.thrift.TShowRepairProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TShowSubscriptionReq; import org.apache.iotdb.confignode.rpc.thrift.TShowSubscriptionResp; import org.apache.iotdb.confignode.rpc.thrift.TShowTTLResp; @@ -223,6 +224,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TSystemConfigurationResp; import org.apache.iotdb.confignode.rpc.thrift.TTestOperation; import 
org.apache.iotdb.confignode.rpc.thrift.TThrottleQuotaResp; +import org.apache.iotdb.confignode.rpc.thrift.TTriggerRegionConsistencyRepairReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsetSchemaTemplateReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.confignode.service.ConfigNode; @@ -1097,6 +1099,11 @@ public TShowConfigNodes4InformationSchemaResp showConfigNodes4InformationSchema( return configManager.showConfigNodes4InformationSchema(); } + @Override + public TShowRepairProgressResp showRepairProgress() { + return configManager.showRepairProgress(); + } + @Override public TShowDatabaseResp showDatabase(TGetDatabaseReq req) { return configManager.showDatabase(req); @@ -1353,6 +1360,12 @@ public TSStatus removeRegion(TRemoveRegionReq req) throws TException { return configManager.removeRegion(req); } + @Override + public TSStatus triggerRegionConsistencyRepair(TTriggerRegionConsistencyRepairReq req) + throws TException { + return configManager.triggerRegionConsistencyRepair(req); + } + @Override public TSStatus createCQ(TCreateCQReq req) { return configManager.createCQ(req); diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyCheckSchedulerTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyCheckSchedulerTest.java new file mode 100644 index 0000000000000..6baf304af99cc --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/manager/consistency/ConsistencyCheckSchedulerTest.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.manager.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.confignode.manager.ConfigManager; +import org.apache.iotdb.confignode.manager.ProcedureManager; +import org.apache.iotdb.confignode.manager.partition.PartitionManager; +import org.apache.iotdb.confignode.persistence.ConsistencyProgressInfo; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class ConsistencyCheckSchedulerTest { + + @Test + public void schedulerShouldSkipRegionsWithRunningRepairProcedure() { + ConfigManager configManager = mock(ConfigManager.class); + ProcedureManager procedureManager = mock(ProcedureManager.class); + PartitionManager partitionManager = mock(PartitionManager.class); + + TConsensusGroupId region1 = new TConsensusGroupId(TConsensusGroupType.DataRegion, 1); + TConsensusGroupId region2 = new TConsensusGroupId(TConsensusGroupType.DataRegion, 2); + Map replicaSets = new LinkedHashMap<>(); + replicaSets.put(region1, new TRegionReplicaSet()); + replicaSets.put(region2, new TRegionReplicaSet()); + + ConsistencyProgressManager progressManager = + new 
ConsistencyProgressManager(new ConsistencyProgressInfo()); + List checkedRegions = new ArrayList<>(); + + when(configManager.getPartitionManager()).thenReturn(partitionManager); + when(configManager.getConsistencyProgressManager()).thenReturn(progressManager); + when(partitionManager.getAllReplicaSetsMap(TConsensusGroupType.DataRegion)) + .thenReturn(replicaSets); + when(procedureManager.hasRunningRepairProcedure(region1)).thenReturn(false); + when(procedureManager.hasRunningRepairProcedure(region2)).thenReturn(true); + + ConsistencyCheckScheduler scheduler = + new ConsistencyCheckScheduler( + configManager, + procedureManager, + 0L, + 1L, + (manager, consensusGroupId, progressTable) -> { + checkedRegions.add(consensusGroupId); + progressTable.markVerified( + 100L, 1000L, 2000L, 3000L, 3000L, RepairProgressTable.SnapshotState.READY); + }); + + scheduler.runOneRound(); + + Assert.assertEquals(1, checkedRegions.size()); + Assert.assertEquals(region1, checkedRegions.get(0)); + Assert.assertNotNull(progressManager.loadRepairProgressTable(region1).getPartition(100L)); + Assert.assertNull(progressManager.loadRepairProgressTable(region2).getPartition(100L)); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/persistence/ConsistencyProgressInfoTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/persistence/ConsistencyProgressInfoTest.java new file mode 100644 index 0000000000000..c0890c0f5a1b7 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/persistence/ConsistencyProgressInfoTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.persistence; + +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; + +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; + +public class ConsistencyProgressInfoTest { + + @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); + + @Test + public void snapshotShouldRestorePersistedProgressTables() throws Exception { + ConsistencyProgressInfo info = new ConsistencyProgressInfo(); + RepairProgressTable table = new RepairProgressTable("DataRegion-5"); + table.markMismatch( + 100L, + 1000L, + 2000L, + 3000L, + 3000L, + RepairProgressTable.SnapshotState.READY, + "LIVE@leaf:5:0", + 1, + "leader:5:2000:3000:3000"); + table.markRepairRunning(100L, "leader:5:2000:3000:3000"); + info.updateTable(table); + + File snapshotDir = temporaryFolder.newFolder("consistency-progress-snapshot"); + Assert.assertTrue(info.processTakeSnapshot(snapshotDir)); + + ConsistencyProgressInfo recovered = new ConsistencyProgressInfo(); + recovered.processLoadSnapshot(snapshotDir); + + Assert.assertEquals(info.getAllTables(), recovered.getAllTables()); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/consistency/ConsistencyPartitionSelectorTest.java 
b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/consistency/ConsistencyPartitionSelectorTest.java new file mode 100644 index 0000000000000..833ce319f2f87 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/consistency/ConsistencyPartitionSelectorTest.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.procedure.impl.consistency; + +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.mpp.rpc.thrift.TPartitionConsistencyEligibility; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +public class ConsistencyPartitionSelectorTest { + + @Test + public void verifiedPartitionWithSameSnapshotShouldNotBeRequeued() { + RepairProgressTable progressTable = new RepairProgressTable("DataRegion-1"); + progressTable.markVerified(1L, 100L, 200L, 300L, 300L, RepairProgressTable.SnapshotState.READY); + + Map eligibilityByPartition = new LinkedHashMap<>(); + eligibilityByPartition.put(1L, eligibility(1L, 300L, 300L, "READY")); + eligibilityByPartition.put(2L, eligibility(2L, 301L, 301L, "READY")); + + List selected = + ConsistencyPartitionSelector.selectCandidatePartitions( + eligibilityByPartition, Collections.emptySet(), false, null, progressTable); + + Assert.assertEquals(Collections.singletonList(2L), selected); + } + + @Test + public void dirtyAndBuildingPartitionsShouldBeRechecked() { + RepairProgressTable progressTable = new RepairProgressTable("DataRegion-1"); + progressTable.markVerified(1L, 100L, 200L, 300L, 300L, RepairProgressTable.SnapshotState.READY); + progressTable.markVerified(2L, 100L, 200L, 400L, 400L, RepairProgressTable.SnapshotState.READY); + + Map eligibilityByPartition = new LinkedHashMap<>(); + eligibilityByPartition.put(1L, eligibility(1L, 301L, 301L, "READY")); + eligibilityByPartition.put(2L, eligibility(2L, 400L, 400L, "BUILDING")); + + List selected = + ConsistencyPartitionSelector.selectCandidatePartitions( + eligibilityByPartition, Collections.emptySet(), false, null, progressTable); + + Assert.assertEquals(Arrays.asList(1L, 2L), selected); + } + + @Test + public void 
verifiedPartitionShouldBeRequeuedWhenReplicaObservationChanges() { + RepairProgressTable progressTable = new RepairProgressTable("DataRegion-1"); + progressTable.markVerified( + 1L, + 100L, + 200L, + 300L, + 300L, + RepairProgressTable.SnapshotState.READY, + "4:300:300:READY|3:300:300:READY"); + + Map eligibilityByPartition = new LinkedHashMap<>(); + eligibilityByPartition.put(1L, eligibility(1L, 300L, 300L, "READY")); + + Map replicaObservationTokens = new LinkedHashMap<>(); + replicaObservationTokens.put(1L, "4:300:300:READY|3:0:0:READY"); + + List selected = + ConsistencyPartitionSelector.selectCandidatePartitions( + eligibilityByPartition, + Collections.emptySet(), + false, + null, + progressTable, + replicaObservationTokens); + + Assert.assertEquals(Collections.singletonList(1L), selected); + } + + @Test + public void repairModeShouldPreferLatestMismatchScope() { + RepairProgressTable progressTable = new RepairProgressTable("DataRegion-1"); + progressTable.markMismatch( + 1L, + 100L, + 200L, + 300L, + 300L, + RepairProgressTable.SnapshotState.READY, + "LIVE@leaf:1:0", + 1, + "leader:1:200:300:300"); + progressTable.markVerified(2L, 100L, 200L, 400L, 400L, RepairProgressTable.SnapshotState.READY); + + Map eligibilityByPartition = new LinkedHashMap<>(); + eligibilityByPartition.put(1L, eligibility(1L, 300L, 300L, "READY")); + eligibilityByPartition.put(2L, eligibility(2L, 401L, 401L, "READY")); + + List selected = + ConsistencyPartitionSelector.selectCandidatePartitions( + eligibilityByPartition, + Collections.emptySet(), + true, + "leader:1:200:300:300", + progressTable); + + Assert.assertEquals(Collections.singletonList(1L), selected); + } + + private static TPartitionConsistencyEligibility eligibility( + long partitionId, long partitionMutationEpoch, long snapshotEpoch, String snapshotState) { + return new TPartitionConsistencyEligibility( + partitionId, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + partitionId, + partitionId, + partitionId, + 
partitionId); + } +} diff --git a/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/consistency/RepairRegionProcedureTest.java b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/consistency/RepairRegionProcedureTest.java new file mode 100644 index 0000000000000..a882c36ba4857 --- /dev/null +++ b/iotdb-core/confignode/src/test/java/org/apache/iotdb/confignode/procedure/impl/consistency/RepairRegionProcedureTest.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.confignode.procedure.impl.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.confignode.procedure.Procedure; +import org.apache.iotdb.confignode.procedure.store.ProcedureFactory; + +import org.apache.tsfile.utils.PublicBAOS; +import org.junit.Assert; +import org.junit.Test; + +import java.io.DataOutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.function.BooleanSupplier; + +public class RepairRegionProcedureTest { + + @Test + public void serDeTest() throws Exception { + TConsensusGroupId groupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 10); + RepairRegionProcedure procedure = + new RepairRegionProcedure(groupId, new TestExecutionContext(Collections.emptyMap())); + try (PublicBAOS byteArrayOutputStream = new PublicBAOS(); + DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + procedure.serialize(outputStream); + ByteBuffer buffer = + ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + Assert.assertEquals(procedure, ProcedureFactory.getInstance().create(buffer)); + } finally { + RepairRegionProcedure.unregisterExecutionContext(procedure.getExecutionContextId()); + } + } + + @Test + public void executeRepairOperationFlowTest() throws Exception { + TConsensusGroupId groupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 1); + String repairEpoch = "leader:1:1000:2000:2000"; + List operationIds = Arrays.asList("live@leaf:1:0", "tombstone@leaf:1:0"); + + TestExecutionContext executionContext = new TestExecutionContext(new LinkedHashMap<>()); + 
executionContext.lastPersistedTable.markMismatch( + 0L, + 100L, + 1000L, + 2000L, + 2000L, + RepairProgressTable.SnapshotState.READY, + "LIVE@leaf:1:0,TOMBSTONE@leaf:1:0", + 2, + repairEpoch); + executionContext.partitionContexts.put( + 0L, + new TestPartitionRepairContext( + 0L, + false, + operationIds, + repairEpoch, + () -> executionContext.executedOperationIds.equals(operationIds))); + + ExposedRepairRegionProcedure procedure = + new ExposedRepairRegionProcedure(groupId, executionContext); + + executeProcedureToCompletion(procedure, 16); + + Assert.assertEquals(operationIds, executionContext.executedOperationIds); + Assert.assertEquals(Collections.singletonList(0L), executionContext.committedPartitions); + Assert.assertTrue(executionContext.closed); + Assert.assertFalse(executionContext.rolledBack); + + RepairProgressTable.PartitionProgress progress = + executionContext.lastPersistedTable.getPartition(0L); + Assert.assertNotNull(progress); + Assert.assertEquals(RepairProgressTable.CheckState.VERIFIED, progress.getCheckState()); + Assert.assertEquals(RepairProgressTable.RepairState.SUCCEEDED, progress.getRepairState()); + Assert.assertEquals(repairEpoch, progress.getRepairEpoch()); + Assert.assertNull(progress.getLastErrorCode()); + } + + @Test + public void executeRepairOperationFailureRollsBackTest() throws Exception { + TConsensusGroupId groupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 2); + String repairEpoch = "leader:2:1000:3000:3000"; + + TestExecutionContext executionContext = new TestExecutionContext(new LinkedHashMap<>()); + executionContext.lastPersistedTable.markMismatch( + 0L, + 100L, + 1000L, + 3000L, + 3000L, + RepairProgressTable.SnapshotState.READY, + "LIVE@leaf:2:0", + 1, + repairEpoch); + executionContext.operationToFail = "live@leaf:2:0"; + executionContext.partitionContexts.put( + 0L, + new TestPartitionRepairContext( + 0L, false, Collections.singletonList("live@leaf:2:0"), repairEpoch, () -> false)); + + 
ExposedRepairRegionProcedure procedure = + new ExposedRepairRegionProcedure(groupId, executionContext); + + executeProcedureToCompletion(procedure, 16); + + Assert.assertEquals( + Collections.singletonList("live@leaf:2:0"), executionContext.executedOperationIds); + Assert.assertTrue(executionContext.closed); + Assert.assertTrue(executionContext.rolledBack); + Assert.assertTrue(executionContext.committedPartitions.isEmpty()); + + RepairProgressTable.PartitionProgress progress = + executionContext.lastPersistedTable.getPartition(0L); + Assert.assertNotNull(progress); + Assert.assertEquals(RepairProgressTable.CheckState.FAILED, progress.getCheckState()); + Assert.assertEquals(RepairProgressTable.RepairState.FAILED, progress.getRepairState()); + Assert.assertEquals("REPAIR_FAILED", progress.getLastErrorCode()); + Assert.assertEquals(repairEpoch, progress.getRepairEpoch()); + } + + @Test + public void blockingReasonShouldFailFastWithoutRepairOperations() throws Exception { + TConsensusGroupId groupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 3); + String repairEpoch = "leader:3:1000:4000:4000"; + String blockingReason = + "Partition 0 contains follower-only tombstone mismatches that cannot be rolled back safely yet"; + + TestExecutionContext executionContext = new TestExecutionContext(new LinkedHashMap<>()); + executionContext.lastPersistedTable.markMismatch( + 0L, + 100L, + 1000L, + 4000L, + 4000L, + RepairProgressTable.SnapshotState.READY, + "TOMBSTONE@leaf:3:0", + 1, + repairEpoch); + executionContext.partitionContexts.put( + 0L, + new TestPartitionRepairContext( + 0L, false, Collections.emptyList(), repairEpoch, blockingReason, () -> false)); + + ExposedRepairRegionProcedure procedure = + new ExposedRepairRegionProcedure(groupId, executionContext); + + executeProcedureToCompletion(procedure, 16); + + Assert.assertTrue(executionContext.executedOperationIds.isEmpty()); + Assert.assertTrue(executionContext.closed); + 
Assert.assertTrue(executionContext.rolledBack); + + RepairProgressTable.PartitionProgress progress = + executionContext.lastPersistedTable.getPartition(0L); + Assert.assertNotNull(progress); + Assert.assertEquals(RepairProgressTable.CheckState.FAILED, progress.getCheckState()); + Assert.assertEquals(RepairProgressTable.RepairState.FAILED, progress.getRepairState()); + Assert.assertEquals("REPAIR_FAILED", progress.getLastErrorCode()); + Assert.assertTrue(progress.getLastErrorMessage().contains(blockingReason)); + } + + private static void executeProcedureToCompletion( + ExposedRepairRegionProcedure procedure, int maxSteps) throws Exception { + int steps = 0; + Procedure[] next; + do { + next = procedure.executeOnce(); + steps++; + } while (next != null && steps < maxSteps); + Assert.assertTrue("procedure should finish within " + maxSteps + " steps", steps < maxSteps); + } + + private static final class ExposedRepairRegionProcedure extends RepairRegionProcedure { + + private ExposedRepairRegionProcedure( + TConsensusGroupId consensusGroupId, RepairExecutionContext executionContext) { + super(consensusGroupId, executionContext); + } + + private Procedure[] executeOnce() throws InterruptedException { + return doExecute(null); + } + } + + private static final class TestExecutionContext + implements RepairRegionProcedure.RepairExecutionContext { + + private final Map partitionContexts; + private final List executedOperationIds = new ArrayList<>(); + private final List committedPartitions = new ArrayList<>(); + private RepairProgressTable lastPersistedTable = new RepairProgressTable("DataRegion-unknown"); + private String operationToFail; + private boolean closed; + private boolean rolledBack; + + private TestExecutionContext( + Map partitionContexts) { + this.partitionContexts = partitionContexts; + } + + @Override + public boolean isReplicationComplete() { + return true; + } + + @Override + public long computeSafeWatermark() { + return 1000L; + } + + @Override + public 
List collectPendingPartitions( + long safeWatermark, RepairProgressTable repairProgressTable) { + return new ArrayList<>(partitionContexts.keySet()); + } + + @Override + public RepairRegionProcedure.PartitionRepairContext getPartitionContext(long partitionId) { + return partitionContexts.get(partitionId); + } + + @Override + public void executeRepairOperation(String operationId) { + executedOperationIds.add(operationId); + if (operationId.equals(operationToFail)) { + throw new IllegalStateException("Injected repair failure for " + operationId); + } + } + + @Override + public RepairProgressTable loadRepairProgressTable(String consensusGroupKey) { + return lastPersistedTable.copy(); + } + + @Override + public void persistRepairProgressTable(RepairProgressTable repairProgressTable) { + this.lastPersistedTable = repairProgressTable.copy(); + } + + @Override + public void onPartitionCommitted( + long partitionId, long committedAt, RepairProgressTable repairProgressTable) { + committedPartitions.add(partitionId); + } + + @Override + public void rollbackPartition(long partitionId, RepairProgressTable repairProgressTable) { + rolledBack = true; + } + + @Override + public void close() { + closed = true; + } + } + + private static final class TestPartitionRepairContext + implements RepairRegionProcedure.PartitionRepairContext { + + private final long partitionId; + private final boolean rootHashMatched; + private final List repairOperationIds; + private final String repairEpoch; + private final String blockingReason; + private final BooleanSupplier verifier; + + private TestPartitionRepairContext( + long partitionId, + boolean rootHashMatched, + List repairOperationIds, + String repairEpoch, + String blockingReason, + BooleanSupplier verifier) { + this.partitionId = partitionId; + this.rootHashMatched = rootHashMatched; + this.repairOperationIds = repairOperationIds; + this.repairEpoch = repairEpoch; + this.blockingReason = blockingReason; + this.verifier = verifier; + } + + 
private TestPartitionRepairContext( + long partitionId, + boolean rootHashMatched, + List repairOperationIds, + String repairEpoch, + BooleanSupplier verifier) { + this(partitionId, rootHashMatched, repairOperationIds, repairEpoch, null, verifier); + } + + @Override + public long getPartitionId() { + return partitionId; + } + + @Override + public boolean isRootHashMatched() { + return rootHashMatched; + } + + @Override + public List getRepairOperationIds() { + return repairOperationIds; + } + + @Override + public String getRepairEpoch() { + return repairEpoch; + } + + @Override + public String getBlockingReason() { + return blockingReason; + } + + @Override + public boolean verify() { + return verifier.getAsBoolean(); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSink.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSink.java index 3001c40b16fbf..1b41e2cda56ae 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSink.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSink.java @@ -38,7 +38,6 @@ import org.apache.iotdb.consensus.pipe.metric.IoTConsensusV2SyncLagManager; import org.apache.iotdb.db.conf.IoTDBConfig; import org.apache.iotdb.db.conf.IoTDBDescriptor; -import org.apache.iotdb.db.pipe.consensus.ReplicateProgressDataNodeManager; import org.apache.iotdb.db.pipe.consensus.metric.IoTConsensusV2SinkMetrics; import org.apache.iotdb.db.pipe.event.common.PipeInsertionEvent; import org.apache.iotdb.db.pipe.event.common.deletion.PipeDeleteDataNodeEvent; @@ -115,6 +114,10 @@ public class IoTConsensusV2AsyncSink extends IoTDBSink implements ConsensusPipeS private IoTConsensusV2SyncSink retryConnector; private IClientManager asyncTransferClientManager; private 
IoTConsensusV2AsyncBatchReqBuilder tabletBatchBuilder; + // Track the highest replicate index that actually enters this connector instead of the + // source-side pre-assigned index, otherwise discarded realtime TsFile events can create + // phantom sync lag. + private volatile long leaderReplicateProgress = 0; private volatile long currentReplicateProgress = 0; @Override @@ -196,6 +199,8 @@ private boolean addEvent2Buffer(EnrichedEvent event) { iotConsensusV2SinkMetrics.recordConnectorEnqueueTimer(duration); // add reference if (result) { + leaderReplicateProgress = + Math.max(leaderReplicateProgress, event.getReplicateIndexForIoTV2()); event.increaseReferenceCount(IoTConsensusV2AsyncSink.class.getName()); } // if connector is closed when executing this method, need to clear this event's reference @@ -717,7 +722,7 @@ public int getRetryBufferSize() { @Override public long getLeaderReplicateProgress() { - return ReplicateProgressDataNodeManager.getReplicateIndexForIoTV2(consensusPipeName); + return leaderReplicateProgress; } @Override diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java index e2c04caedfb20..70a09529063e4 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java @@ -165,6 +165,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TShowPipeResp; import org.apache.iotdb.confignode.rpc.thrift.TShowRegionReq; import org.apache.iotdb.confignode.rpc.thrift.TShowRegionResp; +import org.apache.iotdb.confignode.rpc.thrift.TShowRepairProgressResp; import org.apache.iotdb.confignode.rpc.thrift.TShowSubscriptionReq; import org.apache.iotdb.confignode.rpc.thrift.TShowSubscriptionResp; import org.apache.iotdb.confignode.rpc.thrift.TShowTTLResp; @@ -181,6 +182,7 @@ import 
org.apache.iotdb.confignode.rpc.thrift.TSystemConfigurationResp; import org.apache.iotdb.confignode.rpc.thrift.TTestOperation; import org.apache.iotdb.confignode.rpc.thrift.TThrottleQuotaResp; +import org.apache.iotdb.confignode.rpc.thrift.TTriggerRegionConsistencyRepairReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsetSchemaTemplateReq; import org.apache.iotdb.confignode.rpc.thrift.TUnsubscribeReq; import org.apache.iotdb.db.conf.IoTDBConfig; @@ -827,6 +829,12 @@ public TSStatus stopRepairData() throws TException { () -> client.stopRepairData(), status -> !updateConfigNodeLeader(status)); } + @Override + public TSStatus triggerRegionConsistencyRepair(TTriggerRegionConsistencyRepairReq req) + throws TException { + throw new UnsupportedOperationException(UNSUPPORTED_INVOCATION); + } + @Override public TSStatus submitLoadConfigurationTask() throws TException { return executeRemoteCallWithRetry( @@ -910,6 +918,12 @@ public TShowConfigNodes4InformationSchemaResp showConfigNodes4InformationSchema( resp -> !updateConfigNodeLeader(resp.status)); } + @Override + public TShowRepairProgressResp showRepairProgress() throws TException { + return executeRemoteCallWithRetry( + () -> client.showRepairProgress(), resp -> !updateConfigNodeLeader(resp.status)); + } + @Override public TShowDatabaseResp showDatabase(TGetDatabaseReq req) throws TException { return executeRemoteCallWithRetry( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 42929be741819..6104664c39c28 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -197,6 +197,7 @@ import 
org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionScheduleTaskManager; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionTaskManager; import org.apache.iotdb.db.storageengine.dataregion.compaction.settle.SettleRequestHandler; +import org.apache.iotdb.db.storageengine.dataregion.consistency.DataRegionConsistencyRepairService; import org.apache.iotdb.db.storageengine.dataregion.flush.CompressionRatio; import org.apache.iotdb.db.storageengine.dataregion.modification.DeletionPredicate; import org.apache.iotdb.db.storageengine.dataregion.modification.IDPredicate; @@ -216,6 +217,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TAlterEncodingCompressorReq; import org.apache.iotdb.mpp.rpc.thrift.TAlterTimeSeriesReq; import org.apache.iotdb.mpp.rpc.thrift.TAlterViewReq; +import org.apache.iotdb.mpp.rpc.thrift.TApplyLogicalRepairBatchReq; import org.apache.iotdb.mpp.rpc.thrift.TAttributeUpdateReq; import org.apache.iotdb.mpp.rpc.thrift.TAuditLogReq; import org.apache.iotdb.mpp.rpc.thrift.TCancelFragmentInstanceReq; @@ -241,6 +243,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TDataNodeHeartbeatReq; import org.apache.iotdb.mpp.rpc.thrift.TDataNodeHeartbeatResp; import org.apache.iotdb.mpp.rpc.thrift.TDeactivateTemplateReq; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffResp; import org.apache.iotdb.mpp.rpc.thrift.TDeleteColumnDataReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataForDeleteSchemaReq; import org.apache.iotdb.mpp.rpc.thrift.TDeleteDataOrDevicesForDropTableReq; @@ -251,15 +255,22 @@ import org.apache.iotdb.mpp.rpc.thrift.TDropFunctionInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TDropPipePluginInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TDropTriggerInstanceReq; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffResp; import 
org.apache.iotdb.mpp.rpc.thrift.TExecuteCQ; import org.apache.iotdb.mpp.rpc.thrift.TFetchFragmentInstanceInfoReq; import org.apache.iotdb.mpp.rpc.thrift.TFetchFragmentInstanceStatisticsReq; import org.apache.iotdb.mpp.rpc.thrift.TFetchFragmentInstanceStatisticsResp; import org.apache.iotdb.mpp.rpc.thrift.TFetchSchemaBlackListReq; import org.apache.iotdb.mpp.rpc.thrift.TFetchSchemaBlackListResp; +import org.apache.iotdb.mpp.rpc.thrift.TFinishLogicalRepairSessionReq; import org.apache.iotdb.mpp.rpc.thrift.TFireTriggerReq; import org.apache.iotdb.mpp.rpc.thrift.TFireTriggerResp; import org.apache.iotdb.mpp.rpc.thrift.TFragmentInstanceInfoResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeResp; import org.apache.iotdb.mpp.rpc.thrift.TInactiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateColumnCacheReq; @@ -301,6 +312,8 @@ import org.apache.iotdb.mpp.rpc.thrift.TSendFragmentInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TSendFragmentInstanceResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeResp; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairReq; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairResp; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceDeletionWithPatternAndFilterReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceDeletionWithPatternOrModReq; import org.apache.iotdb.mpp.rpc.thrift.TTableDeviceInvalidateCacheReq; @@ -384,6 +397,8 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface private final SchemaEngine schemaEngine = SchemaEngine.getInstance(); private final StorageEngine storageEngine = StorageEngine.getInstance(); + private final DataRegionConsistencyRepairService 
consistencyRepairService = + new DataRegionConsistencyRepairService(); private final TableDeviceSchemaCache tableDeviceSchemaCache = TableDeviceSchemaCache.getInstance(); @@ -590,6 +605,42 @@ public TLoadResp sendLoadCommand(TLoadCommandReq req) { timePartitionProgressIndexMap)); } + @Override + public TGetConsistencyEligibilityResp getConsistencyEligibility( + TGetConsistencyEligibilityReq req) { + return consistencyRepairService.getConsistencyEligibility(req); + } + + @Override + public TGetSnapshotSubtreeResp getSnapshotSubtree(TGetSnapshotSubtreeReq req) { + return consistencyRepairService.getSnapshotSubtree(req); + } + + @Override + public TEstimateLeafDiffResp estimateLeafDiff(TEstimateLeafDiffReq req) { + return consistencyRepairService.estimateLeafDiff(req); + } + + @Override + public TDecodeLeafDiffResp decodeLeafDiff(TDecodeLeafDiffReq req) { + return consistencyRepairService.decodeLeafDiff(req); + } + + @Override + public TStreamLogicalRepairResp streamLogicalRepair(TStreamLogicalRepairReq req) { + return consistencyRepairService.streamLogicalRepair(req); + } + + @Override + public TSStatus applyLogicalRepairBatch(TApplyLogicalRepairBatchReq req) { + return consistencyRepairService.applyLogicalRepairBatch(req); + } + + @Override + public TSStatus finishLogicalRepairSession(TFinishLogicalRepairSessionReq req) { + return consistencyRepairService.finishLogicalRepairSession(req); + } + @Override public TSStatus updateAttribute(final TAttributeUpdateReq req) { TableDeviceSchemaFetcher.getInstance().getAttributeGuard().handleAttributeUpdate(req); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/operator/source/relational/InformationSchemaContentSupplierFactory.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/operator/source/relational/InformationSchemaContentSupplierFactory.java index 8774b670e7a31..dff7027a9cfcc 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/operator/source/relational/InformationSchemaContentSupplierFactory.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/execution/operator/source/relational/InformationSchemaContentSupplierFactory.java @@ -53,6 +53,7 @@ import org.apache.iotdb.confignode.rpc.thrift.TGetUdfTableReq; import org.apache.iotdb.confignode.rpc.thrift.TNodeVersionInfo; import org.apache.iotdb.confignode.rpc.thrift.TRegionInfo; +import org.apache.iotdb.confignode.rpc.thrift.TRepairProgressInfo; import org.apache.iotdb.confignode.rpc.thrift.TShowClusterResp; import org.apache.iotdb.confignode.rpc.thrift.TShowPipeInfo; import org.apache.iotdb.confignode.rpc.thrift.TShowPipeReq; @@ -153,6 +154,8 @@ public static Iterator getSupplier( return new ColumnSupplier(dataTypes, userEntity); case InformationSchema.REGIONS: return new RegionSupplier(dataTypes, userEntity); + case InformationSchema.REPAIR_PROGRESS: + return new RepairProgressSupplier(dataTypes, userEntity); case InformationSchema.PIPES: return new PipeSupplier(dataTypes, userEntity.getUsername()); case InformationSchema.PIPE_PLUGINS: @@ -561,6 +564,67 @@ public boolean hasNext() { } } + private static class RepairProgressSupplier extends TsBlockSupplier { + private final Iterator iterator; + + private RepairProgressSupplier(final List dataTypes, final UserEntity userEntity) + throws Exception { + super(dataTypes); + accessControl.checkUserGlobalSysPrivilege(userEntity); + try (final ConfigNodeClient client = + ConfigNodeClientManager.getInstance().borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) { + final org.apache.iotdb.confignode.rpc.thrift.TShowRepairProgressResp resp = + client.showRepairProgress(); + iterator = + resp == null || !resp.isSetRepairProgressInfoList() + ? 
Collections.emptyList().iterator() + : resp.getRepairProgressInfoListIterator(); + } + } + + @Override + protected void constructLine() { + final TRepairProgressInfo progressInfo = iterator.next(); + columnBuilders[0].writeInt(progressInfo.getRegionId()); + columnBuilders[1].writeLong(progressInfo.getTimePartition()); + columnBuilders[2].writeBinary(BytesUtils.valueOf(progressInfo.getCheckState())); + columnBuilders[3].writeBinary(BytesUtils.valueOf(progressInfo.getRepairState())); + columnBuilders[4].writeLong(progressInfo.getLastCheckedAt()); + columnBuilders[5].writeLong(progressInfo.getLastSafeWatermark()); + columnBuilders[6].writeLong(progressInfo.getPartitionMutationEpoch()); + columnBuilders[7].writeLong(progressInfo.getSnapshotEpoch()); + columnBuilders[8].writeBinary(BytesUtils.valueOf(progressInfo.getSnapshotState())); + columnBuilders[9].writeLong(progressInfo.getLastMismatchAt()); + if (progressInfo.isSetMismatchScopeRef()) { + columnBuilders[10].writeBinary(BytesUtils.valueOf(progressInfo.getMismatchScopeRef())); + } else { + columnBuilders[10].appendNull(); + } + columnBuilders[11].writeInt(progressInfo.getMismatchLeafCount()); + if (progressInfo.isSetRepairEpoch()) { + columnBuilders[12].writeBinary(BytesUtils.valueOf(progressInfo.getRepairEpoch())); + } else { + columnBuilders[12].appendNull(); + } + if (progressInfo.isSetLastErrorCode()) { + columnBuilders[13].writeBinary(BytesUtils.valueOf(progressInfo.getLastErrorCode())); + } else { + columnBuilders[13].appendNull(); + } + if (progressInfo.isSetLastErrorMessage()) { + columnBuilders[14].writeBinary(BytesUtils.valueOf(progressInfo.getLastErrorMessage())); + } else { + columnBuilders[14].appendNull(); + } + resultBuilder.declarePosition(); + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + } + private static class PipeSupplier extends TsBlockSupplier { private final Iterator iterator; diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactory.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactory.java index 8e73872f5ba8f..93e3be62357d4 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactory.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactory.java @@ -93,6 +93,7 @@ public List getDataNodeLocations(final String tableName) { case InformationSchema.TABLES: case InformationSchema.COLUMNS: case InformationSchema.REGIONS: + case InformationSchema.REPAIR_PROGRESS: case InformationSchema.PIPES: case InformationSchema.PIPE_PLUGINS: case InformationSchema.TOPICS: diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java index e00a17ad854cb..88189609f30db 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/DataRegion.java @@ -109,6 +109,7 @@ import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionScheduleTaskManager; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionScheduler; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionTaskManager; +import org.apache.iotdb.db.storageengine.dataregion.consistency.DataRegionConsistencyManager; import org.apache.iotdb.db.storageengine.dataregion.flush.CloseFileListener; import org.apache.iotdb.db.storageengine.dataregion.flush.FlushListener; import 
org.apache.iotdb.db.storageengine.dataregion.flush.FlushStatus; @@ -1232,6 +1233,7 @@ public void insert(InsertRowNode insertRowNode) throws WriteProcessException { PERFORMANCE_OVERVIEW_METRICS.recordScheduleUpdateLastCacheCost( System.nanoTime() - startTime); } + markConsistencyDirtyForInsert(insertRowNode.getTime()); } finally { writeUnlock(); } @@ -1389,6 +1391,7 @@ public void insertTablet(InsertTabletNode insertTabletNode) // infoForMetrics[4]: InsertedPointsNumber boolean noFailure = executeInsertTablet(insertTabletNode, results, infoForMetrics); updateTsFileProcessorMetric(insertTabletNode, infoForMetrics); + markConsistencyDirtyForTablet(insertTabletNode.getTimes()); if (!noFailure) { throw new BatchProcessException(results); @@ -2873,6 +2876,10 @@ public void deleteByDevice(final MeasurementPath pattern, final DeleteDataNode n if (deletionResource != null && deletionResource.waitForResult() == Status.FAILURE) { throw deletionResource.getCause(); } + DataRegionConsistencyManager.getInstance() + .onDeletion(dataRegionId.convertToTConsensusGroupId(), sealedTsFileResource); + DataRegionConsistencyManager.getInstance() + .onDeletion(dataRegionId.convertToTConsensusGroupId(), startTime, endTime); writeUnlock(); hasReleasedLock = true; @@ -2977,6 +2984,17 @@ public void deleteByTable(RelationalDeleteDataNode node) throws IOException { if (deletionResource != null && deletionResource.waitForResult() == Status.FAILURE) { throw deletionResource.getCause(); } + for (TableDeletionEntry modEntry : modEntries) { + DataRegionConsistencyManager.getInstance() + .onDeletion( + dataRegionId.convertToTConsensusGroupId(), + modEntry.getStartTime(), + modEntry.getEndTime()); + } + for (List sealedTsFileResources : sealedTsFileResourceLists) { + DataRegionConsistencyManager.getInstance() + .onDeletion(dataRegionId.convertToTConsensusGroupId(), sealedTsFileResources); + } writeUnlock(); hasReleasedLock = true; @@ -3035,6 +3053,10 @@ public void deleteDataDirectly(MeasurementPath 
pathToDelete, DeleteDataNode node if (deletionResource != null && deletionResource.waitForResult() == Status.FAILURE) { throw deletionResource.getCause(); } + DataRegionConsistencyManager.getInstance() + .onDeletion(dataRegionId.convertToTConsensusGroupId(), sealedTsFileResource); + DataRegionConsistencyManager.getInstance() + .onDeletion(dataRegionId.convertToTConsensusGroupId(), startTime, endTime); writeUnlock(); releasedLock = true; deleteDataDirectlyInFile(sealedTsFileResource, deletion); @@ -3978,6 +4000,8 @@ public void loadNewTsFile( } onTsFileLoaded(newTsFileResource, isFromConsensus, lastReader); + DataRegionConsistencyManager.getInstance() + .onPartitionMutation(dataRegionId.convertToTConsensusGroupId(), newFilePartitionId); logger.info("TsFile {} is successfully loaded in unsequence list.", newFileName); } catch (final DiskSpaceInsufficientException e) { logger.error( @@ -4622,6 +4646,7 @@ public void insert(InsertRowsNode insertRowsNode) if (!insertRowsNode.getResults().isEmpty()) { throw new BatchProcessException("Partial failed inserting rows"); } + markConsistencyDirtyForRows(timePartitionIds); } finally { writeUnlock(); } @@ -4686,6 +4711,7 @@ public void insertTablets(InsertMultiTabletsNode insertMultiTabletsNode) } insertMultiTabletsNode.getResults().put(i, firstStatus); } + markConsistencyDirtyForTablet(insertTabletNode.getTimes()); } updateTsFileProcessorMetric(insertMultiTabletsNode, infoForMetrics); @@ -4981,4 +5007,38 @@ private long getTTL(InsertNode insertNode) { return DataNodeTTLCache.getInstance().getTTLForTable(databaseName, insertNode.getTableName()); } } + + private void markConsistencyDirtyForInsert(long time) { + DataRegionConsistencyManager.getInstance() + .onPartitionMutation( + dataRegionId.convertToTConsensusGroupId(), TimePartitionUtils.getTimePartitionId(time)); + } + + private void markConsistencyDirtyForTablet(long[] times) { + if (times == null || times.length == 0) { + return; + } + Set partitions = new HashSet<>(); + for 
(long time : times) { + partitions.add(TimePartitionUtils.getTimePartitionId(time)); + } + for (Long partitionId : partitions) { + DataRegionConsistencyManager.getInstance() + .onPartitionMutation(dataRegionId.convertToTConsensusGroupId(), partitionId); + } + } + + private void markConsistencyDirtyForRows(long[] timePartitionIds) { + if (timePartitionIds == null || timePartitionIds.length == 0) { + return; + } + Set partitions = new HashSet<>(); + for (long partitionId : timePartitionIds) { + partitions.add(partitionId); + } + for (Long partitionId : partitions) { + DataRegionConsistencyManager.getInstance() + .onPartitionMutation(dataRegionId.convertToTConsensusGroupId(), partitionId); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyManager.java new file mode 100644 index 0000000000000..74a6ff03eb824 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyManager.java @@ -0,0 +1,1884 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.commons.consensus.iotv2.consistency.DualDigest; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.commons.path.AlignedFullPath; +import org.apache.iotdb.commons.path.IFullPath; +import org.apache.iotdb.commons.path.MeasurementPath; +import org.apache.iotdb.commons.path.NonAlignedFullPath; +import org.apache.iotdb.commons.path.PartialPath; +import org.apache.iotdb.db.queryengine.execution.fragment.FragmentInstanceContext; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeId; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.DeleteDataNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.storageengine.dataregion.compaction.execute.performer.impl.ReadPointCompactionPerformer; +import org.apache.iotdb.db.storageengine.dataregion.compaction.execute.utils.MultiTsFileDeviceIterator; +import org.apache.iotdb.db.storageengine.dataregion.compaction.execute.utils.reader.IDataBlockReader; +import org.apache.iotdb.db.storageengine.dataregion.memtable.AlignedWritableMemChunkGroup; +import org.apache.iotdb.db.storageengine.dataregion.memtable.IMemTable; +import org.apache.iotdb.db.storageengine.dataregion.memtable.IWritableMemChunk; +import org.apache.iotdb.db.storageengine.dataregion.memtable.IWritableMemChunkGroup; +import org.apache.iotdb.db.storageengine.dataregion.memtable.TsFileProcessor; +import org.apache.iotdb.db.storageengine.dataregion.read.QueryDataSource; +import 
org.apache.iotdb.db.storageengine.dataregion.read.control.FileReaderManager; +import org.apache.iotdb.db.storageengine.dataregion.read.control.QueryResourceManager; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import org.apache.iotdb.mpp.rpc.thrift.TConsistencyDeletionSummary; + +import org.apache.tsfile.block.column.Column; +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.IDeviceID; +import org.apache.tsfile.read.common.block.TsBlock; +import org.apache.tsfile.utils.Binary; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.ClosedChannelException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.concurrent.Callable; +import java.util.concurrent.ConcurrentHashMap; +import java.util.stream.Collectors; + +/** + * DataNode-side logical snapshot manager. The snapshot is built from the merged logical visible + * view instead of TsFile correspondence, so physical file layout heterogeneity does not affect + * consistency results. 
+ */ +public class DataRegionConsistencyManager { + + private static final Logger LOGGER = LoggerFactory.getLogger(DataRegionConsistencyManager.class); + private static final DataRegionConsistencyManager INSTANCE = new DataRegionConsistencyManager(); + + private static final int DEVICE_SHARD_COUNT = 256; + private static final long LEAF_TIME_BUCKET_MS = 3_600_000L; + private static final String TREE_KIND_LIVE = "LIVE"; + private static final String TREE_KIND_TOMBSTONE = "TOMBSTONE"; + private static final String BATCH_KIND_RESET_LEAF = "RESET_LEAF"; + private static final String BATCH_KIND_RESET_SCOPE = "RESET_SCOPE"; + private static final String BATCH_KIND_INSERT_ROWS = "INSERT_ROWS"; + private static final String BATCH_KIND_DELETE_DATA = "DELETE_DATA"; + private static final int SNAPSHOT_REBUILD_MAX_ATTEMPTS = 2; + + private final ConcurrentHashMap regionStates = new ConcurrentHashMap<>(); + private final ThreadLocal repairMutationContext = new ThreadLocal<>(); + private final LogicalConsistencyPartitionStateStore partitionStateStore; + + DataRegionConsistencyManager() { + this(new LogicalConsistencyPartitionStateStore()); + } + + DataRegionConsistencyManager(LogicalConsistencyPartitionStateStore partitionStateStore) { + this.partitionStateStore = partitionStateStore; + } + + public static DataRegionConsistencyManager getInstance() { + return INSTANCE; + } + + public PartitionInspection inspectPartition( + TConsensusGroupId consensusGroupId, + DataRegion dataRegion, + long partitionId, + List partitionDeletionSummaries) { + String consensusGroupKey = consensusGroupId.toString(); + RegionState regionState = getOrCreateRegionState(consensusGroupId); + PartitionState partitionState = + regionState.partitions.computeIfAbsent(partitionId, PartitionState::new); + PartitionInspection inspection; + synchronized (partitionState) { + if (partitionState.snapshotState != RepairProgressTable.SnapshotState.READY + || partitionState.snapshotEpoch != 
partitionState.partitionMutationEpoch) { + try { + rebuildSnapshot(partitionState, dataRegion, partitionId, partitionDeletionSummaries); + } catch (Exception e) { + partitionState.snapshotState = RepairProgressTable.SnapshotState.FAILED; + partitionState.lastError = e.getMessage(); + LOGGER.warn( + "Failed to build logical consistency snapshot for region {} partition {}", + consensusGroupId, + partitionId, + e); + } + } + inspection = partitionState.toInspection(partitionId); + } + persistKnownPartitionsIfNeeded(consensusGroupKey, regionState); + return inspection; + } + + public List getKnownPartitions(TConsensusGroupId consensusGroupId) { + List partitions = + new ArrayList<>(getOrCreateRegionState(consensusGroupId).partitions.keySet()); + partitions.sort(Long::compareTo); + return partitions; + } + + public SnapshotSubtreeResult getSnapshotSubtree( + TConsensusGroupId consensusGroupId, + DataRegion dataRegion, + long partitionId, + long snapshotEpoch, + String treeKind, + List nodeHandles, + List partitionDeletionSummaries) { + PartitionState partitionState = + getOrCreateRegionState(consensusGroupId) + .partitions + .computeIfAbsent(partitionId, PartitionState::new); + synchronized (partitionState) { + if (!ensureReadySnapshot( + partitionState, dataRegion, partitionId, snapshotEpoch, partitionDeletionSummaries)) { + return SnapshotSubtreeResult.stale(snapshotEpoch); + } + SnapshotTree tree = partitionState.getTree(treeKind); + List resultNodes = new ArrayList<>(); + List requestedHandles = + nodeHandles == null || nodeHandles.isEmpty() + ? 
Collections.singletonList(SnapshotTree.ROOT_HANDLE) + : nodeHandles; + for (String handle : requestedHandles) { + SnapshotNode node = tree.nodesByHandle.get(handle); + if (node == null) { + continue; + } + if (node.leaf) { + resultNodes.add(node); + continue; + } + for (String childHandle : node.childrenHandles) { + SnapshotNode child = tree.nodesByHandle.get(childHandle); + if (child != null) { + resultNodes.add(child); + } + } + } + resultNodes.sort(Comparator.comparing(SnapshotNode::getNodeHandle)); + return SnapshotSubtreeResult.ready(snapshotEpoch, resultNodes); + } + } + + public LeafEstimate estimateLeaf( + TConsensusGroupId consensusGroupId, + DataRegion dataRegion, + long partitionId, + long snapshotEpoch, + String treeKind, + String leafId, + List partitionDeletionSummaries) { + PartitionState partitionState = + getOrCreateRegionState(consensusGroupId) + .partitions + .computeIfAbsent(partitionId, PartitionState::new); + synchronized (partitionState) { + if (!ensureReadySnapshot( + partitionState, dataRegion, partitionId, snapshotEpoch, partitionDeletionSummaries)) { + return null; + } + SnapshotNode leaf = partitionState.getTree(treeKind).nodesByHandle.get(leafId); + if (leaf == null || !leaf.leaf) { + return null; + } + return new LeafEstimate( + partitionId, + snapshotEpoch, + treeKind, + leafId, + leaf.itemCount, + TREE_KIND_TOMBSTONE.equalsIgnoreCase(treeKind) ? 
leaf.itemCount : 0L, + leaf.itemCount, + leaf.keyRangeStart, + leaf.keyRangeEnd); + } + } + + public List decodeLeaf( + TConsensusGroupId consensusGroupId, + DataRegion dataRegion, + long partitionId, + long snapshotEpoch, + String treeKind, + String leafId, + List partitionDeletionSummaries) + throws Exception { + PartitionState partitionState = + getOrCreateRegionState(consensusGroupId) + .partitions + .computeIfAbsent(partitionId, PartitionState::new); + synchronized (partitionState) { + if (!ensureReadySnapshot( + partitionState, dataRegion, partitionId, snapshotEpoch, partitionDeletionSummaries)) { + return null; + } + } + + LogicalLeafSelector selector = LogicalLeafSelector.parse(leafId); + List entries = new ArrayList<>(); + if (TREE_KIND_TOMBSTONE.equalsIgnoreCase(treeKind)) { + for (TConsistencyDeletionSummary summary : partitionDeletionSummaries) { + if (selector.matches(summary)) { + entries.add(new LeafDiffEntry(encodeDeletionKey(summary), "LOCAL")); + } + } + } else { + scanLiveCells( + dataRegion, + partitionId, + (deviceId, measurement, type, time, value, aligned) -> { + if (selector.matches(deviceId, time)) { + entries.add( + new LeafDiffEntry( + encodeLogicalCell(deviceId, measurement, time, type, value), "LOCAL")); + } + }); + } + entries.sort(Comparator.comparing(LeafDiffEntry::getLogicalKey)); + return entries; + } + + public List streamLogicalRepair( + TConsensusGroupId consensusGroupId, + DataRegion dataRegion, + long partitionId, + String repairEpoch, + List leafSelectors, + List partitionDeletionSummaries) + throws Exception { + PartitionState partitionState = + getOrCreateRegionState(consensusGroupId) + .partitions + .computeIfAbsent(partitionId, PartitionState::new); + synchronized (partitionState) { + ensureRepairEpochReadySnapshot( + partitionState, dataRegion, partitionId, repairEpoch, partitionDeletionSummaries); + } + + String sessionId = buildRepairSessionId(repairEpoch, leafSelectors); + List batches = new ArrayList<>(); + final 
int[] seqNo = {0}; + for (LeafSelector selector : leafSelectors) { + if (TREE_KIND_TOMBSTONE.equalsIgnoreCase(selector.treeKind)) { + for (TConsistencyDeletionSummary summary : partitionDeletionSummaries) { + if (!selector.selector.matches(summary)) { + continue; + } + DeleteDataNode deleteDataNode = buildDeleteDataNode(summary); + batches.add( + new LogicalRepairBatch( + sessionId, + selector.treeKind, + selector.selector.leafId, + seqNo[0]++, + BATCH_KIND_DELETE_DATA, + deleteDataNode.serializeToByteBuffer())); + } + continue; + } + + batches.add( + new LogicalRepairBatch( + sessionId, + selector.treeKind, + selector.selector.leafId, + seqNo[0]++, + selector.selector.requiresScopedReset() + ? BATCH_KIND_RESET_SCOPE + : BATCH_KIND_RESET_LEAF, + selector.selector.requiresScopedReset() + ? selector.selector.serialize() + : ByteBuffer.allocate(0))); + + List bufferedRows = new ArrayList<>(); + scanLiveCells( + dataRegion, + partitionId, + (deviceId, measurement, type, time, value, aligned) -> { + if (!selector.selector.matchesLiveCell(deviceId, measurement, type, time, value)) { + return; + } + bufferedRows.add(buildInsertRow(deviceId, measurement, type, time, value, aligned)); + if (bufferedRows.size() >= 256) { + batches.add( + new LogicalRepairBatch( + sessionId, + selector.treeKind, + selector.selector.leafId, + seqNo[0]++, + BATCH_KIND_INSERT_ROWS, + toInsertRowsNode(bufferedRows).serializeToByteBuffer())); + bufferedRows.clear(); + } + }); + if (!bufferedRows.isEmpty()) { + batches.add( + new LogicalRepairBatch( + sessionId, + selector.treeKind, + selector.selector.leafId, + seqNo[0]++, + BATCH_KIND_INSERT_ROWS, + toInsertRowsNode(bufferedRows).serializeToByteBuffer())); + } + } + return batches; + } + + public T runWithLogicalRepairMutation( + TConsensusGroupId consensusGroupId, long partitionId, String repairEpoch, Callable action) + throws Exception { + RepairMutationContext previousContext = repairMutationContext.get(); + repairMutationContext.set( + new 
RepairMutationContext(consensusGroupId.toString(), partitionId, repairEpoch)); + try { + return action.call(); + } finally { + if (previousContext == null) { + repairMutationContext.remove(); + } else { + repairMutationContext.set(previousContext); + } + } + } + + public void resetLiveLeaf(DataRegion dataRegion, long partitionId, String leafId) + throws Exception { + LogicalLeafSelector selector = LogicalLeafSelector.parse(leafId); + resetLiveBySelector(dataRegion, partitionId, selector); + } + + public void resetLiveScope(DataRegion dataRegion, long partitionId, ByteBuffer selectorPayload) + throws Exception { + LogicalLeafSelector selector = LogicalLeafSelector.deserialize(selectorPayload); + resetLiveBySelector(dataRegion, partitionId, selector); + } + + private void resetLiveBySelector( + DataRegion dataRegion, long partitionId, LogicalLeafSelector selector) throws Exception { + if (selector.requiresScopedReset()) { + resetLiveScopedCells(dataRegion, partitionId, selector); + return; + } + Set fullPaths = new LinkedHashSet<>(); + scanLiveCells( + dataRegion, + partitionId, + (deviceId, measurement, type, time, value, aligned) -> { + if (selector.matches(deviceId, time)) { + fullPaths.add(deviceId + "." 
+ measurement); + } + }); + if (fullPaths.isEmpty()) { + return; + } + List paths = new ArrayList<>(fullPaths.size()); + for (String fullPath : fullPaths) { + paths.add(new MeasurementPath(fullPath)); + } + long bucketStart = selector.bucket * LEAF_TIME_BUCKET_MS; + long bucketEnd = bucketStart + LEAF_TIME_BUCKET_MS - 1; + DeleteDataNode deleteDataNode = + new DeleteDataNode(new PlanNodeId("logical-reset-leaf"), paths, bucketStart, bucketEnd); + for (MeasurementPath path : paths) { + dataRegion.deleteByDevice(path, deleteDataNode); + } + } + + private void resetLiveScopedCells( + DataRegion dataRegion, long partitionId, LogicalLeafSelector selector) throws Exception { + Map> timesByPath = new HashMap<>(); + scanLiveCells( + dataRegion, + partitionId, + (deviceId, measurement, type, time, value, aligned) -> { + if (!selector.matchesLiveCell(deviceId, measurement, type, time, value)) { + return; + } + timesByPath + .computeIfAbsent(deviceId + "." + measurement, ignored -> new LinkedHashSet<>()) + .add(time); + }); + for (Map.Entry> entry : timesByPath.entrySet()) { + MeasurementPath path = new MeasurementPath(entry.getKey()); + for (Long time : entry.getValue()) { + DeleteDataNode deleteDataNode = + new DeleteDataNode( + new PlanNodeId("logical-reset-scope"), Collections.singletonList(path), time, time); + dataRegion.deleteByDevice(path, deleteDataNode); + } + } + } + + public void onTsFileClosed(TConsensusGroupId consensusGroupId, TsFileResource tsFileResource) { + // Logical snapshots are driven by mutation epoch, not close-file events. + } + + public void onCompaction( + TConsensusGroupId consensusGroupId, + List seqSourceFiles, + List unseqSourceFiles, + List targetFiles, + long timePartition) { + // Compaction only changes physical layout and must not dirty logical snapshots. 
+ } + + public void onDeletion(TConsensusGroupId consensusGroupId, List affectedTsFiles) { + if (affectedTsFiles == null) { + return; + } + for (TsFileResource resource : affectedTsFiles) { + onPartitionMutation(consensusGroupId, resource.getTimePartition()); + } + } + + public void onDeletion(TConsensusGroupId consensusGroupId, long startTime, long endTime) { + long partitionInterval = + org.apache.iotdb.commons.conf.CommonDescriptor.getInstance() + .getConfig() + .getTimePartitionInterval(); + if (partitionInterval <= 0) { + onPartitionMutation(consensusGroupId, 0L); + return; + } + long startPartition = Math.floorDiv(startTime, partitionInterval); + long endPartition = Math.floorDiv(endTime, partitionInterval); + for (long partitionId = startPartition; partitionId <= endPartition; partitionId++) { + onPartitionMutation(consensusGroupId, partitionId); + } + } + + public void onPartitionMutation(TConsensusGroupId consensusGroupId, long partitionId) { + String consensusGroupKey = consensusGroupId.toString(); + RegionState regionState = getOrCreateRegionState(consensusGroupId); + PartitionState newState = new PartitionState(partitionId); + PartitionState existingState = regionState.partitions.putIfAbsent(partitionId, newState); + PartitionState state = existingState == null ? 
newState : existingState; + synchronized (state) { + if (isRepairMutation(consensusGroupId, partitionId)) { + state.snapshotState = RepairProgressTable.SnapshotState.DIRTY; + state.liveTree = SnapshotTree.empty(); + state.tombstoneTree = SnapshotTree.empty(); + state.lastError = null; + return; + } + state.partitionMutationEpoch++; + state.snapshotState = RepairProgressTable.SnapshotState.DIRTY; + state.liveTree = SnapshotTree.empty(); + state.tombstoneTree = SnapshotTree.empty(); + state.lastError = null; + } + if (existingState == null) { + persistKnownPartitionsIfNeeded(consensusGroupKey, regionState); + } + } + + private RegionState getOrCreateRegionState(TConsensusGroupId consensusGroupId) { + return regionStates.computeIfAbsent( + consensusGroupId.toString(), ignored -> loadRegionState(consensusGroupId.toString())); + } + + private RegionState loadRegionState(String consensusGroupKey) { + RegionState regionState = new RegionState(); + try { + Map persistedMutationEpochs = partitionStateStore.load(consensusGroupKey); + for (Map.Entry entry : persistedMutationEpochs.entrySet()) { + PartitionState partitionState = new PartitionState(entry.getKey()); + partitionState.partitionMutationEpoch = entry.getValue(); + regionState.partitions.put(entry.getKey(), partitionState); + } + regionState.lastPersistedMutationEpochs = new TreeMap<>(persistedMutationEpochs); + } catch (IOException e) { + LOGGER.warn( + "Failed to restore logical consistency partition state for region {}", + consensusGroupKey, + e); + } + return regionState; + } + + private void persistKnownPartitionsIfNeeded(String consensusGroupKey, RegionState regionState) { + Map snapshotToPersist = snapshotPartitionMutationEpochs(regionState); + synchronized (regionState.persistMonitor) { + if (snapshotToPersist.equals(regionState.lastPersistedMutationEpochs) + || snapshotToPersist.equals(regionState.pendingPersistMutationEpochs)) { + return; + } + if (regionState.persistInFlight) { + 
regionState.pendingPersistMutationEpochs = snapshotToPersist; + return; + } + regionState.persistInFlight = true; + } + + while (true) { + boolean persistSucceeded = false; + try { + partitionStateStore.persist(consensusGroupKey, snapshotToPersist); + persistSucceeded = true; + } catch (IOException e) { + LOGGER.warn( + "Failed to persist logical consistency partition state for region {}", + consensusGroupKey, + e); + } + + synchronized (regionState.persistMonitor) { + if (persistSucceeded) { + regionState.lastPersistedMutationEpochs = snapshotToPersist; + } + if (regionState.pendingPersistMutationEpochs == null + || regionState.pendingPersistMutationEpochs.equals( + regionState.lastPersistedMutationEpochs) + || (!persistSucceeded + && regionState.pendingPersistMutationEpochs.equals(snapshotToPersist))) { + regionState.pendingPersistMutationEpochs = null; + regionState.persistInFlight = false; + return; + } + snapshotToPersist = regionState.pendingPersistMutationEpochs; + regionState.pendingPersistMutationEpochs = null; + } + } + } + + private Map snapshotPartitionMutationEpochs(RegionState regionState) { + Map mutationEpochs = new TreeMap<>(); + for (Map.Entry entry : regionState.partitions.entrySet()) { + synchronized (entry.getValue()) { + mutationEpochs.put(entry.getKey(), entry.getValue().partitionMutationEpoch); + } + } + return mutationEpochs; + } + + private boolean ensureReadySnapshot( + PartitionState partitionState, + DataRegion dataRegion, + long partitionId, + long snapshotEpoch, + List partitionDeletionSummaries) { + if (partitionState.snapshotEpoch == snapshotEpoch + && partitionState.snapshotState == RepairProgressTable.SnapshotState.READY) { + return true; + } + if (partitionState.partitionMutationEpoch != snapshotEpoch) { + return false; + } + try { + rebuildSnapshot(partitionState, dataRegion, partitionId, partitionDeletionSummaries); + return partitionState.snapshotEpoch == snapshotEpoch + && partitionState.snapshotState == 
RepairProgressTable.SnapshotState.READY; + } catch (Exception e) { + partitionState.snapshotState = RepairProgressTable.SnapshotState.FAILED; + partitionState.lastError = e.getMessage(); + return false; + } + } + + private void rebuildSnapshot( + PartitionState partitionState, + DataRegion dataRegion, + long partitionId, + List partitionDeletionSummaries) + throws Exception { + long expectedMutationEpoch = partitionState.partitionMutationEpoch; + partitionState.snapshotState = RepairProgressTable.SnapshotState.BUILDING; + + for (int attempt = 1; attempt <= SNAPSHOT_REBUILD_MAX_ATTEMPTS; attempt++) { + SnapshotTree liveTree = SnapshotTree.empty(); + SnapshotTree tombstoneTree = SnapshotTree.empty(); + + try { + scanLiveCells( + dataRegion, + partitionId, + (deviceId, measurement, type, time, value, aligned) -> { + int shard = computeDeviceShard(deviceId); + long bucket = Math.floorDiv(time, LEAF_TIME_BUCKET_MS); + String leafId = LogicalLeafSelector.leafId(shard, bucket); + String logicalKey = encodeLogicalCell(deviceId, measurement, time, type, value); + long hash = hashLogicalCell(deviceId, measurement, time, type, value); + liveTree.addLeafEntry(leafId, shard, logicalKey, hash, 1L); + }); + } catch (Exception e) { + if (attempt >= SNAPSHOT_REBUILD_MAX_ATTEMPTS || !isRetryableSnapshotReadFailure(e)) { + throw e; + } + invalidateSnapshotReaders(dataRegion, partitionId); + LOGGER.info( + "Retrying logical consistency snapshot build for partition {} after refreshing stale readers", + partitionId); + continue; + } + + for (TConsistencyDeletionSummary summary : partitionDeletionSummaries) { + int shard = computeDeviceShard(summary.getPathPattern()); + long startBucket = Math.floorDiv(summary.getTimeRangeStart(), LEAF_TIME_BUCKET_MS); + long endBucket = Math.floorDiv(summary.getTimeRangeEnd(), LEAF_TIME_BUCKET_MS); + String deletionKey = encodeDeletionKey(summary); + for (long bucket = startBucket; bucket <= endBucket; bucket++) { + String leafId = 
LogicalLeafSelector.leafId(shard, bucket); + tombstoneTree.addLeafEntry(leafId, shard, deletionKey, hashDeletion(summary), 1L); + } + } + + liveTree.finalizeTree(); + tombstoneTree.finalizeTree(); + + if (expectedMutationEpoch != partitionState.partitionMutationEpoch) { + partitionState.snapshotState = RepairProgressTable.SnapshotState.DIRTY; + return; + } + partitionState.snapshotEpoch = expectedMutationEpoch; + partitionState.snapshotState = RepairProgressTable.SnapshotState.READY; + partitionState.liveTree = liveTree; + partitionState.tombstoneTree = tombstoneTree; + partitionState.lastError = null; + return; + } + + throw new IllegalStateException( + "Logical consistency snapshot rebuild finished without a terminal result"); + } + + private void ensureRepairEpochReadySnapshot( + PartitionState partitionState, + DataRegion dataRegion, + long partitionId, + String repairEpoch, + List partitionDeletionSummaries) + throws Exception { + RepairEpochRef repairEpochRef = RepairEpochRef.parse(repairEpoch); + if (repairEpochRef.partitionId != partitionId) { + throw new IllegalStateException( + "Repair epoch partition " + + repairEpochRef.partitionId + + " does not match requested partition " + + partitionId); + } + if (partitionState.partitionMutationEpoch != repairEpochRef.partitionMutationEpoch) { + throw new IllegalStateException( + "Repair epoch drift detected for partition " + + partitionId + + ": expected mutation epoch " + + repairEpochRef.partitionMutationEpoch + + ", actual " + + partitionState.partitionMutationEpoch); + } + if (partitionState.snapshotState != RepairProgressTable.SnapshotState.READY + || partitionState.snapshotEpoch != repairEpochRef.snapshotEpoch) { + rebuildSnapshot(partitionState, dataRegion, partitionId, partitionDeletionSummaries); + } + if (partitionState.snapshotState != RepairProgressTable.SnapshotState.READY + || partitionState.snapshotEpoch != repairEpochRef.snapshotEpoch + || partitionState.partitionMutationEpoch != 
repairEpochRef.partitionMutationEpoch) { + throw new IllegalStateException( + "Repair epoch drift detected after snapshot rebuild for partition " + partitionId); + } + } + + private boolean isRepairMutation(TConsensusGroupId consensusGroupId, long partitionId) { + RepairMutationContext mutationContext = repairMutationContext.get(); + return mutationContext != null + && mutationContext.partitionId == partitionId + && mutationContext.consensusGroupKey.equals(consensusGroupId.toString()); + } + + private void scanLiveCells(DataRegion dataRegion, long partitionId, LiveCellConsumer consumer) + throws Exception { + Map deviceSeriesContexts = + collectLogicalSeriesContexts(dataRegion, partitionId); + if (deviceSeriesContexts.isEmpty()) { + return; + } + + long queryId = QueryResourceManager.getInstance().assignInternalQueryId(); + FragmentInstanceContext context = + FragmentInstanceContext.createFragmentInstanceContextForCompaction(queryId); + + try { + for (DeviceSeriesContext deviceSeriesContext : deviceSeriesContexts.values()) { + List paths = deviceSeriesContext.buildPaths(); + if (paths.isEmpty()) { + continue; + } + + QueryDataSource dataSource = + dataRegion.query( + paths, + deviceSeriesContext.deviceId, + context, + null, + Collections.singletonList(partitionId)); + if (dataSource == null || dataSource.isEmpty()) { + continue; + } + + QueryResourceManager.getInstance() + .getQueryFileManager() + .addUsedFilesForQuery(queryId, dataSource); + dataSource.fillOrderIndexes(deviceSeriesContext.deviceId, true); + String deviceId = String.valueOf(deviceSeriesContext.deviceId); + if (deviceSeriesContext.aligned) { + QueryDataSource deviceDataSource = copyQueryDataSourceForDevice(dataSource); + IDataBlockReader reader = + ReadPointCompactionPerformer.constructReader( + deviceSeriesContext.deviceId, + deviceSeriesContext.getMeasurementNames(), + deviceSeriesContext.getMeasurementSchemas(), + deviceSeriesContext.getMeasurementNames(), + context, + deviceDataSource, + true); 
+ consumeAligned( + reader, + deviceId, + deviceSeriesContext.getMeasurementNames(), + deviceSeriesContext.measurementSchemas, + consumer); + } else { + for (String measurementName : deviceSeriesContext.getMeasurementNames()) { + IMeasurementSchema schema = deviceSeriesContext.measurementSchemas.get(measurementName); + QueryDataSource deviceDataSource = copyQueryDataSourceForDevice(dataSource); + IDataBlockReader reader = + ReadPointCompactionPerformer.constructReader( + deviceSeriesContext.deviceId, + Collections.singletonList(measurementName), + Collections.singletonList(schema), + deviceSeriesContext.getMeasurementNames(), + context, + deviceDataSource, + false); + consumeNonAligned(reader, deviceId, measurementName, schema, consumer); + } + } + } + } finally { + QueryResourceManager.getInstance().endQuery(queryId); + } + } + + private void consumeAligned( + IDataBlockReader reader, + String deviceId, + List measurementNames, + Map schemaMap, + LiveCellConsumer consumer) + throws Exception { + while (reader.hasNextBatch()) { + TsBlock tsBlock = reader.nextBatch(); + for (int row = 0; row < tsBlock.getPositionCount(); row++) { + long time = tsBlock.getTimeByIndex(row); + for (int columnIndex = 0; columnIndex < tsBlock.getValueColumnCount(); columnIndex++) { + Column column = tsBlock.getColumn(columnIndex); + if (column.isNull(row)) { + continue; + } + String measurement = measurementNames.get(columnIndex); + IMeasurementSchema schema = schemaMap.get(measurement); + consumer.accept( + deviceId, + measurement, + schema.getType(), + time, + extractValue(column, row, schema.getType()), + true); + } + } + } + } + + private void consumeNonAligned( + IDataBlockReader reader, + String deviceId, + String measurement, + IMeasurementSchema schema, + LiveCellConsumer consumer) + throws Exception { + while (reader.hasNextBatch()) { + TsBlock tsBlock = reader.nextBatch(); + Column column = tsBlock.getColumn(0); + for (int row = 0; row < tsBlock.getPositionCount(); row++) { + 
if (column.isNull(row)) { + continue; + } + consumer.accept( + deviceId, + measurement, + schema.getType(), + tsBlock.getTimeByIndex(row), + extractValue(column, row, schema.getType()), + false); + } + } + } + + private Object extractValue(Column column, int rowIndex, TSDataType dataType) { + switch (dataType) { + case BOOLEAN: + return column.getBoolean(rowIndex); + case INT32: + case DATE: + return column.getInt(rowIndex); + case INT64: + case TIMESTAMP: + return column.getLong(rowIndex); + case FLOAT: + return column.getFloat(rowIndex); + case DOUBLE: + return column.getDouble(rowIndex); + case TEXT: + case STRING: + case BLOB: + return column.getBinary(rowIndex); + default: + return String.valueOf(column.getObject(rowIndex)); + } + } + + private Map collectLogicalSeriesContexts( + DataRegion dataRegion, long partitionId) throws Exception { + Map deviceSeriesContexts = new TreeMap<>(); + + List seqResources = + new ArrayList<>(dataRegion.getTsFileManager().getTsFileListSnapshot(partitionId, true)); + List unseqResources = + new ArrayList<>(dataRegion.getTsFileManager().getTsFileListSnapshot(partitionId, false)); + pruneClosedResources(seqResources); + pruneClosedResources(unseqResources); + + if (!seqResources.isEmpty() || !unseqResources.isEmpty()) { + // Use dedicated metadata readers here instead of the shared FileReaderManager-backed read + // point iterator. The snapshot rebuild later performs a real logical scan through the query + // engine, so closing the schema-discovery iterator must never poison the shared reader cache. 
+ try (MultiTsFileDeviceIterator deviceIterator = + new MultiTsFileDeviceIterator(seqResources, unseqResources, new HashMap<>())) { + while (deviceIterator.hasNextDevice()) { + org.apache.tsfile.utils.Pair deviceInfo = deviceIterator.nextDevice(); + DeviceSeriesContext deviceSeriesContext = + deviceSeriesContexts.computeIfAbsent( + String.valueOf(deviceInfo.left), + ignored -> new DeviceSeriesContext(deviceInfo.left, deviceInfo.right)); + deviceSeriesContext.mergeAlignment(deviceInfo.right); + + Map rawSchemaMap = + deviceIterator.getAllSchemasOfCurrentDevice(); + rawSchemaMap.remove("time"); + for (MeasurementSchema schema : rawSchemaMap.values()) { + deviceSeriesContext.mergeMeasurement(schema); + } + } + } + } + + collectOpenProcessorSeriesContexts( + dataRegion.getWorkSequenceTsFileProcessors(), partitionId, deviceSeriesContexts); + collectOpenProcessorSeriesContexts( + dataRegion.getWorkUnsequenceTsFileProcessors(), partitionId, deviceSeriesContexts); + return deviceSeriesContexts; + } + + private void collectOpenProcessorSeriesContexts( + Iterable processors, + long partitionId, + Map deviceSeriesContexts) + throws IOException, InterruptedException { + for (TsFileProcessor processor : processors) { + if (processor == null || processor.getTimeRangeId() != partitionId) { + continue; + } + + if (!processor.tryReadLock(1_000L)) { + throw new IOException( + "Failed to acquire logical snapshot read lock for time partition " + partitionId); + } + try { + collectMemTableSeriesContexts(processor.getWorkMemTable(), deviceSeriesContexts); + for (IMemTable flushingMemTable : processor.getFlushingMemTable()) { + collectMemTableSeriesContexts(flushingMemTable, deviceSeriesContexts); + } + } finally { + processor.readUnLock(); + } + } + } + + private void collectMemTableSeriesContexts( + IMemTable memTable, Map deviceSeriesContexts) { + if (memTable == null || memTable.getMemTableMap().isEmpty()) { + return; + } + + for (Map.Entry entry : + 
memTable.getMemTableMap().entrySet()) { + IWritableMemChunkGroup memChunkGroup = entry.getValue(); + if (memChunkGroup == null || memChunkGroup.isEmpty()) { + continue; + } + + boolean aligned = memChunkGroup instanceof AlignedWritableMemChunkGroup; + DeviceSeriesContext deviceSeriesContext = + deviceSeriesContexts.computeIfAbsent( + String.valueOf(entry.getKey()), + ignored -> new DeviceSeriesContext(entry.getKey(), aligned)); + deviceSeriesContext.mergeAlignment(aligned); + + if (aligned) { + for (IMeasurementSchema schema : + ((AlignedWritableMemChunkGroup) memChunkGroup).getAlignedMemChunk().getSchemaList()) { + deviceSeriesContext.mergeMeasurement(schema); + } + continue; + } + + for (IWritableMemChunk memChunk : memChunkGroup.getMemChunkMap().values()) { + if (memChunk == null) { + continue; + } + deviceSeriesContext.mergeMeasurement(memChunk.getSchema()); + } + } + } + + private void pruneClosedResources(List resources) { + resources.removeIf( + resource -> !resource.isClosed() || resource.isDeleted() || !resource.getTsFile().exists()); + resources.sort(TsFileResource::compareFileName); + } + + private QueryDataSource copyQueryDataSourceForDevice(QueryDataSource dataSource) { + QueryDataSource deviceDataSource = new QueryDataSource(dataSource); + deviceDataSource.setSingleDevice(true); + return deviceDataSource; + } + + private boolean isRetryableSnapshotReadFailure(Throwable throwable) { + Throwable current = throwable; + while (current != null) { + if (current instanceof ClosedChannelException) { + return true; + } + current = current.getCause(); + } + return false; + } + + private void invalidateSnapshotReaders(DataRegion dataRegion, long partitionId) { + DataRegion.operateClearCache(); + closeClosedSnapshotReaders( + dataRegion.getTsFileManager().getTsFileListSnapshot(partitionId, true)); + closeClosedSnapshotReaders( + dataRegion.getTsFileManager().getTsFileListSnapshot(partitionId, false)); + } + + private void closeClosedSnapshotReaders(List 
resources) { + if (resources == null) { + return; + } + for (TsFileResource resource : resources) { + if (resource == null || !resource.isClosed()) { + continue; + } + try { + FileReaderManager.getInstance().closeFileAndRemoveReader(resource.getTsFileID()); + } catch (IOException e) { + LOGGER.debug( + "Failed to invalidate cached reader for logical snapshot file {}", + resource.getTsFilePath(), + e); + } + } + } + + private static final class DeviceSeriesContext { + private final IDeviceID deviceId; + private boolean aligned; + private final Map measurementSchemas = new TreeMap<>(); + + private DeviceSeriesContext(IDeviceID deviceId, boolean aligned) { + this.deviceId = deviceId; + this.aligned = aligned; + } + + private void mergeAlignment(boolean newAligned) { + aligned = aligned || newAligned; + } + + private void mergeMeasurement(IMeasurementSchema schema) { + if (schema == null + || schema.getMeasurementName() == null + || schema.getMeasurementName().isEmpty()) { + return; + } + measurementSchemas.putIfAbsent(schema.getMeasurementName(), schema); + } + + private List getMeasurementNames() { + return new ArrayList<>(measurementSchemas.keySet()); + } + + private List getMeasurementSchemas() { + return new ArrayList<>(measurementSchemas.values()); + } + + private List buildPaths() { + if (measurementSchemas.isEmpty()) { + return Collections.emptyList(); + } + if (aligned) { + return Collections.singletonList( + new AlignedFullPath(deviceId, getMeasurementNames(), getMeasurementSchemas())); + } + + List paths = new ArrayList<>(measurementSchemas.size()); + for (IMeasurementSchema schema : measurementSchemas.values()) { + paths.add(new NonAlignedFullPath(deviceId, schema)); + } + return paths; + } + } + + private InsertRowNode buildInsertRow( + String deviceId, + String measurement, + TSDataType type, + long time, + Object value, + boolean aligned) { + try { + MeasurementSchema[] schemas = {new MeasurementSchema(measurement, type)}; + return new InsertRowNode( + 
new PlanNodeId("logical-repair-row"), + new PartialPath(deviceId), + aligned, + new String[] {measurement}, + new TSDataType[] {type}, + schemas, + time, + new Object[] {value}, + false); + } catch (Exception e) { + throw new IllegalStateException( + "Failed to create logical repair row for " + deviceId + "." + measurement, e); + } + } + + private InsertRowsNode toInsertRowsNode(List rows) { + List clonedRows = new ArrayList<>(rows); + List indexes = new ArrayList<>(rows.size()); + for (int i = 0; i < rows.size(); i++) { + indexes.add(i); + } + return new InsertRowsNode(new PlanNodeId("logical-repair-rows"), indexes, clonedRows); + } + + private DeleteDataNode buildDeleteDataNode(TConsistencyDeletionSummary summary) { + try { + return new DeleteDataNode( + new PlanNodeId("logical-repair-delete"), + Collections.singletonList(new MeasurementPath(summary.getPathPattern())), + summary.getTimeRangeStart(), + summary.getTimeRangeEnd()); + } catch (Exception e) { + throw new IllegalStateException( + "Failed to build logical repair delete for " + summary.getPathPattern(), e); + } + } + + private static int computeDeviceShard(String deviceId) { + return Math.floorMod(deviceId.hashCode(), DEVICE_SHARD_COUNT); + } + + private String buildRepairSessionId(String repairEpoch, List leafSelectors) { + String selectorFingerprint = + leafSelectors.stream() + .map(selector -> selector.treeKind + '@' + selector.selector.toSelectorToken()) + .sorted() + .collect(Collectors.joining(",")); + return java.util + .UUID + .nameUUIDFromBytes( + (repairEpoch + "|" + selectorFingerprint).getBytes(StandardCharsets.UTF_8)) + .toString(); + } + + private long hashLogicalCell( + String deviceId, String measurement, long time, TSDataType dataType, Object value) { + long hash = 17L; + hash = 31L * hash + deviceId.hashCode(); + hash = 31L * hash + measurement.hashCode(); + hash = 31L * hash + Long.hashCode(time); + hash = 31L * hash + dataType.ordinal(); + hash = 31L * hash + valueHash(value); + 
return hash; + } + + private int valueHash(Object value) { + if (value == null) { + return 0; + } + if (value instanceof Binary) { + return value.toString().hashCode(); + } + return value.hashCode(); + } + + private String encodeLogicalCell( + String deviceId, String measurement, long time, TSDataType dataType, Object value) { + return deviceId + + '|' + + measurement + + '|' + + time + + '|' + + dataType.name() + + '|' + + valueHash(value); + } + + private long hashDeletion(TConsistencyDeletionSummary summary) { + long hash = 19L; + hash = 31L * hash + summary.getPathPattern().hashCode(); + hash = 31L * hash + Long.hashCode(summary.getTimeRangeStart()); + hash = 31L * hash + Long.hashCode(summary.getTimeRangeEnd()); + return hash; + } + + private String encodeDeletionKey(TConsistencyDeletionSummary summary) { + return encodeDeletionKeyStatic(summary); + } + + public static class PartitionInspection { + private final long partitionId; + private final long partitionMutationEpoch; + private final long snapshotEpoch; + private final RepairProgressTable.SnapshotState snapshotState; + private final DualDigest liveRootDigest; + private final DualDigest tombstoneRootDigest; + private final String lastError; + + private PartitionInspection( + long partitionId, + long partitionMutationEpoch, + long snapshotEpoch, + RepairProgressTable.SnapshotState snapshotState, + DualDigest liveRootDigest, + DualDigest tombstoneRootDigest, + String lastError) { + this.partitionId = partitionId; + this.partitionMutationEpoch = partitionMutationEpoch; + this.snapshotEpoch = snapshotEpoch; + this.snapshotState = snapshotState; + this.liveRootDigest = liveRootDigest; + this.tombstoneRootDigest = tombstoneRootDigest; + this.lastError = lastError; + } + + public long getPartitionId() { + return partitionId; + } + + public long getPartitionMutationEpoch() { + return partitionMutationEpoch; + } + + public long getSnapshotEpoch() { + return snapshotEpoch; + } + + public 
RepairProgressTable.SnapshotState getSnapshotState() { + return snapshotState; + } + + public DualDigest getLiveRootDigest() { + return liveRootDigest; + } + + public DualDigest getTombstoneRootDigest() { + return tombstoneRootDigest; + } + + public String getLastError() { + return lastError; + } + } + + public static class SnapshotSubtreeResult { + private final long snapshotEpoch; + private final boolean stale; + private final List nodes; + + private SnapshotSubtreeResult(long snapshotEpoch, boolean stale, List nodes) { + this.snapshotEpoch = snapshotEpoch; + this.stale = stale; + this.nodes = nodes; + } + + public static SnapshotSubtreeResult ready(long snapshotEpoch, List nodes) { + return new SnapshotSubtreeResult(snapshotEpoch, false, nodes); + } + + public static SnapshotSubtreeResult stale(long snapshotEpoch) { + return new SnapshotSubtreeResult(snapshotEpoch, true, Collections.emptyList()); + } + + public long getSnapshotEpoch() { + return snapshotEpoch; + } + + public boolean isStale() { + return stale; + } + + public List getNodes() { + return nodes; + } + } + + public static class LeafEstimate { + private final long partitionId; + private final long snapshotEpoch; + private final String treeKind; + private final String leafId; + private final long rowCount; + private final long tombstoneCount; + private final long strataEstimate; + private final String keyRangeStart; + private final String keyRangeEnd; + + private LeafEstimate( + long partitionId, + long snapshotEpoch, + String treeKind, + String leafId, + long rowCount, + long tombstoneCount, + long strataEstimate, + String keyRangeStart, + String keyRangeEnd) { + this.partitionId = partitionId; + this.snapshotEpoch = snapshotEpoch; + this.treeKind = treeKind; + this.leafId = leafId; + this.rowCount = rowCount; + this.tombstoneCount = tombstoneCount; + this.strataEstimate = strataEstimate; + this.keyRangeStart = keyRangeStart; + this.keyRangeEnd = keyRangeEnd; + } + + public long getPartitionId() { + 
return partitionId; + } + + public long getSnapshotEpoch() { + return snapshotEpoch; + } + + public String getTreeKind() { + return treeKind; + } + + public String getLeafId() { + return leafId; + } + + public long getRowCount() { + return rowCount; + } + + public long getTombstoneCount() { + return tombstoneCount; + } + + public long getStrataEstimate() { + return strataEstimate; + } + + public String getKeyRangeStart() { + return keyRangeStart; + } + + public String getKeyRangeEnd() { + return keyRangeEnd; + } + } + + public static class LeafDiffEntry { + private final String logicalKey; + private final String diffType; + + private LeafDiffEntry(String logicalKey, String diffType) { + this.logicalKey = logicalKey; + this.diffType = diffType; + } + + public String getLogicalKey() { + return logicalKey; + } + + public String getDiffType() { + return diffType; + } + } + + public static class LogicalRepairBatch { + private final String sessionId; + private final String treeKind; + private final String leafId; + private final int seqNo; + private final String batchKind; + private final ByteBuffer payload; + + private LogicalRepairBatch( + String sessionId, + String treeKind, + String leafId, + int seqNo, + String batchKind, + ByteBuffer payload) { + this.sessionId = sessionId; + this.treeKind = treeKind; + this.leafId = leafId; + this.seqNo = seqNo; + this.batchKind = batchKind; + this.payload = payload; + } + + public String getSessionId() { + return sessionId; + } + + public String getTreeKind() { + return treeKind; + } + + public String getLeafId() { + return leafId; + } + + public int getSeqNo() { + return seqNo; + } + + public String getBatchKind() { + return batchKind; + } + + public ByteBuffer getPayload() { + return payload; + } + } + + public static class LeafSelector { + private final String treeKind; + private final LogicalLeafSelector selector; + + public LeafSelector(String treeKind, String leafId) { + this.treeKind = treeKind; + this.selector = 
LogicalLeafSelector.parse(leafId); + } + } + + public static class SnapshotNode { + private final String parentNodeHandle; + private final String nodeHandle; + private final int depth; + private final boolean leaf; + private final DualDigest digest; + private final long itemCount; + private final String leafId; + private final String keyRangeStart; + private final String keyRangeEnd; + private final List childrenHandles = new ArrayList<>(); + + private SnapshotNode( + String parentNodeHandle, + String nodeHandle, + int depth, + boolean leaf, + DualDigest digest, + long itemCount, + String leafId, + String keyRangeStart, + String keyRangeEnd) { + this.parentNodeHandle = parentNodeHandle; + this.nodeHandle = nodeHandle; + this.depth = depth; + this.leaf = leaf; + this.digest = digest; + this.itemCount = itemCount; + this.leafId = leafId; + this.keyRangeStart = keyRangeStart; + this.keyRangeEnd = keyRangeEnd; + } + + public String getParentNodeHandle() { + return parentNodeHandle; + } + + public String getNodeHandle() { + return nodeHandle; + } + + public int getDepth() { + return depth; + } + + public boolean isLeaf() { + return leaf; + } + + public DualDigest getDigest() { + return digest; + } + + public long getItemCount() { + return itemCount; + } + + public String getLeafId() { + return leafId; + } + + public String getKeyRangeStart() { + return keyRangeStart; + } + + public String getKeyRangeEnd() { + return keyRangeEnd; + } + } + + private interface LiveCellConsumer { + void accept( + String deviceId, + String measurement, + TSDataType type, + long time, + Object value, + boolean aligned) + throws Exception; + } + + private static class RegionState { + private final ConcurrentHashMap partitions = new ConcurrentHashMap<>(); + private final Object persistMonitor = new Object(); + private Map lastPersistedMutationEpochs = Collections.emptyMap(); + private Map pendingPersistMutationEpochs = null; + private boolean persistInFlight = false; + } + + private static 
class RepairMutationContext { + private final String consensusGroupKey; + private final long partitionId; + private final String repairEpoch; + + private RepairMutationContext(String consensusGroupKey, long partitionId, String repairEpoch) { + this.consensusGroupKey = consensusGroupKey; + this.partitionId = partitionId; + this.repairEpoch = repairEpoch; + } + } + + private static class RepairEpochRef { + private final long partitionId; + private final long snapshotEpoch; + private final long partitionMutationEpoch; + + private RepairEpochRef(long partitionId, long snapshotEpoch, long partitionMutationEpoch) { + this.partitionId = partitionId; + this.snapshotEpoch = snapshotEpoch; + this.partitionMutationEpoch = partitionMutationEpoch; + } + + private static RepairEpochRef parse(String repairEpoch) { + if (repairEpoch == null || repairEpoch.isEmpty()) { + throw new IllegalStateException("Repair epoch is missing"); + } + String[] parts = repairEpoch.split(":"); + if (parts.length < 5) { + throw new IllegalStateException("Invalid repair epoch: " + repairEpoch); + } + try { + return new RepairEpochRef( + Long.parseLong(parts[1]), Long.parseLong(parts[3]), Long.parseLong(parts[4])); + } catch (NumberFormatException e) { + throw new IllegalStateException("Invalid repair epoch: " + repairEpoch, e); + } + } + } + + private static class PartitionState { + private final long partitionId; + private long partitionMutationEpoch; + private long snapshotEpoch = Long.MIN_VALUE; + private RepairProgressTable.SnapshotState snapshotState = + RepairProgressTable.SnapshotState.DIRTY; + private SnapshotTree liveTree = SnapshotTree.empty(); + private SnapshotTree tombstoneTree = SnapshotTree.empty(); + private String lastError; + + private PartitionState(long partitionId) { + this.partitionId = partitionId; + } + + private PartitionInspection toInspection(long partitionId) { + return new PartitionInspection( + partitionId, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + 
liveTree.rootDigest, + tombstoneTree.rootDigest, + lastError); + } + + private SnapshotTree getTree(String treeKind) { + return TREE_KIND_TOMBSTONE.equalsIgnoreCase(treeKind) ? tombstoneTree : liveTree; + } + } + + private static class SnapshotTree { + private static final String ROOT_HANDLE = "root"; + + private final Map nodesByHandle = new LinkedHashMap<>(); + private DualDigest rootDigest = DualDigest.ZERO; + + private static SnapshotTree empty() { + SnapshotTree tree = new SnapshotTree(); + tree.nodesByHandle.put( + ROOT_HANDLE, + new SnapshotNode("", ROOT_HANDLE, 0, false, DualDigest.ZERO, 0L, null, null, null)); + return tree; + } + + private void addLeafEntry( + String leafId, int shard, String logicalKey, long hash, long itemCount) { + String shardHandle = "shard:" + shard; + nodesByHandle.computeIfAbsent( + ROOT_HANDLE, + ignored -> + new SnapshotNode("", ROOT_HANDLE, 0, false, DualDigest.ZERO, 0L, null, null, null)); + SnapshotNode shardNode = + nodesByHandle.computeIfAbsent( + shardHandle, + ignored -> + new SnapshotNode( + ROOT_HANDLE, shardHandle, 1, false, DualDigest.ZERO, 0L, null, null, null)); + if (!nodesByHandle.get(ROOT_HANDLE).childrenHandles.contains(shardHandle)) { + nodesByHandle.get(ROOT_HANDLE).childrenHandles.add(shardHandle); + } + SnapshotNode leafNode = + nodesByHandle.computeIfAbsent( + leafId, + ignored -> + new SnapshotNode( + shardHandle, + leafId, + 2, + true, + DualDigest.ZERO, + 0L, + leafId, + logicalKey, + logicalKey)); + if (!shardNode.childrenHandles.contains(leafId)) { + shardNode.childrenHandles.add(leafId); + } + mergeDigest(leafNode, logicalKey, hash, itemCount); + } + + private void finalizeTree() { + Map rebuilt = new LinkedHashMap<>(); + SnapshotNode root = nodesByHandle.get(ROOT_HANDLE); + if (root == null) { + root = new SnapshotNode("", ROOT_HANDLE, 0, false, DualDigest.ZERO, 0L, null, null, null); + } + DualDigest aggregatedRoot = DualDigest.ZERO; + long rootItemCount = 0L; + rebuilt.put(ROOT_HANDLE, root); + 
List shardHandles = new ArrayList<>(root.childrenHandles); + shardHandles.sort(String::compareTo); + root.childrenHandles.clear(); + for (String shardHandle : shardHandles) { + SnapshotNode rawShard = nodesByHandle.get(shardHandle); + if (rawShard == null) { + continue; + } + DualDigest shardDigest = DualDigest.ZERO; + long shardItemCount = 0L; + SnapshotNode shard = + new SnapshotNode( + ROOT_HANDLE, shardHandle, 1, false, DualDigest.ZERO, 0L, null, null, null); + List leafHandles = new ArrayList<>(rawShard.childrenHandles); + leafHandles.sort(String::compareTo); + for (String leafHandle : leafHandles) { + SnapshotNode leaf = nodesByHandle.get(leafHandle); + if (leaf == null) { + continue; + } + shard.childrenHandles.add(leafHandle); + shardDigest = shardDigest.merge(leaf.digest); + shardItemCount += leaf.itemCount; + rebuilt.put(leafHandle, leaf); + } + SnapshotNode finalizedShard = + new SnapshotNode( + ROOT_HANDLE, shardHandle, 1, false, shardDigest, shardItemCount, null, null, null); + finalizedShard.childrenHandles.addAll(shard.childrenHandles); + rebuilt.put(shardHandle, finalizedShard); + root.childrenHandles.add(shardHandle); + aggregatedRoot = aggregatedRoot.merge(shardDigest); + rootItemCount += shardItemCount; + } + SnapshotNode finalizedRoot = + new SnapshotNode( + "", ROOT_HANDLE, 0, false, aggregatedRoot, rootItemCount, null, null, null); + finalizedRoot.childrenHandles.addAll(root.childrenHandles); + rebuilt.put(ROOT_HANDLE, finalizedRoot); + nodesByHandle.clear(); + nodesByHandle.putAll(rebuilt); + rootDigest = aggregatedRoot; + } + + private void mergeDigest(SnapshotNode leafNode, String logicalKey, long hash, long itemCount) { + SnapshotNode merged = + new SnapshotNode( + leafNode.parentNodeHandle, + leafNode.nodeHandle, + leafNode.depth, + leafNode.leaf, + leafNode.digest.merge(DualDigest.fromSingleHash(hash)), + leafNode.itemCount + itemCount, + leafNode.leafId, + minComparable(leafNode.keyRangeStart, logicalKey), + 
maxComparable(leafNode.keyRangeEnd, logicalKey)); + merged.childrenHandles.addAll(leafNode.childrenHandles); + nodesByHandle.put(leafNode.nodeHandle, merged); + } + + private static String minComparable(String left, String right) { + if (left == null) { + return right; + } + if (right == null) { + return left; + } + return left.compareTo(right) <= 0 ? left : right; + } + + private static String maxComparable(String left, String right) { + if (left == null) { + return right; + } + if (right == null) { + return left; + } + return left.compareTo(right) >= 0 ? left : right; + } + } + + private static class LogicalLeafSelector { + private final String leafId; + private final int shard; + private final long bucket; + private final String keyRangeStart; + private final String keyRangeEnd; + private final Set exactKeys; + + private LogicalLeafSelector( + String leafId, + int shard, + long bucket, + String keyRangeStart, + String keyRangeEnd, + Set exactKeys) { + this.leafId = leafId; + this.shard = shard; + this.bucket = bucket; + this.keyRangeStart = keyRangeStart; + this.keyRangeEnd = keyRangeEnd; + this.exactKeys = exactKeys == null ? Collections.emptySet() : new LinkedHashSet<>(exactKeys); + } + + private static LogicalLeafSelector parse(String selectorToken) { + String[] selectorParts = selectorToken.split("@", -1); + String rawLeafId = selectorParts[0]; + String[] parts = rawLeafId.split(":"); + if (parts.length != 3 || !"leaf".equals(parts[0])) { + throw new IllegalArgumentException("Unsupported leaf id: " + selectorToken); + } + return new LogicalLeafSelector( + rawLeafId, + Integer.parseInt(parts[1]), + Long.parseLong(parts[2]), + selectorParts.length >= 2 ? decodeNullable(selectorParts[1]) : null, + selectorParts.length >= 3 ? decodeNullable(selectorParts[2]) : null, + selectorParts.length >= 4 ? 
decodeStringSet(selectorParts[3]) : Collections.emptySet()); + } + + private static String leafId(int shard, long bucket) { + return "leaf:" + shard + ":" + bucket; + } + + private boolean matches(String deviceId, long time) { + return computeDeviceShard(deviceId) == shard + && Math.floorDiv(time, LEAF_TIME_BUCKET_MS) == bucket; + } + + private boolean matchesLiveCell( + String deviceId, String measurement, TSDataType dataType, long time, Object value) { + if (!matches(deviceId, time)) { + return false; + } + String logicalKey = encodeLogicalCellStatic(deviceId, measurement, time, dataType, value); + if (!isWithinRange(logicalKey)) { + return false; + } + return exactKeys.isEmpty() || exactKeys.contains(logicalKey); + } + + private boolean matches(TConsistencyDeletionSummary summary) { + return computeDeviceShard(summary.getPathPattern()) == shard + && Math.floorDiv(summary.getTimeRangeStart(), LEAF_TIME_BUCKET_MS) <= bucket + && Math.floorDiv(summary.getTimeRangeEnd(), LEAF_TIME_BUCKET_MS) >= bucket + && isWithinRange(encodeDeletionKeyStatic(summary)); + } + + private boolean requiresScopedReset() { + return !exactKeys.isEmpty() || keyRangeStart != null || keyRangeEnd != null; + } + + private ByteBuffer serialize() { + return ByteBuffer.wrap(toSelectorToken().getBytes(StandardCharsets.UTF_8)); + } + + private String toSelectorToken() { + return leafId + + "@" + + encodeNullable(keyRangeStart) + + "@" + + encodeNullable(keyRangeEnd) + + "@" + + encodeStringSet(exactKeys); + } + + private static LogicalLeafSelector deserialize(ByteBuffer buffer) { + ByteBuffer duplicate = buffer.duplicate(); + byte[] bytes = new byte[duplicate.remaining()]; + duplicate.get(bytes); + return parse(new String(bytes, StandardCharsets.UTF_8)); + } + + private static String encodeNullable(String value) { + if (value == null) { + return ""; + } + return Base64.getUrlEncoder() + .withoutPadding() + .encodeToString(value.getBytes(StandardCharsets.UTF_8)); + } + + private static String 
decodeNullable(String value) { + if (value == null || value.isEmpty()) { + return null; + } + return new String(Base64.getUrlDecoder().decode(value), StandardCharsets.UTF_8); + } + + private static String encodeStringSet(Set values) { + if (values == null || values.isEmpty()) { + return ""; + } + return encodeNullable(String.join("\n", values)); + } + + private static Set decodeStringSet(String encoded) { + String decoded = decodeNullable(encoded); + if (decoded == null || decoded.isEmpty()) { + return Collections.emptySet(); + } + Set values = new LinkedHashSet<>(); + Collections.addAll(values, decoded.split("\n")); + values.remove(""); + return values; + } + + private boolean isWithinRange(String logicalKey) { + if (logicalKey == null) { + return false; + } + if (keyRangeStart != null && logicalKey.compareTo(keyRangeStart) < 0) { + return false; + } + return keyRangeEnd == null || logicalKey.compareTo(keyRangeEnd) <= 0; + } + } + + private static String encodeLogicalCellStatic( + String deviceId, String measurement, long time, TSDataType dataType, Object value) { + return deviceId + + '|' + + measurement + + '|' + + time + + '|' + + dataType.name() + + '|' + + valueHashStatic(value); + } + + private static String encodeDeletionKeyStatic(TConsistencyDeletionSummary summary) { + return summary.getPathPattern() + + '|' + + summary.getTimeRangeStart() + + '|' + + summary.getTimeRangeEnd(); + } + + private static int valueHashStatic(Object value) { + if (value == null) { + return 0; + } + if (value instanceof Binary) { + return value.toString().hashCode(); + } + return value.hashCode(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyRepairService.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyRepairService.java new file mode 100644 index 0000000000000..47c9b33ff6302 --- /dev/null +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyRepairService.java @@ -0,0 +1,589 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TSStatus; +import org.apache.iotdb.commons.consensus.ConsensusGroupId; +import org.apache.iotdb.commons.consensus.DataRegionId; +import org.apache.iotdb.commons.consensus.index.ProgressIndex; +import org.apache.iotdb.commons.path.MeasurementPath; +import org.apache.iotdb.consensus.pipe.metric.IoTConsensusV2SyncLagManager; +import org.apache.iotdb.db.pipe.consensus.deletion.DeletionResource; +import org.apache.iotdb.db.pipe.consensus.deletion.DeletionResourceManager; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.PlanNodeType; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.AbstractDeleteDataNode; +import org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.DeleteDataNode; +import 
org.apache.iotdb.db.queryengine.plan.planner.plan.node.write.InsertRowsNode; +import org.apache.iotdb.db.storageengine.StorageEngine; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.mpp.rpc.thrift.TApplyLogicalRepairBatchReq; +import org.apache.iotdb.mpp.rpc.thrift.TConsistencyDeletionSummary; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TDecodeLeafDiffResp; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffReq; +import org.apache.iotdb.mpp.rpc.thrift.TEstimateLeafDiffResp; +import org.apache.iotdb.mpp.rpc.thrift.TFinishLogicalRepairSessionReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetConsistencyEligibilityResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeReq; +import org.apache.iotdb.mpp.rpc.thrift.TGetSnapshotSubtreeResp; +import org.apache.iotdb.mpp.rpc.thrift.TLeafDiffEntry; +import org.apache.iotdb.mpp.rpc.thrift.TLeafDiffEstimate; +import org.apache.iotdb.mpp.rpc.thrift.TLogicalRepairBatch; +import org.apache.iotdb.mpp.rpc.thrift.TLogicalRepairLeafSelector; +import org.apache.iotdb.mpp.rpc.thrift.TPartitionConsistencyEligibility; +import org.apache.iotdb.mpp.rpc.thrift.TSnapshotSubtreeNode; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairReq; +import org.apache.iotdb.mpp.rpc.thrift.TStreamLogicalRepairResp; +import org.apache.iotdb.rpc.RpcUtils; +import org.apache.iotdb.rpc.TSStatusCode; + +import org.apache.tsfile.utils.PublicBAOS; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +/** DataNode-side logical snapshot and logical repair primitives for 
replica consistency repair. */ +public class DataRegionConsistencyRepairService { + + private static final Logger LOGGER = + LoggerFactory.getLogger(DataRegionConsistencyRepairService.class); + + private final StorageEngine storageEngine = StorageEngine.getInstance(); + private final DataRegionConsistencyManager consistencyManager = + DataRegionConsistencyManager.getInstance(); + private final LogicalRepairSessionJournal repairSessionJournal = + new LogicalRepairSessionJournal(); + + public TGetConsistencyEligibilityResp getConsistencyEligibility( + TGetConsistencyEligibilityReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return new TGetConsistencyEligibilityResp( + RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"), + Long.MAX_VALUE, + Long.MIN_VALUE); + } + + try { + long syncLag = + IoTConsensusV2SyncLagManager.getInstance( + normalizeConsensusGroupIdString(req.getConsensusGroupId())) + .calculateSyncLag(); + long safeWatermark = + dataRegion.getDelayAnalyzer() == null + ? Long.MAX_VALUE + : dataRegion.getDelayAnalyzer().getSafeWatermark(System.currentTimeMillis()); + List regionDeletionSummaries = + collectDeletionSummaries(req.getConsensusGroupId()); + Set timePartitions = new LinkedHashSet<>(dataRegion.getTimePartitions()); + timePartitions.addAll(consistencyManager.getKnownPartitions(req.getConsensusGroupId())); + List orderedTimePartitions = new ArrayList<>(timePartitions); + augmentTimePartitionsWithDeletionRanges(orderedTimePartitions, regionDeletionSummaries); + orderedTimePartitions.sort(Long::compareTo); + + List partitions = new ArrayList<>(); + for (Long timePartition : orderedTimePartitions) { + // Eligibility must expose follower partitions even when the follower-local DelayAnalyzer + // has + // not warmed up yet. 
ConfigNode applies cold-partition pruning from the leader view; if we + // filter here on every replica, a follower can disappear from the compare set and keep the + // partition stuck in DIRTY forever despite the leader already being safe. + List partitionDeletionSummaries = + filterDeletionSummariesForPartition(regionDeletionSummaries, timePartition); + DataRegionConsistencyManager.PartitionInspection inspection = + consistencyManager.inspectPartition( + req.getConsensusGroupId(), dataRegion, timePartition, partitionDeletionSummaries); + partitions.add( + new TPartitionConsistencyEligibility( + inspection.getPartitionId(), + inspection.getPartitionMutationEpoch(), + inspection.getSnapshotEpoch(), + inspection.getSnapshotState().name(), + inspection.getLiveRootDigest().getXorHash(), + inspection.getLiveRootDigest().getAdditiveHash(), + inspection.getTombstoneRootDigest().getXorHash(), + inspection.getTombstoneRootDigest().getAdditiveHash())); + } + + return new TGetConsistencyEligibilityResp(RpcUtils.SUCCESS_STATUS, syncLag, safeWatermark) + .setPartitions(partitions); + } catch (Exception e) { + LOGGER.warn( + "Failed to build consistency eligibility for region {}", req.getConsensusGroupId(), e); + return new TGetConsistencyEligibilityResp( + RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()), + Long.MAX_VALUE, + Long.MIN_VALUE); + } + } + + public TGetSnapshotSubtreeResp getSnapshotSubtree(TGetSnapshotSubtreeReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return new TGetSnapshotSubtreeResp( + RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"), + req.getTimePartitionId(), + req.getSnapshotEpoch()) + .setStale(true); + } + try { + List partitionDeletionSummaries = + filterDeletionSummariesForPartition( + collectDeletionSummaries(req.getConsensusGroupId()), req.getTimePartitionId()); + 
DataRegionConsistencyManager.SnapshotSubtreeResult subtreeResult = + consistencyManager.getSnapshotSubtree( + req.getConsensusGroupId(), + dataRegion, + req.getTimePartitionId(), + req.getSnapshotEpoch(), + req.getTreeKind(), + req.getNodeHandles(), + partitionDeletionSummaries); + List nodes = + subtreeResult.getNodes().stream() + .map( + node -> + new TSnapshotSubtreeNode( + node.getParentNodeHandle(), + node.getNodeHandle(), + req.getTreeKind(), + node.getDepth(), + node.isLeaf(), + node.getDigest().getXorHash(), + node.getDigest().getAdditiveHash(), + node.getItemCount()) + .setLeafId(node.getLeafId()) + .setKeyRangeStart(node.getKeyRangeStart()) + .setKeyRangeEnd(node.getKeyRangeEnd())) + .collect(Collectors.toList()); + return new TGetSnapshotSubtreeResp( + RpcUtils.SUCCESS_STATUS, req.getTimePartitionId(), subtreeResult.getSnapshotEpoch()) + .setStale(subtreeResult.isStale()) + .setNodes(nodes); + } catch (Exception e) { + LOGGER.warn( + "Failed to build snapshot subtree for region {} partition {}", + req.getConsensusGroupId(), + req.getTimePartitionId(), + e); + return new TGetSnapshotSubtreeResp( + RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()), + req.getTimePartitionId(), + req.getSnapshotEpoch()) + .setStale(true); + } + } + + public TEstimateLeafDiffResp estimateLeafDiff(TEstimateLeafDiffReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return new TEstimateLeafDiffResp( + RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"), + req.getTimePartitionId(), + req.getSnapshotEpoch()) + .setStale(true); + } + try { + List partitionDeletionSummaries = + filterDeletionSummariesForPartition( + collectDeletionSummaries(req.getConsensusGroupId()), req.getTimePartitionId()); + DataRegionConsistencyManager.LeafEstimate estimate = + consistencyManager.estimateLeaf( + req.getConsensusGroupId(), + 
dataRegion, + req.getTimePartitionId(), + req.getSnapshotEpoch(), + req.getTreeKind(), + req.getLeafId(), + partitionDeletionSummaries); + if (estimate == null) { + return new TEstimateLeafDiffResp( + RpcUtils.SUCCESS_STATUS, req.getTimePartitionId(), req.getSnapshotEpoch()) + .setStale(true); + } + return new TEstimateLeafDiffResp( + RpcUtils.SUCCESS_STATUS, req.getTimePartitionId(), estimate.getSnapshotEpoch()) + .setLeafDiff( + new TLeafDiffEstimate( + estimate.getPartitionId(), + estimate.getSnapshotEpoch(), + estimate.getTreeKind(), + estimate.getLeafId(), + estimate.getRowCount(), + estimate.getTombstoneCount(), + estimate.getStrataEstimate()) + .setKeyRangeStart(estimate.getKeyRangeStart()) + .setKeyRangeEnd(estimate.getKeyRangeEnd())) + .setStale(false); + } catch (Exception e) { + LOGGER.warn( + "Failed to estimate leaf diff for region {} partition {} leaf {}", + req.getConsensusGroupId(), + req.getTimePartitionId(), + req.getLeafId(), + e); + return new TEstimateLeafDiffResp( + RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()), + req.getTimePartitionId(), + req.getSnapshotEpoch()) + .setStale(true); + } + } + + public TDecodeLeafDiffResp decodeLeafDiff(TDecodeLeafDiffReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return new TDecodeLeafDiffResp( + RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"), + req.getTimePartitionId(), + req.getSnapshotEpoch()) + .setStale(true); + } + try { + List partitionDeletionSummaries = + filterDeletionSummariesForPartition( + collectDeletionSummaries(req.getConsensusGroupId()), req.getTimePartitionId()); + List entries = + consistencyManager.decodeLeaf( + req.getConsensusGroupId(), + dataRegion, + req.getTimePartitionId(), + req.getSnapshotEpoch(), + req.getTreeKind(), + req.getLeafId(), + partitionDeletionSummaries); + if (entries == null) { + return 
new TDecodeLeafDiffResp( + RpcUtils.SUCCESS_STATUS, req.getTimePartitionId(), req.getSnapshotEpoch()) + .setStale(true); + } + return new TDecodeLeafDiffResp( + RpcUtils.SUCCESS_STATUS, req.getTimePartitionId(), req.getSnapshotEpoch()) + .setStale(false) + .setDiffEntries( + entries.stream() + .map(entry -> new TLeafDiffEntry(entry.getLogicalKey(), entry.getDiffType())) + .collect(Collectors.toList())); + } catch (Exception e) { + LOGGER.warn( + "Failed to decode leaf diff for region {} partition {} leaf {}", + req.getConsensusGroupId(), + req.getTimePartitionId(), + req.getLeafId(), + e); + return new TDecodeLeafDiffResp( + RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()), + req.getTimePartitionId(), + req.getSnapshotEpoch()) + .setStale(true); + } + } + + public TStreamLogicalRepairResp streamLogicalRepair(TStreamLogicalRepairReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return new TStreamLogicalRepairResp( + RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"), + req.getTimePartitionId()) + .setStale(true); + } + try { + List partitionDeletionSummaries = + filterDeletionSummariesForPartition( + collectDeletionSummaries(req.getConsensusGroupId()), req.getTimePartitionId()); + List leafSelectors = new ArrayList<>(); + for (TLogicalRepairLeafSelector leafSelector : req.getLeafSelectors()) { + leafSelectors.add( + new DataRegionConsistencyManager.LeafSelector( + leafSelector.getTreeKind(), leafSelector.getLeafId())); + } + List batches = + consistencyManager.streamLogicalRepair( + req.getConsensusGroupId(), + dataRegion, + req.getTimePartitionId(), + req.getRepairEpoch(), + leafSelectors, + partitionDeletionSummaries); + List thriftBatches = + batches.stream() + .map( + batch -> + new TLogicalRepairBatch( + batch.getSessionId(), + batch.getTreeKind(), + batch.getLeafId(), + batch.getSeqNo(), + 
batch.getBatchKind(), + batch.getPayload())) + .collect(Collectors.toList()); + return new TStreamLogicalRepairResp(RpcUtils.SUCCESS_STATUS, req.getTimePartitionId()) + .setStale(false) + .setBatches(thriftBatches); + } catch (Exception e) { + LOGGER.warn( + "Failed to stream logical repair for region {} partition {}", + req.getConsensusGroupId(), + req.getTimePartitionId(), + e); + return new TStreamLogicalRepairResp( + RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()), + req.getTimePartitionId()) + .setStale(true); + } + } + + public TSStatus applyLogicalRepairBatch(TApplyLogicalRepairBatchReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"); + } + try { + repairSessionJournal.stageBatch( + normalizeConsensusGroupIdString(req.getConsensusGroupId()), + req.getTimePartitionId(), + req.getRepairEpoch(), + req.getSessionId(), + req.getTreeKind(), + req.getLeafId(), + req.getSeqNo(), + req.getBatchKind(), + req.bufferForPayload()); + return RpcUtils.SUCCESS_STATUS; + } catch (Exception e) { + LOGGER.warn( + "Failed to stage logical repair batch for region {} partition {} session {} seq {}", + req.getConsensusGroupId(), + req.getTimePartitionId(), + req.getSessionId(), + req.getSeqNo(), + e); + return RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()); + } + } + + public TSStatus finishLogicalRepairSession(TFinishLogicalRepairSessionReq req) { + DataRegion dataRegion = getDataRegion(req.getConsensusGroupId()); + if (dataRegion == null) { + return RpcUtils.getStatus( + TSStatusCode.DATAREGION_PROCESS_ERROR, + "DataRegion " + req.getConsensusGroupId() + " is not found on this DataNode"); + } + try { + List stagedBatches = + repairSessionJournal.loadStagedBatches( + normalizeConsensusGroupIdString(req.getConsensusGroupId()), + 
req.getTimePartitionId(), + req.getRepairEpoch(), + req.getSessionId()); + if (stagedBatches.isEmpty()) { + repairSessionJournal.completeSession(req.getSessionId()); + return RpcUtils.SUCCESS_STATUS; + } + consistencyManager.runWithLogicalRepairMutation( + req.getConsensusGroupId(), + req.getTimePartitionId(), + req.getRepairEpoch(), + () -> { + for (LogicalRepairSessionJournal.StagedBatch stagedBatch : stagedBatches) { + applyStagedBatch(dataRegion, req.getTimePartitionId(), stagedBatch); + } + return null; + }); + repairSessionJournal.completeSession(req.getSessionId()); + return RpcUtils.SUCCESS_STATUS; + } catch (Exception e) { + LOGGER.warn( + "Failed to finish logical repair session for region {} partition {} session {}", + req.getConsensusGroupId(), + req.getTimePartitionId(), + req.getSessionId(), + e); + return RpcUtils.getStatus(TSStatusCode.EXECUTE_STATEMENT_ERROR, e.getMessage()); + } + } + + private void applyStagedBatch( + DataRegion dataRegion, + long timePartitionId, + LogicalRepairSessionJournal.StagedBatch stagedBatch) + throws Exception { + if ("RESET_SCOPE".equals(stagedBatch.getBatchKind())) { + consistencyManager.resetLiveScope( + dataRegion, timePartitionId, stagedBatch.duplicatePayload()); + return; + } + if ("RESET_LEAF".equals(stagedBatch.getBatchKind())) { + consistencyManager.resetLiveLeaf(dataRegion, timePartitionId, stagedBatch.getLeafId()); + return; + } + PlanNode planNode = PlanNodeType.deserialize(stagedBatch.duplicatePayload()); + if (planNode instanceof InsertRowsNode) { + dataRegion.insert((InsertRowsNode) planNode); + return; + } + if (planNode instanceof DeleteDataNode) { + DeleteDataNode deleteDataNode = (DeleteDataNode) planNode; + for (MeasurementPath path : deleteDataNode.getPathList()) { + dataRegion.deleteByDevice(path, deleteDataNode); + } + return; + } + throw new UnsupportedOperationException( + "Unsupported logical repair batch payload: " + planNode.getClass().getSimpleName()); + } + + private List 
collectDeletionSummaries( + TConsensusGroupId consensusGroupId) throws IOException { + DeletionResourceManager deletionResourceManager = + DeletionResourceManager.getInstance(consensusGroupId.getId()); + if (deletionResourceManager == null) { + return Collections.emptyList(); + } + + List deletionSummaries = new ArrayList<>(); + for (DeletionResource deletionResource : deletionResourceManager.getAllDeletionResources()) { + AbstractDeleteDataNode deleteDataNode = deletionResource.getDeleteDataNode(); + if (!(deleteDataNode instanceof DeleteDataNode)) { + continue; + } + DeleteDataNode treeDeleteNode = (DeleteDataNode) deleteDataNode; + ByteBuffer serializedProgressIndex = + serializeProgressIndex(treeDeleteNode.getProgressIndex()); + for (MeasurementPath path : treeDeleteNode.getPathList()) { + deletionSummaries.add( + new TConsistencyDeletionSummary( + path.getFullPath(), + treeDeleteNode.getDeleteStartTime(), + treeDeleteNode.getDeleteEndTime(), + serializedProgressIndex.duplicate())); + } + } + deletionSummaries.sort( + Comparator.comparing(TConsistencyDeletionSummary::getPathPattern) + .thenComparingLong(TConsistencyDeletionSummary::getTimeRangeStart) + .thenComparingLong(TConsistencyDeletionSummary::getTimeRangeEnd)); + return deletionSummaries; + } + + private List filterDeletionSummariesForPartition( + List regionDeletionSummaries, long timePartition) { + if (regionDeletionSummaries.isEmpty()) { + return Collections.emptyList(); + } + + long timePartitionInterval = + org.apache.iotdb.commons.conf.CommonDescriptor.getInstance() + .getConfig() + .getTimePartitionInterval(); + if (timePartitionInterval <= 0) { + return new ArrayList<>(regionDeletionSummaries); + } + + long partitionStart = timePartition * timePartitionInterval; + long partitionEnd = + Long.MAX_VALUE - timePartitionInterval < partitionStart + ? 
Long.MAX_VALUE + : partitionStart + timePartitionInterval - 1; + + List filtered = new ArrayList<>(); + for (TConsistencyDeletionSummary summary : regionDeletionSummaries) { + if (summary.getTimeRangeEnd() >= partitionStart + && summary.getTimeRangeStart() <= partitionEnd) { + filtered.add(summary); + } + } + return filtered; + } + + private void augmentTimePartitionsWithDeletionRanges( + List timePartitions, List regionDeletionSummaries) { + long timePartitionInterval = + org.apache.iotdb.commons.conf.CommonDescriptor.getInstance() + .getConfig() + .getTimePartitionInterval(); + if (timePartitionInterval <= 0 || regionDeletionSummaries.isEmpty()) { + return; + } + + for (TConsistencyDeletionSummary summary : regionDeletionSummaries) { + long startPartition = Math.floorDiv(summary.getTimeRangeStart(), timePartitionInterval); + long endPartition = Math.floorDiv(summary.getTimeRangeEnd(), timePartitionInterval); + for (long partition = startPartition; partition <= endPartition; partition++) { + if (!timePartitions.contains(partition)) { + timePartitions.add(partition); + } + } + } + } + + private ByteBuffer serializeProgressIndex(ProgressIndex progressIndex) throws IOException { + try (PublicBAOS byteArrayOutputStream = new PublicBAOS(); + DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) { + if (progressIndex == null) { + outputStream.writeByte(0); + } else { + progressIndex.serialize(outputStream); + } + return ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()); + } + } + + private DataRegion getDataRegion(TConsensusGroupId consensusGroupId) { + if (consensusGroupId == null) { + return null; + } + ConsensusGroupId groupId = + ConsensusGroupId.Factory.createFromTConsensusGroupId(consensusGroupId); + if (!(groupId instanceof DataRegionId)) { + return null; + } + return storageEngine.getDataRegion((DataRegionId) groupId); + } + + private String normalizeConsensusGroupIdString(TConsensusGroupId consensusGroupId) 
{ + return ConsensusGroupId.Factory.createFromTConsensusGroupId(consensusGroupId).toString(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalConsistencyPartitionStateStore.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalConsistencyPartitionStateStore.java new file mode 100644 index 0000000000000..f51fbb5ea1bbe --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalConsistencyPartitionStateStore.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.storageengine.dataregion.consistency; + +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.AtomicMoveNotSupportedException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.util.Base64; +import java.util.Collections; +import java.util.Map; +import java.util.TreeMap; + +/** + * Persists the partition mutation epochs that the logical consistency layer has already observed. + */ +class LogicalConsistencyPartitionStateStore { + + private static final int FORMAT_VERSION = 1; + + private final Path stateDirOverride; + + LogicalConsistencyPartitionStateStore() { + this(null); + } + + LogicalConsistencyPartitionStateStore(Path stateDir) { + this.stateDirOverride = stateDir; + } + + public Map load(String consensusGroupKey) throws IOException { + Path path = statePath(consensusGroupKey); + if (!Files.exists(path)) { + return Collections.emptyMap(); + } + + try (InputStream inputStream = Files.newInputStream(path, StandardOpenOption.READ)) { + int version = ReadWriteIOUtils.readInt(inputStream); + if (version != FORMAT_VERSION) { + throw new IOException("Unsupported logical consistency partition-state format " + version); + } + int partitionCount = ReadWriteIOUtils.readInt(inputStream); + Map mutationEpochs = new TreeMap<>(); + for (int i = 0; i < partitionCount; i++) { + mutationEpochs.put( + ReadWriteIOUtils.readLong(inputStream), ReadWriteIOUtils.readLong(inputStream)); + } + return mutationEpochs; + } + } + + public void persist(String consensusGroupKey, Map mutationEpochs) throws IOException { + Path stateDir = getStateDir(); + Files.createDirectories(stateDir); + 
Path path = statePath(consensusGroupKey); + Path tmpPath = path.resolveSibling(path.getFileName() + ".tmp"); + + try (FileOutputStream outputStream = new FileOutputStream(tmpPath.toFile())) { + serialize(mutationEpochs, outputStream); + outputStream.getFD().sync(); + } + + try { + Files.move( + tmpPath, path, StandardCopyOption.ATOMIC_MOVE, StandardCopyOption.REPLACE_EXISTING); + } catch (AtomicMoveNotSupportedException e) { + Files.move(tmpPath, path, StandardCopyOption.REPLACE_EXISTING); + } + } + + private Path statePath(String consensusGroupKey) { + return getStateDir().resolve(encodeFileName(consensusGroupKey) + ".state"); + } + + private Path getStateDir() { + if (stateDirOverride != null) { + return stateDirOverride; + } + return Paths.get( + IoTDBDescriptor.getInstance().getConfig().getSystemDir(), + "consistency-check", + "partition-state"); + } + + private String encodeFileName(String consensusGroupKey) { + return Base64.getUrlEncoder() + .withoutPadding() + .encodeToString(consensusGroupKey.getBytes(StandardCharsets.UTF_8)); + } + + private void serialize(Map mutationEpochs, OutputStream outputStream) + throws IOException { + ReadWriteIOUtils.write(FORMAT_VERSION, outputStream); + ReadWriteIOUtils.write(mutationEpochs.size(), outputStream); + for (Map.Entry entry : mutationEpochs.entrySet()) { + ReadWriteIOUtils.write(entry.getKey(), outputStream); + ReadWriteIOUtils.write(entry.getValue(), outputStream); + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalRepairSessionJournal.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalRepairSessionJournal.java new file mode 100644 index 0000000000000..8fb78552f05b3 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalRepairSessionJournal.java @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
+ * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.storageengine.dataregion.consistency; + +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.file.AtomicMoveNotSupportedException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; + +/** Durable staging journal for follower-side logical repair sessions. 
*/ +class LogicalRepairSessionJournal { + + private static final int FORMAT_VERSION = 1; + + private final Path journalDirOverride; + private final ConcurrentHashMap sessions = new ConcurrentHashMap<>(); + + LogicalRepairSessionJournal() { + this(null); + } + + LogicalRepairSessionJournal(Path journalDir) { + this.journalDirOverride = journalDir; + } + + public synchronized void stageBatch( + String consensusGroupKey, + long partitionId, + String repairEpoch, + String sessionId, + String treeKind, + String leafId, + int seqNo, + String batchKind, + ByteBuffer payload) + throws IOException { + SessionState sessionState = + loadOrCreateSession(consensusGroupKey, partitionId, repairEpoch, sessionId); + if (sessionState.batches.containsKey(seqNo)) { + return; + } + sessionState.batches.put( + seqNo, + new StagedBatch(sessionId, treeKind, leafId, seqNo, batchKind, duplicatePayload(payload))); + persist(sessionState); + } + + public synchronized List loadStagedBatches( + String consensusGroupKey, long partitionId, String repairEpoch, String sessionId) + throws IOException { + SessionState sessionState = loadSession(sessionId); + if (sessionState == null) { + return Collections.emptyList(); + } + validateSession(sessionState, consensusGroupKey, partitionId, repairEpoch, sessionId); + return new ArrayList<>(sessionState.batches.values()); + } + + public synchronized void completeSession(String sessionId) throws IOException { + sessions.remove(sessionId); + Files.deleteIfExists(sessionPath(sessionId)); + } + + private SessionState loadOrCreateSession( + String consensusGroupKey, long partitionId, String repairEpoch, String sessionId) + throws IOException { + SessionState existing = loadSession(sessionId); + if (existing != null) { + validateSession(existing, consensusGroupKey, partitionId, repairEpoch, sessionId); + return existing; + } + + SessionState created = + new SessionState(consensusGroupKey, partitionId, repairEpoch, sessionId, new TreeMap<>()); + 
sessions.put(sessionId, created); + persist(created); + return created; + } + + private SessionState loadSession(String sessionId) throws IOException { + SessionState cached = sessions.get(sessionId); + if (cached != null) { + return cached; + } + + Path sessionPath = sessionPath(sessionId); + if (!Files.exists(sessionPath)) { + return null; + } + + try (InputStream inputStream = Files.newInputStream(sessionPath, StandardOpenOption.READ)) { + SessionState loaded = deserialize(inputStream); + sessions.put(sessionId, loaded); + return loaded; + } + } + + private void validateSession( + SessionState sessionState, + String consensusGroupKey, + long partitionId, + String repairEpoch, + String sessionId) { + if (!Objects.equals(sessionState.consensusGroupKey, consensusGroupKey) + || sessionState.partitionId != partitionId + || !Objects.equals(sessionState.repairEpoch, repairEpoch) + || !Objects.equals(sessionState.sessionId, sessionId)) { + throw new IllegalStateException( + "Logical repair session " + sessionId + " conflicts with current request metadata"); + } + } + + private void persist(SessionState sessionState) throws IOException { + Path journalDir = getJournalDir(); + Files.createDirectories(journalDir); + Path sessionPath = sessionPath(sessionState.sessionId); + Path tmpPath = sessionPath.resolveSibling(sessionPath.getFileName() + ".tmp"); + + try (FileOutputStream outputStream = new FileOutputStream(tmpPath.toFile())) { + serialize(sessionState, outputStream); + outputStream.getFD().sync(); + } + + try { + Files.move( + tmpPath, + sessionPath, + StandardCopyOption.ATOMIC_MOVE, + StandardCopyOption.REPLACE_EXISTING); + } catch (AtomicMoveNotSupportedException e) { + Files.move(tmpPath, sessionPath, StandardCopyOption.REPLACE_EXISTING); + } + } + + private Path sessionPath(String sessionId) { + return getJournalDir().resolve(sessionId + ".session"); + } + + private Path getJournalDir() { + if (journalDirOverride != null) { + return journalDirOverride; + } + 
return Paths.get( + IoTDBDescriptor.getInstance().getConfig().getSystemDir(), "consistency-repair", "sessions"); + } + + private static byte[] duplicatePayload(ByteBuffer payload) { + if (payload == null) { + return new byte[0]; + } + ByteBuffer duplicate = payload.duplicate(); + byte[] bytes = new byte[duplicate.remaining()]; + duplicate.get(bytes); + return bytes; + } + + private static void serialize(SessionState sessionState, OutputStream outputStream) + throws IOException { + ReadWriteIOUtils.write(FORMAT_VERSION, outputStream); + ReadWriteIOUtils.write(sessionState.consensusGroupKey, outputStream); + ReadWriteIOUtils.write(sessionState.partitionId, outputStream); + ReadWriteIOUtils.write(sessionState.repairEpoch, outputStream); + ReadWriteIOUtils.write(sessionState.sessionId, outputStream); + ReadWriteIOUtils.write(sessionState.batches.size(), outputStream); + for (StagedBatch stagedBatch : sessionState.batches.values()) { + ReadWriteIOUtils.write(stagedBatch.sessionId, outputStream); + ReadWriteIOUtils.write(stagedBatch.treeKind, outputStream); + ReadWriteIOUtils.write(stagedBatch.leafId, outputStream); + ReadWriteIOUtils.write(stagedBatch.seqNo, outputStream); + ReadWriteIOUtils.write(stagedBatch.batchKind, outputStream); + ReadWriteIOUtils.write(stagedBatch.payload.length, outputStream); + outputStream.write(stagedBatch.payload); + } + } + + private static SessionState deserialize(InputStream inputStream) throws IOException { + int formatVersion = ReadWriteIOUtils.readInt(inputStream); + if (formatVersion != FORMAT_VERSION) { + throw new IOException("Unsupported logical repair session format " + formatVersion); + } + String consensusGroupKey = ReadWriteIOUtils.readString(inputStream); + long partitionId = ReadWriteIOUtils.readLong(inputStream); + String repairEpoch = ReadWriteIOUtils.readString(inputStream); + String sessionId = ReadWriteIOUtils.readString(inputStream); + int batchCount = ReadWriteIOUtils.readInt(inputStream); + Map batches = new 
TreeMap<>(); + for (int i = 0; i < batchCount; i++) { + String batchSessionId = ReadWriteIOUtils.readString(inputStream); + String treeKind = ReadWriteIOUtils.readString(inputStream); + String leafId = ReadWriteIOUtils.readString(inputStream); + int seqNo = ReadWriteIOUtils.readInt(inputStream); + String batchKind = ReadWriteIOUtils.readString(inputStream); + int payloadSize = ReadWriteIOUtils.readInt(inputStream); + byte[] payload = new byte[payloadSize]; + int offset = 0; + while (offset < payloadSize) { + int read = inputStream.read(payload, offset, payloadSize - offset); + if (read < 0) { + throw new IOException("Unexpected end of logical repair session journal"); + } + offset += read; + } + batches.put( + seqNo, new StagedBatch(batchSessionId, treeKind, leafId, seqNo, batchKind, payload)); + } + return new SessionState(consensusGroupKey, partitionId, repairEpoch, sessionId, batches); + } + + static final class StagedBatch { + private final String sessionId; + private final String treeKind; + private final String leafId; + private final int seqNo; + private final String batchKind; + private final byte[] payload; + + private StagedBatch( + String sessionId, + String treeKind, + String leafId, + int seqNo, + String batchKind, + byte[] payload) { + this.sessionId = sessionId; + this.treeKind = treeKind; + this.leafId = leafId; + this.seqNo = seqNo; + this.batchKind = batchKind; + this.payload = payload == null ? 
new byte[0] : Arrays.copyOf(payload, payload.length); + } + + String getSessionId() { + return sessionId; + } + + String getTreeKind() { + return treeKind; + } + + String getLeafId() { + return leafId; + } + + int getSeqNo() { + return seqNo; + } + + String getBatchKind() { + return batchKind; + } + + ByteBuffer duplicatePayload() { + return ByteBuffer.wrap(Arrays.copyOf(payload, payload.length)); + } + } + + private static final class SessionState { + private final String consensusGroupKey; + private final long partitionId; + private final String repairEpoch; + private final String sessionId; + private final Map batches; + + private SessionState( + String consensusGroupKey, + long partitionId, + String repairEpoch, + String sessionId, + Map batches) { + this.consensusGroupKey = consensusGroupKey; + this.partitionId = partitionId; + this.repairEpoch = repairEpoch; + this.sessionId = sessionId; + this.batches = batches; + } + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/read/control/QueryResourceManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/read/control/QueryResourceManager.java index 930d68b4e891d..911d50e37f31f 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/read/control/QueryResourceManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/read/control/QueryResourceManager.java @@ -49,6 +49,16 @@ public long assignQueryId() { return queryIdAtom.incrementAndGet(); } + /** + * Register a unique internal read id that also participates in QueryFileManager reference + * tracking. Use this for non-session background reads that manage file references directly. + */ + public long assignInternalQueryId() { + long queryId = queryIdAtom.incrementAndGet(); + filePathsManager.addQueryId(queryId); + return queryId; + } + /** * Register a read id for compaction. 
The name of the compaction thread is * 'pool-x-IoTDB-Compaction-xx', xx in which is usually an integer from 0 to diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileManager.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileManager.java index b7c1ba2c14fb4..c44d03bc3a8cf 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileManager.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileManager.java @@ -19,8 +19,11 @@ package org.apache.iotdb.db.storageengine.dataregion.tsfile; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; import org.apache.iotdb.commons.utils.TimePartitionUtils; import org.apache.iotdb.db.pipe.resource.PipeDataNodeResourceManager; +import org.apache.iotdb.db.storageengine.dataregion.consistency.DataRegionConsistencyManager; import org.apache.iotdb.db.storageengine.dataregion.modification.ModFileManagement; import org.apache.iotdb.db.storageengine.dataregion.modification.PartitionLevelModFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.timeindex.FileTimeIndexCacheRecorder; @@ -349,6 +352,14 @@ public void replace( unseqFileResources, targetFileResources); } + + DataRegionConsistencyManager.getInstance() + .onCompaction( + new TConsensusGroupId(TConsensusGroupType.DataRegion, Integer.parseInt(dataRegionId)), + seqFileResources, + unseqFileResources, + targetFileResources, + timePartition); } public boolean contains(TsFileResource tsFileResource, boolean sequence) { diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSinkTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSinkTest.java new file mode 100644 index 
0000000000000..b7ef3e9570310 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/pipe/sink/protocol/iotconsensusv2/IoTConsensusV2AsyncSinkTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.pipe.sink.protocol.iotconsensusv2; + +import org.apache.iotdb.commons.pipe.event.EnrichedEvent; +import org.apache.iotdb.db.pipe.consensus.ReplicateProgressDataNodeManager; +import org.apache.iotdb.db.pipe.consensus.metric.IoTConsensusV2SinkMetrics; + +import org.junit.Assert; +import org.junit.Test; +import org.mockito.Mockito; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +public class IoTConsensusV2AsyncSinkTest { + + @Test + public void testLeaderReplicateProgressIgnoresPreAssignedButDiscardedEvents() throws Exception { + IoTConsensusV2AsyncSink sink = new IoTConsensusV2AsyncSink(); + setField(sink, "iotConsensusV2SinkMetrics", Mockito.mock(IoTConsensusV2SinkMetrics.class)); + + String pipeName = "__consensus.test_" + System.nanoTime(); + try { + Assert.assertEquals( + 1L, ReplicateProgressDataNodeManager.assignReplicateIndexForIoTV2(pipeName)); + Assert.assertEquals( + "Source-side replicate index assignment should not affect connector sync lag progress", + 0L, + sink.getLeaderReplicateProgress()); + + EnrichedEvent enqueuedEvent = Mockito.mock(EnrichedEvent.class); + Mockito.when(enqueuedEvent.getReplicateIndexForIoTV2()).thenReturn(1L); + Mockito.when(enqueuedEvent.increaseReferenceCount(Mockito.anyString())).thenReturn(true); + + Assert.assertTrue(invokeAddEvent2Buffer(sink, enqueuedEvent)); + Assert.assertEquals(1L, sink.getLeaderReplicateProgress()); + + Assert.assertEquals( + 2L, ReplicateProgressDataNodeManager.assignReplicateIndexForIoTV2(pipeName)); + Assert.assertEquals( + "Discarded events that never enter the connector must not create phantom lag", + 1L, + sink.getLeaderReplicateProgress()); + } finally { + ReplicateProgressDataNodeManager.resetReplicateIndexForIoTV2(pipeName); + } + } + + private boolean invokeAddEvent2Buffer(IoTConsensusV2AsyncSink sink, EnrichedEvent event) + throws Exception { + Method method = + IoTConsensusV2AsyncSink.class.getDeclaredMethod("addEvent2Buffer", 
EnrichedEvent.class); + method.setAccessible(true); + return (boolean) method.invoke(sink, event); + } + + private void setField(Object target, String fieldName, Object value) throws Exception { + Field field = target.getClass().getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactoryTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactoryTest.java new file mode 100644 index 0000000000000..eac9b59250937 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/queryengine/plan/relational/planner/optimizations/DataNodeLocationSupplierFactoryTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.queryengine.plan.relational.planner.optimizations; + +import org.apache.iotdb.commons.schema.table.InformationSchema; + +import org.junit.Assert; +import org.junit.Test; + +public class DataNodeLocationSupplierFactoryTest { + + @Test + public void testRepairProgressUsesLocalInformationSchemaSupplier() { + Assert.assertEquals( + 1, + DataNodeLocationSupplierFactory.getSupplier() + .getDataNodeLocations(InformationSchema.REPAIR_PROGRESS) + .size()); + } +} diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyManagerTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyManagerTest.java new file mode 100644 index 0000000000000..e1bc5c58efb69 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/consistency/DataRegionConsistencyManagerTest.java @@ -0,0 +1,595 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.storageengine.dataregion.consistency; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; +import org.apache.iotdb.commons.consensus.iotv2.consistency.RepairProgressTable; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.storageengine.dataregion.compaction.execute.utils.MultiTsFileDeviceIterator; +import org.apache.iotdb.db.storageengine.dataregion.compaction.utils.CompactionTestFileWriter; +import org.apache.iotdb.db.storageengine.dataregion.memtable.AlignedWritableMemChunkGroup; +import org.apache.iotdb.db.storageengine.dataregion.memtable.IMemTable; +import org.apache.iotdb.db.storageengine.dataregion.memtable.IWritableMemChunkGroup; +import org.apache.iotdb.db.storageengine.dataregion.memtable.PrimitiveMemTable; +import org.apache.iotdb.db.storageengine.dataregion.memtable.TsFileProcessor; +import org.apache.iotdb.db.storageengine.dataregion.memtable.WritableMemChunkGroup; +import org.apache.iotdb.db.storageengine.dataregion.read.control.FileReaderManager; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + +import org.apache.tsfile.enums.TSDataType; +import org.apache.tsfile.file.metadata.IDeviceID; +import org.apache.tsfile.file.metadata.enums.CompressionType; +import org.apache.tsfile.file.metadata.enums.TSEncoding; +import org.apache.tsfile.read.common.TimeRange; +import org.apache.tsfile.write.schema.IMeasurementSchema; +import org.apache.tsfile.write.schema.MeasurementSchema; +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; +import org.mockito.Mockito; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.nio.channels.ClosedChannelException; +import 
java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedDeque; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; + +public class DataRegionConsistencyManagerTest { + + private final List tempStateDirs = new ArrayList<>(); + + @After + public void tearDown() throws IOException { + for (Path tempStateDir : tempStateDirs) { + deleteRecursively(tempStateDir); + } + tempStateDirs.clear(); + } + + @Test + public void compactionShouldNotAdvancePartitionMutationEpoch() throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + TConsensusGroupId consensusGroupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 901); + + consistencyManager.onPartitionMutation(consensusGroupId, 7L); + long beforeEpoch = getPartitionMutationEpoch(consistencyManager, consensusGroupId, 7L); + + consistencyManager.onCompaction( + consensusGroupId, + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList(), + 7L); + + Assert.assertEquals( + beforeEpoch, getPartitionMutationEpoch(consistencyManager, consensusGroupId, 7L)); + Assert.assertEquals( + RepairProgressTable.SnapshotState.DIRTY, + getSnapshotState(consistencyManager, consensusGroupId, 7L)); + } + + @Test + public void deletionShouldMarkAffectedPartitionDirty() throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + TConsensusGroupId consensusGroupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 902); + + consistencyManager.onDeletion(consensusGroupId, 0L, 0L); + + Assert.assertEquals(1L, 
getPartitionMutationEpoch(consistencyManager, consensusGroupId, 0L)); + Assert.assertEquals( + RepairProgressTable.SnapshotState.DIRTY, + getSnapshotState(consistencyManager, consensusGroupId, 0L)); + } + + @Test + public void logicalRepairMutationShouldNotAdvancePartitionMutationEpoch() throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + TConsensusGroupId consensusGroupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 903); + + consistencyManager.onPartitionMutation(consensusGroupId, 11L); + long beforeEpoch = getPartitionMutationEpoch(consistencyManager, consensusGroupId, 11L); + + consistencyManager.runWithLogicalRepairMutation( + consensusGroupId, + 11L, + "1:11:0:" + beforeEpoch + ":" + beforeEpoch, + () -> { + consistencyManager.onPartitionMutation(consensusGroupId, 11L); + return null; + }); + + Assert.assertEquals( + beforeEpoch, getPartitionMutationEpoch(consistencyManager, consensusGroupId, 11L)); + Assert.assertEquals( + RepairProgressTable.SnapshotState.DIRTY, + getSnapshotState(consistencyManager, consensusGroupId, 11L)); + } + + @Test + public void exactRepairSelectorShouldRoundTripExactKeys() throws Exception { + Class selectorClass = + Class.forName( + "org.apache.iotdb.db.storageengine.dataregion.consistency.DataRegionConsistencyManager$LogicalLeafSelector"); + Method parseMethod = selectorClass.getDeclaredMethod("parse", String.class); + parseMethod.setAccessible(true); + + String selectorToken = + "leaf:4:9@@@" + + Base64.getUrlEncoder() + .withoutPadding() + .encodeToString( + String.join("\n", "root.db.d1|s1|1|INT64|1", "root.db.d1|s2|2|INT64|2") + .getBytes(StandardCharsets.UTF_8)); + Object selector = parseMethod.invoke(null, selectorToken); + + Field exactKeysField = selectorClass.getDeclaredField("exactKeys"); + exactKeysField.setAccessible(true); + + @SuppressWarnings("unchecked") + java.util.Set exactKeys = (java.util.Set) exactKeysField.get(selector); + Assert.assertEquals(2, 
exactKeys.size()); + Assert.assertTrue(exactKeys.contains("root.db.d1|s1|1|INT64|1")); + Assert.assertTrue(exactKeys.contains("root.db.d1|s2|2|INT64|2")); + } + + @Test + public void rangeRepairSelectorShouldHonorLogicalKeyRange() throws Exception { + Class selectorClass = + Class.forName( + "org.apache.iotdb.db.storageengine.dataregion.consistency.DataRegionConsistencyManager$LogicalLeafSelector"); + Method parseMethod = selectorClass.getDeclaredMethod("parse", String.class); + parseMethod.setAccessible(true); + + Method computeDeviceShardMethod = + DataRegionConsistencyManager.class.getDeclaredMethod("computeDeviceShard", String.class); + computeDeviceShardMethod.setAccessible(true); + int shard = (Integer) computeDeviceShardMethod.invoke(null, "root.db.d1"); + + String selectorToken = + "leaf:" + + shard + + ":0@" + + encodeBase64("root.db.d1|s1|1|INT64|1") + + "@" + + encodeBase64("root.db.d1|s1|3|INT64|3") + + "@"; + Object selector = parseMethod.invoke(null, selectorToken); + + Method matchesLiveCellMethod = + selectorClass.getDeclaredMethod( + "matchesLiveCell", + String.class, + String.class, + TSDataType.class, + long.class, + Object.class); + matchesLiveCellMethod.setAccessible(true); + Method requiresScopedResetMethod = selectorClass.getDeclaredMethod("requiresScopedReset"); + requiresScopedResetMethod.setAccessible(true); + + Assert.assertTrue((Boolean) requiresScopedResetMethod.invoke(selector)); + Assert.assertTrue( + (Boolean) + matchesLiveCellMethod.invoke(selector, "root.db.d1", "s1", TSDataType.INT64, 2L, 2L)); + Assert.assertFalse( + (Boolean) + matchesLiveCellMethod.invoke(selector, "root.db.d1", "s1", TSDataType.INT64, 4L, 4L)); + Assert.assertFalse( + (Boolean) + matchesLiveCellMethod.invoke(selector, "root.db.d1", "s1", TSDataType.INT64, 0L, 0L)); + } + + @Test + public void snapshotTreeShouldTrackLogicalKeyBounds() throws Exception { + Class snapshotTreeClass = + Class.forName( + 
"org.apache.iotdb.db.storageengine.dataregion.consistency.DataRegionConsistencyManager$SnapshotTree"); + Method emptyMethod = snapshotTreeClass.getDeclaredMethod("empty"); + emptyMethod.setAccessible(true); + Object snapshotTree = emptyMethod.invoke(null); + + Method addLeafEntryMethod = + snapshotTreeClass.getDeclaredMethod( + "addLeafEntry", String.class, int.class, String.class, long.class, long.class); + addLeafEntryMethod.setAccessible(true); + addLeafEntryMethod.invoke(snapshotTree, "leaf:7:0", 7, "root.db.d1|s1|2|INT64|2", 11L, 1L); + addLeafEntryMethod.invoke(snapshotTree, "leaf:7:0", 7, "root.db.d1|s1|1|INT64|1", 13L, 1L); + addLeafEntryMethod.invoke(snapshotTree, "leaf:7:0", 7, "root.db.d1|s1|3|INT64|3", 17L, 1L); + + Field nodesByHandleField = snapshotTreeClass.getDeclaredField("nodesByHandle"); + nodesByHandleField.setAccessible(true); + @SuppressWarnings("unchecked") + Map nodesByHandle = (Map) nodesByHandleField.get(snapshotTree); + Object leafNode = nodesByHandle.get("leaf:7:0"); + Assert.assertNotNull(leafNode); + + Field keyRangeStartField = leafNode.getClass().getDeclaredField("keyRangeStart"); + keyRangeStartField.setAccessible(true); + Field keyRangeEndField = leafNode.getClass().getDeclaredField("keyRangeEnd"); + keyRangeEndField.setAccessible(true); + + Assert.assertEquals("root.db.d1|s1|1|INT64|1", keyRangeStartField.get(leafNode)); + Assert.assertEquals("root.db.d1|s1|3|INT64|3", keyRangeEndField.get(leafNode)); + } + + @Test + public void memTableSeriesDiscoveryShouldIncludeUnsealedMeasurements() throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + + IMeasurementSchema alignedS1 = new MeasurementSchema("s1", TSDataType.INT64); + IMeasurementSchema alignedS2 = new MeasurementSchema("s2", TSDataType.DOUBLE); + List alignedSchemas = java.util.Arrays.asList(alignedS1, alignedS2); + AlignedWritableMemChunkGroup alignedGroup = + new AlignedWritableMemChunkGroup(alignedSchemas, false); + 
alignedGroup.writeRow(1L, new Object[] {1L, 1.0d}, alignedSchemas); + + IMeasurementSchema nonAlignedSchema = new MeasurementSchema("s3", TSDataType.INT32); + WritableMemChunkGroup nonAlignedGroup = new WritableMemChunkGroup(); + nonAlignedGroup.writeRow(2L, new Object[] {1}, Collections.singletonList(nonAlignedSchema)); + + Map memTableMap = new HashMap<>(); + memTableMap.put(IDeviceID.Factory.DEFAULT_FACTORY.create("root.db.d1"), alignedGroup); + memTableMap.put(IDeviceID.Factory.DEFAULT_FACTORY.create("root.db.d2"), nonAlignedGroup); + PrimitiveMemTable memTable = new PrimitiveMemTable("root.db", "1", memTableMap); + + Method collectMemTableSeriesContexts = + DataRegionConsistencyManager.class.getDeclaredMethod( + "collectMemTableSeriesContexts", IMemTable.class, Map.class); + collectMemTableSeriesContexts.setAccessible(true); + + Map deviceSeriesContexts = new java.util.TreeMap<>(); + collectMemTableSeriesContexts.invoke(consistencyManager, memTable, deviceSeriesContexts); + + Assert.assertEquals(2, deviceSeriesContexts.size()); + assertDeviceSeriesContext(deviceSeriesContexts.get("root.db.d1"), true, "s1", "s2"); + assertDeviceSeriesContext(deviceSeriesContexts.get("root.db.d2"), false, "s3"); + } + + @Test + public void closedChannelFailuresShouldBeClassifiedAsRetryable() throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + Method method = + DataRegionConsistencyManager.class.getDeclaredMethod( + "isRetryableSnapshotReadFailure", Throwable.class); + method.setAccessible(true); + + Assert.assertTrue((Boolean) method.invoke(consistencyManager, new ClosedChannelException())); + Assert.assertTrue( + (Boolean) + method.invoke( + consistencyManager, new IOException("wrapped", new ClosedChannelException()))); + Assert.assertFalse( + (Boolean) method.invoke(consistencyManager, new IOException("other io failure"))); + } + + @Test + public void inspectPartitionShouldRebuildSnapshotEvenWhenWorkingProcessorsExist() + 
throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + TConsensusGroupId consensusGroupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 904); + DataRegion dataRegion = createDataRegionWithEmptySnapshotInputs(1L, true); + + DataRegionConsistencyManager.PartitionInspection inspection = + consistencyManager.inspectPartition( + consensusGroupId, dataRegion, 1L, Collections.emptyList()); + + Assert.assertEquals(1L, inspection.getPartitionId()); + Assert.assertEquals(RepairProgressTable.SnapshotState.READY, inspection.getSnapshotState()); + Assert.assertEquals(0L, inspection.getPartitionMutationEpoch()); + Assert.assertEquals(0L, inspection.getSnapshotEpoch()); + } + + @Test + public void knownPartitionsShouldRecoverFromPersistedPartitionState() throws Exception { + Path stateDir = Files.createTempDirectory("logical-consistency-partition-state"); + try { + LogicalConsistencyPartitionStateStore store = + new LogicalConsistencyPartitionStateStore(stateDir); + DataRegionConsistencyManager writer = new DataRegionConsistencyManager(store); + TConsensusGroupId consensusGroupId = + new TConsensusGroupId(TConsensusGroupType.DataRegion, 905); + + writer.onPartitionMutation(consensusGroupId, 2L); + + DataRegionConsistencyManager recovered = new DataRegionConsistencyManager(store); + Assert.assertEquals( + Collections.singletonList(2L), recovered.getKnownPartitions(consensusGroupId)); + Assert.assertEquals(1L, getPartitionMutationEpoch(recovered, consensusGroupId, 2L)); + } finally { + deleteRecursively(stateDir); + } + } + + @Test + public void persistKnownPartitionsShouldNotBlockConcurrentMutation() throws Exception { + BlockingPartitionStateStore store = new BlockingPartitionStateStore(); + DataRegionConsistencyManager consistencyManager = new DataRegionConsistencyManager(store); + TConsensusGroupId consensusGroupId = new TConsensusGroupId(TConsensusGroupType.DataRegion, 906); + DataRegion dataRegion = 
createDataRegionWithEmptySnapshotInputs(1L, true); + AtomicReference inspectionFailure = new AtomicReference<>(); + + Thread inspectionThread = + new Thread( + () -> { + try { + consistencyManager.inspectPartition( + consensusGroupId, dataRegion, 1L, Collections.emptyList()); + } catch (Throwable t) { + inspectionFailure.set(t); + } + }); + inspectionThread.start(); + + Assert.assertTrue( + "The initial persistence should start", store.awaitPersistStarted(5, TimeUnit.SECONDS)); + + long mutationStartNanos = System.nanoTime(); + consistencyManager.onPartitionMutation(consensusGroupId, 2L); + long mutationElapsedMillis = + TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - mutationStartNanos); + Assert.assertTrue( + "Concurrent partition mutation should not wait on slow persistence", + mutationElapsedMillis < 1_000L); + + store.releasePersist(); + inspectionThread.join(TimeUnit.SECONDS.toMillis(5)); + if (inspectionFailure.get() != null) { + throw new AssertionError("Inspection thread should succeed", inspectionFailure.get()); + } + + Map persistedState = store.load(consensusGroupId.toString()); + Assert.assertEquals(Long.valueOf(0L), persistedState.get(1L)); + Assert.assertEquals(Long.valueOf(1L), persistedState.get(2L)); + } + + @Test + public void collectLogicalSeriesContextsShouldNotPoisonSharedReaders() throws Exception { + DataRegionConsistencyManager consistencyManager = newTestConsistencyManager(); + Path tempDir = Files.createTempDirectory("consistency-manager-reader-regression"); + TsFileResource resource = null; + MultiTsFileDeviceIterator verificationIterator = null; + try { + Path tsFileDir = tempDir.resolve("sequence/root.testsg/1/1"); + Files.createDirectories(tsFileDir); + File tsFile = tsFileDir.resolve("1-1-0-0.tsfile").toFile(); + resource = new TsFileResource(tsFile); + try (CompactionTestFileWriter writer = new CompactionTestFileWriter(resource)) { + writer.startChunkGroup("d1"); + writer.generateSimpleNonAlignedSeriesToCurrentDevice( + "s1", new 
TimeRange[] {new TimeRange(1, 3)}, TSEncoding.PLAIN, CompressionType.LZ4); + writer.endChunkGroup(); + writer.endFile(); + } + resource.close(); + + TsFileManager tsFileManager = Mockito.mock(TsFileManager.class); + DataRegion dataRegion = Mockito.mock(DataRegion.class); + Mockito.when(dataRegion.getTsFileManager()).thenReturn(tsFileManager); + Mockito.when(tsFileManager.getTsFileListSnapshot(1L, true)) + .thenReturn(Collections.singletonList(resource)); + Mockito.when(tsFileManager.getTsFileListSnapshot(1L, false)) + .thenReturn(Collections.emptyList()); + Mockito.when(dataRegion.getWorkSequenceTsFileProcessors()) + .thenReturn(Collections.emptyList()); + Mockito.when(dataRegion.getWorkUnsequenceTsFileProcessors()) + .thenReturn(Collections.emptyList()); + + Method method = + DataRegionConsistencyManager.class.getDeclaredMethod( + "collectLogicalSeriesContexts", DataRegion.class, long.class); + method.setAccessible(true); + + @SuppressWarnings("unchecked") + Map deviceSeriesContexts = + (Map) method.invoke(consistencyManager, dataRegion, 1L); + assertDeviceSeriesContext(deviceSeriesContexts.get("root.testsg.d1"), false, "s1"); + + verificationIterator = + new MultiTsFileDeviceIterator( + Collections.singletonList(resource), Collections.emptyList()); + Assert.assertTrue(verificationIterator.hasNextDevice()); + Assert.assertEquals("root.testsg.d1", String.valueOf(verificationIterator.nextDevice().left)); + Assert.assertTrue(verificationIterator.getAllSchemasOfCurrentDevice().containsKey("s1")); + } finally { + if (resource != null) { + FileReaderManager.getInstance().closeFileAndRemoveReader(resource.getTsFileID()); + } + Files.walk(tempDir).sorted(Comparator.reverseOrder()).map(Path::toFile).forEach(File::delete); + } + } + + private String encodeBase64(String value) { + return Base64.getUrlEncoder() + .withoutPadding() + .encodeToString(value.getBytes(StandardCharsets.UTF_8)); + } + + private DataRegion createDataRegionWithEmptySnapshotInputs( + long partitionId, 
boolean includeWorkingProcessor) throws Exception { + DataRegion dataRegion = Mockito.mock(DataRegion.class); + TsFileManager tsFileManager = Mockito.mock(TsFileManager.class); + Mockito.when(dataRegion.getTsFileManager()).thenReturn(tsFileManager); + Mockito.when(tsFileManager.getTsFileListSnapshot(partitionId, true)) + .thenReturn(Collections.emptyList()); + Mockito.when(tsFileManager.getTsFileListSnapshot(partitionId, false)) + .thenReturn(Collections.emptyList()); + Mockito.when(dataRegion.getWorkUnsequenceTsFileProcessors()) + .thenReturn(Collections.emptyList()); + + if (!includeWorkingProcessor) { + Mockito.when(dataRegion.getWorkSequenceTsFileProcessors()) + .thenReturn(Collections.emptyList()); + return dataRegion; + } + + TsFileProcessor processor = Mockito.mock(TsFileProcessor.class); + IMemTable emptyMemTable = Mockito.mock(IMemTable.class); + Mockito.when(processor.getTimeRangeId()).thenReturn(partitionId); + Mockito.when(processor.tryReadLock(1_000L)).thenReturn(true); + Mockito.when(processor.getWorkMemTable()).thenReturn(emptyMemTable); + Mockito.when(processor.getFlushingMemTable()).thenReturn(new ConcurrentLinkedDeque<>()); + Mockito.doNothing().when(processor).readUnLock(); + Mockito.when(emptyMemTable.getMemTableMap()).thenReturn(Collections.emptyMap()); + Mockito.when(dataRegion.getWorkSequenceTsFileProcessors()) + .thenReturn(Collections.singletonList(processor)); + return dataRegion; + } + + private DataRegionConsistencyManager newTestConsistencyManager() throws IOException { + Path tempStateDir = Files.createTempDirectory("logical-consistency-test-state"); + tempStateDirs.add(tempStateDir); + return new DataRegionConsistencyManager( + new LogicalConsistencyPartitionStateStore(tempStateDir)); + } + + private void deleteRecursively(Path root) throws IOException { + if (root == null || !Files.exists(root)) { + return; + } + try { + Files.walk(root) + .sorted(Comparator.reverseOrder()) + .forEach( + path -> { + try { + 
Files.deleteIfExists(path); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + } + + @SuppressWarnings("unchecked") + private void assertDeviceSeriesContext( + Object deviceSeriesContext, boolean expectedAligned, String... expectedMeasurements) + throws Exception { + Assert.assertNotNull(deviceSeriesContext); + + Field alignedField = deviceSeriesContext.getClass().getDeclaredField("aligned"); + alignedField.setAccessible(true); + Assert.assertEquals(expectedAligned, alignedField.getBoolean(deviceSeriesContext)); + + Field measurementSchemasField = + deviceSeriesContext.getClass().getDeclaredField("measurementSchemas"); + measurementSchemasField.setAccessible(true); + Map measurementSchemas = + (Map) measurementSchemasField.get(deviceSeriesContext); + Assert.assertEquals(expectedMeasurements.length, measurementSchemas.size()); + for (String expectedMeasurement : expectedMeasurements) { + Assert.assertTrue(measurementSchemas.containsKey(expectedMeasurement)); + } + } + + @SuppressWarnings("unchecked") + private Object getPartitionState( + DataRegionConsistencyManager consistencyManager, + TConsensusGroupId consensusGroupId, + long partitionId) + throws Exception { + Field regionStatesField = DataRegionConsistencyManager.class.getDeclaredField("regionStates"); + regionStatesField.setAccessible(true); + ConcurrentHashMap regionStates = + (ConcurrentHashMap) regionStatesField.get(consistencyManager); + Object regionState = regionStates.get(consensusGroupId.toString()); + Assert.assertNotNull(regionState); + + Field partitionsField = regionState.getClass().getDeclaredField("partitions"); + partitionsField.setAccessible(true); + Map partitions = (Map) partitionsField.get(regionState); + Object partitionState = partitions.get(partitionId); + Assert.assertNotNull(partitionState); + return partitionState; + } + + private long getPartitionMutationEpoch( + DataRegionConsistencyManager 
consistencyManager, + TConsensusGroupId consensusGroupId, + long partitionId) + throws Exception { + Object partitionState = getPartitionState(consistencyManager, consensusGroupId, partitionId); + Field mutationEpochField = partitionState.getClass().getDeclaredField("partitionMutationEpoch"); + mutationEpochField.setAccessible(true); + return mutationEpochField.getLong(partitionState); + } + + private RepairProgressTable.SnapshotState getSnapshotState( + DataRegionConsistencyManager consistencyManager, + TConsensusGroupId consensusGroupId, + long partitionId) + throws Exception { + Object partitionState = getPartitionState(consistencyManager, consensusGroupId, partitionId); + Field snapshotStateField = partitionState.getClass().getDeclaredField("snapshotState"); + snapshotStateField.setAccessible(true); + return (RepairProgressTable.SnapshotState) snapshotStateField.get(partitionState); + } + + private static class BlockingPartitionStateStore extends LogicalConsistencyPartitionStateStore { + private final CountDownLatch persistStarted = new CountDownLatch(1); + private final CountDownLatch allowPersist = new CountDownLatch(1); + private final ConcurrentHashMap> persisted = new ConcurrentHashMap<>(); + + @Override + public void persist(String consensusGroupKey, Map mutationEpochs) + throws IOException { + persistStarted.countDown(); + try { + allowPersist.await(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted while blocking persist", e); + } + persisted.put(consensusGroupKey, new HashMap<>(mutationEpochs)); + } + + @Override + public Map load(String consensusGroupKey) { + return persisted.getOrDefault(consensusGroupKey, Collections.emptyMap()); + } + + private boolean awaitPersistStarted(long timeout, TimeUnit unit) throws InterruptedException { + return persistStarted.await(timeout, unit); + } + + private void releasePersist() { + allowPersist.countDown(); + } + } +} diff --git 
a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalRepairSessionJournalTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalRepairSessionJournalTest.java new file mode 100644 index 0000000000000..6f9fb25684af7 --- /dev/null +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/storageengine/dataregion/consistency/LogicalRepairSessionJournalTest.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.db.storageengine.dataregion.consistency; + +import org.apache.iotdb.db.conf.IoTDBDescriptor; + +import org.junit.Assert; +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Comparator; +import java.util.List; + +public class LogicalRepairSessionJournalTest { + + @Test + public void stagedBatchesShouldPersistAcrossJournalReload() throws Exception { + Path journalDir = Files.createTempDirectory("logical-repair-journal"); + try { + LogicalRepairSessionJournal journal = new LogicalRepairSessionJournal(journalDir); + String repairEpoch = "1:7:1000:2000:2000:1-1_2_3"; + + journal.stageBatch( + "DataRegion-7", + 7L, + repairEpoch, + "session-a", + "LIVE", + "leaf:1:0", + 2, + "INSERT_ROWS", + ByteBuffer.wrap(new byte[] {2})); + journal.stageBatch( + "DataRegion-7", + 7L, + repairEpoch, + "session-a", + "LIVE", + "leaf:1:0", + 0, + "RESET_LEAF", + ByteBuffer.allocate(0)); + journal.stageBatch( + "DataRegion-7", + 7L, + repairEpoch, + "session-a", + "LIVE", + "leaf:1:0", + 1, + "INSERT_ROWS", + ByteBuffer.wrap(new byte[] {1})); + + LogicalRepairSessionJournal recovered = new LogicalRepairSessionJournal(journalDir); + List recoveredBatches = + recovered.loadStagedBatches("DataRegion-7", 7L, repairEpoch, "session-a"); + + Assert.assertEquals(3, recoveredBatches.size()); + Assert.assertEquals(0, recoveredBatches.get(0).getSeqNo()); + Assert.assertEquals("RESET_LEAF", recoveredBatches.get(0).getBatchKind()); + Assert.assertEquals(1, recoveredBatches.get(1).getSeqNo()); + Assert.assertEquals(2, recoveredBatches.get(2).getSeqNo()); + Assert.assertArrayEquals( + new byte[] {1}, toByteArray(recoveredBatches.get(1).duplicatePayload())); + Assert.assertArrayEquals( + new byte[] {2}, toByteArray(recoveredBatches.get(2).duplicatePayload())); + } finally { + deleteRecursively(journalDir); + } + } + + @Test + public void duplicateSeqNoShouldBeDeduplicatedDurably() throws 
Exception { + Path journalDir = Files.createTempDirectory("logical-repair-journal"); + try { + LogicalRepairSessionJournal journal = new LogicalRepairSessionJournal(journalDir); + String repairEpoch = "1:8:1000:2000:2000:1-1_2_3"; + + journal.stageBatch( + "DataRegion-8", + 8L, + repairEpoch, + "session-b", + "TOMBSTONE", + "leaf:2:0", + 0, + "DELETE_DATA", + ByteBuffer.wrap(new byte[] {1, 2, 3})); + journal.stageBatch( + "DataRegion-8", + 8L, + repairEpoch, + "session-b", + "TOMBSTONE", + "leaf:2:0", + 0, + "DELETE_DATA", + ByteBuffer.wrap(new byte[] {9, 9, 9})); + + LogicalRepairSessionJournal recovered = new LogicalRepairSessionJournal(journalDir); + List recoveredBatches = + recovered.loadStagedBatches("DataRegion-8", 8L, repairEpoch, "session-b"); + + Assert.assertEquals(1, recoveredBatches.size()); + Assert.assertArrayEquals( + new byte[] {1, 2, 3}, toByteArray(recoveredBatches.get(0).duplicatePayload())); + } finally { + deleteRecursively(journalDir); + } + } + + @Test + public void completeShouldDeletePersistedSession() throws Exception { + Path journalDir = Files.createTempDirectory("logical-repair-journal"); + try { + LogicalRepairSessionJournal journal = new LogicalRepairSessionJournal(journalDir); + String repairEpoch = "1:9:1000:2000:2000:1-1_2_3"; + + journal.stageBatch( + "DataRegion-9", + 9L, + repairEpoch, + "session-c", + "LIVE", + "leaf:3:0", + 0, + "RESET_LEAF", + ByteBuffer.allocate(0)); + journal.completeSession("session-c"); + + LogicalRepairSessionJournal recovered = new LogicalRepairSessionJournal(journalDir); + Assert.assertTrue( + recovered.loadStagedBatches("DataRegion-9", 9L, repairEpoch, "session-c").isEmpty()); + } finally { + deleteRecursively(journalDir); + } + } + + @Test + public void defaultJournalShouldResolveSystemDirLazily() throws Exception { + String originalSystemDir = IoTDBDescriptor.getInstance().getConfig().getSystemDir(); + Path initialSystemDir = Files.createTempDirectory("logical-repair-journal-initial"); + Path 
effectiveSystemDir = Files.createTempDirectory("logical-repair-journal-effective"); + try { + IoTDBDescriptor.getInstance().getConfig().setSystemDir(initialSystemDir.toString()); + LogicalRepairSessionJournal journal = new LogicalRepairSessionJournal(); + + IoTDBDescriptor.getInstance().getConfig().setSystemDir(effectiveSystemDir.toString()); + String repairEpoch = "1:10:1000:2000:2000:1-1_2_3"; + journal.stageBatch( + "DataRegion-10", + 10L, + repairEpoch, + "session-d", + "LIVE", + "leaf:4:0", + 0, + "RESET_LEAF", + ByteBuffer.allocate(0)); + + Assert.assertTrue( + Files.exists( + effectiveSystemDir + .resolve("consistency-repair") + .resolve("sessions") + .resolve("session-d.session"))); + Assert.assertFalse( + Files.exists( + initialSystemDir + .resolve("consistency-repair") + .resolve("sessions") + .resolve("session-d.session"))); + } finally { + IoTDBDescriptor.getInstance().getConfig().setSystemDir(originalSystemDir); + deleteRecursively(initialSystemDir); + deleteRecursively(effectiveSystemDir); + } + } + + private byte[] toByteArray(ByteBuffer buffer) { + ByteBuffer duplicate = buffer.duplicate(); + byte[] bytes = new byte[duplicate.remaining()]; + duplicate.get(bytes); + return bytes; + } + + private void deleteRecursively(Path dir) throws Exception { + if (dir == null || !Files.exists(dir)) { + return; + } + try (java.util.stream.Stream stream = Files.walk(dir)) { + stream.sorted(Comparator.reverseOrder()).forEach(this::deleteSilently); + } + } + + private void deleteSilently(Path path) { + try { + Files.deleteIfExists(path); + } catch (Exception ignored) { + // best effort for test cleanup + } + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/DualDigest.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/DualDigest.java new file mode 100644 index 0000000000000..6a5b1c34e15d0 --- /dev/null +++ 
b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/DualDigest.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.consensus.iotv2.consistency; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Objects; + +/** + * Immutable dual-digest tuple combining XOR and additive hashing to eliminate the cancellation + * vulnerability of single-XOR aggregation. For two datasets to produce the same dual-digest while + * being different, they must satisfy BOTH XOR(S1)==XOR(S2) AND SUM(S1)==SUM(S2) mod 2^64 + * simultaneously, giving a false-negative probability of ~2^(-128). 
+ */ +public final class DualDigest { + + public static final DualDigest ZERO = new DualDigest(0L, 0L); + + private final long xorHash; + private final long additiveHash; + + public DualDigest(long xorHash, long additiveHash) { + this.xorHash = xorHash; + this.additiveHash = additiveHash; + } + + public static DualDigest fromSingleHash(long hash) { + return new DualDigest(hash, hash); + } + + public DualDigest xorIn(long childHash) { + return new DualDigest(this.xorHash ^ childHash, this.additiveHash + childHash); + } + + public DualDigest xorOut(long childHash) { + return new DualDigest(this.xorHash ^ childHash, this.additiveHash - childHash); + } + + public DualDigest merge(DualDigest other) { + return new DualDigest(this.xorHash ^ other.xorHash, this.additiveHash + other.additiveHash); + } + + public DualDigest subtract(DualDigest other) { + return new DualDigest(this.xorHash ^ other.xorHash, this.additiveHash - other.additiveHash); + } + + public boolean matches(DualDigest other) { + return this.xorHash == other.xorHash && this.additiveHash == other.additiveHash; + } + + public long getXorHash() { + return xorHash; + } + + public long getAdditiveHash() { + return additiveHash; + } + + public void serialize(DataOutputStream stream) throws IOException { + stream.writeLong(xorHash); + stream.writeLong(additiveHash); + } + + public static DualDigest deserialize(ByteBuffer buffer) { + long xor = buffer.getLong(); + long add = buffer.getLong(); + return new DualDigest(xor, add); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof DualDigest)) return false; + DualDigest that = (DualDigest) o; + return xorHash == that.xorHash && additiveHash == that.additiveHash; + } + + @Override + public int hashCode() { + return Objects.hash(xorHash, additiveHash); + } + + @Override + public String toString() { + return String.format("DualDigest{xor=0x%016X, add=0x%016X}", xorHash, additiveHash); + } +} diff --git 
a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/LogicalMismatchScope.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/LogicalMismatchScope.java new file mode 100644 index 0000000000000..f9f4c8921caef --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/LogicalMismatchScope.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.consensus.iotv2.consistency; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Durable partition-scoped mismatch description produced by logical snapshot compare. + * + *

The persisted string keeps backward compatibility with the legacy {@code treeKind@leafId} + * format while allowing the new logical-snapshot workflow to optionally carry a narrowed key range + * for each mismatched leaf. + */ +public final class LogicalMismatchScope { + + private static final String ENTRY_SEPARATOR = ","; + private static final String TOKEN_SEPARATOR = "@"; + + private LogicalMismatchScope() {} + + public enum RepairDirective { + REPAIRABLE, + FOLLOWER_EXTRA_TOMBSTONE + } + + public static String serialize(Collection scopes) { + if (scopes == null || scopes.isEmpty()) { + return null; + } + return scopes.stream() + .filter(Objects::nonNull) + .distinct() + .sorted(Comparator.comparing(Scope::toPersistentString)) + .map(Scope::toPersistentString) + .collect(Collectors.joining(ENTRY_SEPARATOR)); + } + + public static List deserialize(String serialized) { + if (serialized == null || serialized.trim().isEmpty()) { + return Collections.emptyList(); + } + + LinkedHashSet scopes = new LinkedHashSet<>(); + for (String rawEntry : serialized.split(ENTRY_SEPARATOR)) { + String entry = rawEntry.trim(); + if (entry.isEmpty()) { + continue; + } + String[] tokens = entry.split(TOKEN_SEPARATOR, -1); + if (tokens.length < 2) { + continue; + } + String treeKind = tokens[0]; + String leafId = tokens[1]; + String keyRangeStart = tokens.length >= 3 ? decodeNullable(tokens[2]) : null; + String keyRangeEnd = tokens.length >= 4 ? decodeNullable(tokens[3]) : null; + List exactKeys = + tokens.length >= 5 ? decodeStringList(tokens[4]) : Collections.emptyList(); + RepairDirective repairDirective = + tokens.length >= 6 ? 
decodeRepairDirective(tokens[5]) : RepairDirective.REPAIRABLE; + scopes.add( + new Scope(treeKind, leafId, keyRangeStart, keyRangeEnd, exactKeys, repairDirective)); + } + return new ArrayList<>(scopes); + } + + private static String encodeNullable(String value) { + if (value == null) { + return ""; + } + return Base64.getUrlEncoder() + .withoutPadding() + .encodeToString(value.getBytes(StandardCharsets.UTF_8)); + } + + private static String decodeNullable(String value) { + if (value == null || value.isEmpty()) { + return null; + } + return new String(Base64.getUrlDecoder().decode(value), StandardCharsets.UTF_8); + } + + private static String encodeStringList(List values) { + if (values == null || values.isEmpty()) { + return ""; + } + return encodeNullable(String.join("\n", values)); + } + + private static List decodeStringList(String value) { + String decoded = decodeNullable(value); + if (decoded == null || decoded.isEmpty()) { + return Collections.emptyList(); + } + LinkedHashSet result = new LinkedHashSet<>(); + for (String key : decoded.split("\n")) { + if (!key.isEmpty()) { + result.add(key); + } + } + return new ArrayList<>(result); + } + + private static RepairDirective decodeRepairDirective(String value) { + String decoded = decodeNullable(value); + if (decoded == null || decoded.isEmpty()) { + return RepairDirective.REPAIRABLE; + } + try { + return RepairDirective.valueOf(decoded); + } catch (IllegalArgumentException ignored) { + return RepairDirective.REPAIRABLE; + } + } + + public static final class Scope { + private final String treeKind; + private final String leafId; + private final String keyRangeStart; + private final String keyRangeEnd; + private final List exactKeys; + private final RepairDirective repairDirective; + + public Scope(String treeKind, String leafId) { + this(treeKind, leafId, null, null, Collections.emptyList(), RepairDirective.REPAIRABLE); + } + + public Scope(String treeKind, String leafId, String keyRangeStart, String 
keyRangeEnd) { + this( + treeKind, + leafId, + keyRangeStart, + keyRangeEnd, + Collections.emptyList(), + RepairDirective.REPAIRABLE); + } + + public Scope( + String treeKind, + String leafId, + String keyRangeStart, + String keyRangeEnd, + List exactKeys) { + this(treeKind, leafId, keyRangeStart, keyRangeEnd, exactKeys, RepairDirective.REPAIRABLE); + } + + public Scope( + String treeKind, + String leafId, + String keyRangeStart, + String keyRangeEnd, + List exactKeys, + RepairDirective repairDirective) { + this.treeKind = treeKind; + this.leafId = leafId; + this.keyRangeStart = keyRangeStart; + this.keyRangeEnd = keyRangeEnd; + this.exactKeys = + exactKeys == null + ? Collections.emptyList() + : Collections.unmodifiableList( + exactKeys.stream() + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList())); + this.repairDirective = repairDirective == null ? RepairDirective.REPAIRABLE : repairDirective; + } + + public String getTreeKind() { + return treeKind; + } + + public String getLeafId() { + return leafId; + } + + public String getKeyRangeStart() { + return keyRangeStart; + } + + public String getKeyRangeEnd() { + return keyRangeEnd; + } + + public List getExactKeys() { + return exactKeys; + } + + public RepairDirective getRepairDirective() { + return repairDirective; + } + + public boolean isRepairable() { + return repairDirective == RepairDirective.REPAIRABLE; + } + + public String toPersistentString() { + if (keyRangeStart == null + && keyRangeEnd == null + && exactKeys.isEmpty() + && repairDirective == RepairDirective.REPAIRABLE) { + return treeKind + TOKEN_SEPARATOR + leafId; + } + String base = + treeKind + + TOKEN_SEPARATOR + + leafId + + TOKEN_SEPARATOR + + encodeNullable(keyRangeStart) + + TOKEN_SEPARATOR + + encodeNullable(keyRangeEnd) + + TOKEN_SEPARATOR + + encodeStringList(exactKeys); + if (repairDirective == RepairDirective.REPAIRABLE) { + return base; + } + return base + TOKEN_SEPARATOR + encodeNullable(repairDirective.name()); + } 
+ + @Override + public String toString() { + return toPersistentString(); + } + + @Override + public boolean equals(Object object) { + if (this == object) { + return true; + } + if (!(object instanceof Scope)) { + return false; + } + Scope that = (Scope) object; + return Objects.equals(treeKind, that.treeKind) + && Objects.equals(leafId, that.leafId) + && Objects.equals(keyRangeStart, that.keyRangeStart) + && Objects.equals(keyRangeEnd, that.keyRangeEnd) + && Objects.equals(exactKeys, that.exactKeys) + && repairDirective == that.repairDirective; + } + + @Override + public int hashCode() { + return Objects.hash(treeKind, leafId, keyRangeStart, keyRangeEnd, exactKeys, repairDirective); + } + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/RepairProgressTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/RepairProgressTable.java new file mode 100644 index 0000000000000..3f40c0f1aaa5f --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/consensus/iotv2/consistency/RepairProgressTable.java @@ -0,0 +1,753 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iotdb.commons.consensus.iotv2.consistency; + +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; + +/** + * In-memory representation of the partition-scoped consistency check / repair progress for a single + * consensus group. + */ +public class RepairProgressTable { + + public enum CheckState { + PENDING, + DIRTY, + VERIFIED, + MISMATCH, + FAILED + } + + public enum RepairState { + IDLE, + PENDING, + RUNNING, + SUCCEEDED, + FAILED + } + + public enum SnapshotState { + PENDING, + BUILDING, + READY, + DIRTY, + FAILED + } + + public static class PartitionProgress { + private final long partitionId; + private volatile CheckState checkState; + private volatile RepairState repairState; + private volatile long lastCheckedAt; + private volatile long lastSafeWatermark; + private volatile long partitionMutationEpoch; + private volatile long snapshotEpoch; + private volatile SnapshotState snapshotState; + private volatile long lastMismatchAt; + private volatile String mismatchScopeRef; + private volatile int mismatchLeafCount; + private volatile String repairEpoch; + private volatile String replicaObservationToken; + private volatile String lastErrorCode; + private volatile String lastErrorMessage; + + public PartitionProgress(long partitionId) { + this.partitionId = partitionId; + this.checkState = CheckState.PENDING; + this.repairState = RepairState.IDLE; + this.lastCheckedAt = 0L; + this.lastSafeWatermark = Long.MIN_VALUE; + this.partitionMutationEpoch = Long.MIN_VALUE; + this.snapshotEpoch = Long.MIN_VALUE; + this.snapshotState = SnapshotState.PENDING; + this.lastMismatchAt = 0L; + this.mismatchScopeRef = null; + this.mismatchLeafCount = 0; + 
this.repairEpoch = null; + this.replicaObservationToken = null; + this.lastErrorCode = null; + this.lastErrorMessage = null; + } + + public long getPartitionId() { + return partitionId; + } + + public CheckState getCheckState() { + return checkState; + } + + public RepairState getRepairState() { + return repairState; + } + + public long getLastCheckedAt() { + return lastCheckedAt; + } + + public long getLastSafeWatermark() { + return lastSafeWatermark; + } + + public long getPartitionMutationEpoch() { + return partitionMutationEpoch; + } + + public long getSnapshotEpoch() { + return snapshotEpoch; + } + + public SnapshotState getSnapshotState() { + return snapshotState; + } + + public long getLastMismatchAt() { + return lastMismatchAt; + } + + public String getMismatchScopeRef() { + return mismatchScopeRef; + } + + public int getMismatchLeafCount() { + return mismatchLeafCount; + } + + public String getRepairEpoch() { + return repairEpoch; + } + + public String getReplicaObservationToken() { + return replicaObservationToken; + } + + public String getLastErrorCode() { + return lastErrorCode; + } + + public String getLastErrorMessage() { + return lastErrorMessage; + } + + public boolean shouldCheck( + long candidatePartitionMutationEpoch, + long candidateSnapshotEpoch, + SnapshotState candidateSnapshotState) { + return shouldCheck( + candidatePartitionMutationEpoch, candidateSnapshotEpoch, candidateSnapshotState, null); + } + + public boolean shouldCheck( + long candidatePartitionMutationEpoch, + long candidateSnapshotEpoch, + SnapshotState candidateSnapshotState, + String candidateReplicaObservationToken) { + return checkState != CheckState.VERIFIED + || partitionMutationEpoch != candidatePartitionMutationEpoch + || snapshotEpoch != candidateSnapshotEpoch + || snapshotState != candidateSnapshotState + || !Objects.equals(replicaObservationToken, candidateReplicaObservationToken); + } + + public void markVerified( + long checkedAt, + long safeWatermark, + long 
newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState) { + markVerified( + checkedAt, + safeWatermark, + newPartitionMutationEpoch, + newSnapshotEpoch, + newSnapshotState, + null); + } + + public void markVerified( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState, + String newReplicaObservationToken) { + checkState = CheckState.VERIFIED; + lastCheckedAt = checkedAt; + lastSafeWatermark = safeWatermark; + partitionMutationEpoch = newPartitionMutationEpoch; + snapshotEpoch = newSnapshotEpoch; + snapshotState = newSnapshotState; + replicaObservationToken = newReplicaObservationToken; + mismatchScopeRef = null; + mismatchLeafCount = 0; + clearError(); + if (repairState == RepairState.RUNNING + || repairState == RepairState.PENDING + || repairState == RepairState.FAILED) { + repairState = RepairState.SUCCEEDED; + } + } + + public void markMismatch( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState, + String newMismatchScopeRef, + int newMismatchLeafCount, + String newRepairEpoch) { + markMismatch( + checkedAt, + safeWatermark, + newPartitionMutationEpoch, + newSnapshotEpoch, + newSnapshotState, + newMismatchScopeRef, + newMismatchLeafCount, + newRepairEpoch, + null); + } + + public void markMismatch( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState, + String newMismatchScopeRef, + int newMismatchLeafCount, + String newRepairEpoch, + String newReplicaObservationToken) { + checkState = CheckState.MISMATCH; + repairState = RepairState.PENDING; + lastCheckedAt = checkedAt; + lastSafeWatermark = safeWatermark; + partitionMutationEpoch = newPartitionMutationEpoch; + snapshotEpoch = newSnapshotEpoch; + snapshotState = newSnapshotState; + lastMismatchAt = checkedAt; + mismatchScopeRef = 
newMismatchScopeRef; + mismatchLeafCount = newMismatchLeafCount; + repairEpoch = newRepairEpoch; + replicaObservationToken = newReplicaObservationToken; + clearError(); + } + + public void markCheckFailed( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState, + String errorCode, + String errorMessage) { + markCheckFailed( + checkedAt, + safeWatermark, + newPartitionMutationEpoch, + newSnapshotEpoch, + newSnapshotState, + errorCode, + errorMessage, + null); + } + + public void markCheckFailed( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState, + String errorCode, + String errorMessage, + String newReplicaObservationToken) { + checkState = CheckState.FAILED; + lastCheckedAt = checkedAt; + lastSafeWatermark = safeWatermark; + partitionMutationEpoch = newPartitionMutationEpoch; + snapshotEpoch = newSnapshotEpoch; + snapshotState = newSnapshotState; + replicaObservationToken = newReplicaObservationToken; + lastErrorCode = errorCode; + lastErrorMessage = errorMessage; + } + + public void markDirty() { + checkState = CheckState.DIRTY; + snapshotState = SnapshotState.DIRTY; + mismatchScopeRef = null; + mismatchLeafCount = 0; + if (repairState == RepairState.SUCCEEDED) { + repairState = RepairState.IDLE; + } + } + + public void markRepairRunning(String newRepairEpoch) { + repairState = RepairState.RUNNING; + repairEpoch = newRepairEpoch; + clearError(); + } + + public void markRepairSucceeded( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long newSnapshotEpoch, + SnapshotState newSnapshotState, + String newRepairEpoch) { + markRepairSucceeded( + checkedAt, + safeWatermark, + newPartitionMutationEpoch, + newSnapshotEpoch, + newSnapshotState, + newRepairEpoch, + null); + } + + public void markRepairSucceeded( + long checkedAt, + long safeWatermark, + long newPartitionMutationEpoch, + long 
newSnapshotEpoch, + SnapshotState newSnapshotState, + String newRepairEpoch, + String newReplicaObservationToken) { + repairState = RepairState.SUCCEEDED; + repairEpoch = newRepairEpoch; + markVerified( + checkedAt, + safeWatermark, + newPartitionMutationEpoch, + newSnapshotEpoch, + newSnapshotState, + newReplicaObservationToken); + repairState = RepairState.SUCCEEDED; + } + + public void markRepairFailed(String newRepairEpoch, String errorCode, String errorMessage) { + repairState = RepairState.FAILED; + repairEpoch = newRepairEpoch; + lastErrorCode = errorCode; + lastErrorMessage = errorMessage; + } + + public PartitionProgress copy() { + PartitionProgress copy = new PartitionProgress(partitionId); + copy.checkState = checkState; + copy.repairState = repairState; + copy.lastCheckedAt = lastCheckedAt; + copy.lastSafeWatermark = lastSafeWatermark; + copy.partitionMutationEpoch = partitionMutationEpoch; + copy.snapshotEpoch = snapshotEpoch; + copy.snapshotState = snapshotState; + copy.lastMismatchAt = lastMismatchAt; + copy.mismatchScopeRef = mismatchScopeRef; + copy.mismatchLeafCount = mismatchLeafCount; + copy.repairEpoch = repairEpoch; + copy.replicaObservationToken = replicaObservationToken; + copy.lastErrorCode = lastErrorCode; + copy.lastErrorMessage = lastErrorMessage; + return copy; + } + + private void clearError() { + lastErrorCode = null; + lastErrorMessage = null; + } + + private void serialize(OutputStream outputStream) throws IOException { + ReadWriteIOUtils.write(partitionId, outputStream); + ReadWriteIOUtils.write(checkState.ordinal(), outputStream); + ReadWriteIOUtils.write(repairState.ordinal(), outputStream); + ReadWriteIOUtils.write(lastCheckedAt, outputStream); + ReadWriteIOUtils.write(lastSafeWatermark, outputStream); + ReadWriteIOUtils.write(partitionMutationEpoch, outputStream); + ReadWriteIOUtils.write(snapshotEpoch, outputStream); + ReadWriteIOUtils.write(snapshotState.ordinal(), outputStream); + ReadWriteIOUtils.write(lastMismatchAt, 
outputStream); + ReadWriteIOUtils.write(mismatchScopeRef, outputStream); + ReadWriteIOUtils.write(mismatchLeafCount, outputStream); + ReadWriteIOUtils.write(repairEpoch, outputStream); + ReadWriteIOUtils.write(replicaObservationToken, outputStream); + ReadWriteIOUtils.write(lastErrorCode, outputStream); + ReadWriteIOUtils.write(lastErrorMessage, outputStream); + } + + private static PartitionProgress deserialize(InputStream inputStream) throws IOException { + PartitionProgress progress = new PartitionProgress(ReadWriteIOUtils.readLong(inputStream)); + progress.checkState = CheckState.values()[ReadWriteIOUtils.readInt(inputStream)]; + progress.repairState = RepairState.values()[ReadWriteIOUtils.readInt(inputStream)]; + progress.lastCheckedAt = ReadWriteIOUtils.readLong(inputStream); + progress.lastSafeWatermark = ReadWriteIOUtils.readLong(inputStream); + progress.partitionMutationEpoch = ReadWriteIOUtils.readLong(inputStream); + progress.snapshotEpoch = ReadWriteIOUtils.readLong(inputStream); + progress.snapshotState = SnapshotState.values()[ReadWriteIOUtils.readInt(inputStream)]; + progress.lastMismatchAt = ReadWriteIOUtils.readLong(inputStream); + progress.mismatchScopeRef = ReadWriteIOUtils.readString(inputStream); + progress.mismatchLeafCount = ReadWriteIOUtils.readInt(inputStream); + progress.repairEpoch = ReadWriteIOUtils.readString(inputStream); + progress.replicaObservationToken = ReadWriteIOUtils.readString(inputStream); + progress.lastErrorCode = ReadWriteIOUtils.readString(inputStream); + progress.lastErrorMessage = ReadWriteIOUtils.readString(inputStream); + return progress; + } + + private static PartitionProgress deserialize(ByteBuffer byteBuffer) { + PartitionProgress progress = new PartitionProgress(ReadWriteIOUtils.readLong(byteBuffer)); + progress.checkState = CheckState.values()[ReadWriteIOUtils.readInt(byteBuffer)]; + progress.repairState = RepairState.values()[ReadWriteIOUtils.readInt(byteBuffer)]; + progress.lastCheckedAt = 
ReadWriteIOUtils.readLong(byteBuffer); + progress.lastSafeWatermark = ReadWriteIOUtils.readLong(byteBuffer); + progress.partitionMutationEpoch = ReadWriteIOUtils.readLong(byteBuffer); + progress.snapshotEpoch = ReadWriteIOUtils.readLong(byteBuffer); + progress.snapshotState = SnapshotState.values()[ReadWriteIOUtils.readInt(byteBuffer)]; + progress.lastMismatchAt = ReadWriteIOUtils.readLong(byteBuffer); + progress.mismatchScopeRef = ReadWriteIOUtils.readString(byteBuffer); + progress.mismatchLeafCount = ReadWriteIOUtils.readInt(byteBuffer); + progress.repairEpoch = ReadWriteIOUtils.readString(byteBuffer); + progress.replicaObservationToken = ReadWriteIOUtils.readString(byteBuffer); + progress.lastErrorCode = ReadWriteIOUtils.readString(byteBuffer); + progress.lastErrorMessage = ReadWriteIOUtils.readString(byteBuffer); + return progress; + } + + @Override + public boolean equals(Object object) { + if (this == object) { + return true; + } + if (!(object instanceof PartitionProgress)) { + return false; + } + PartitionProgress that = (PartitionProgress) object; + return partitionId == that.partitionId + && lastCheckedAt == that.lastCheckedAt + && lastSafeWatermark == that.lastSafeWatermark + && partitionMutationEpoch == that.partitionMutationEpoch + && snapshotEpoch == that.snapshotEpoch + && lastMismatchAt == that.lastMismatchAt + && mismatchLeafCount == that.mismatchLeafCount + && checkState == that.checkState + && repairState == that.repairState + && snapshotState == that.snapshotState + && Objects.equals(mismatchScopeRef, that.mismatchScopeRef) + && Objects.equals(repairEpoch, that.repairEpoch) + && Objects.equals(replicaObservationToken, that.replicaObservationToken) + && Objects.equals(lastErrorCode, that.lastErrorCode) + && Objects.equals(lastErrorMessage, that.lastErrorMessage); + } + + @Override + public int hashCode() { + return Objects.hash( + partitionId, + checkState, + repairState, + lastCheckedAt, + lastSafeWatermark, + partitionMutationEpoch, + 
snapshotEpoch, + snapshotState, + lastMismatchAt, + mismatchScopeRef, + mismatchLeafCount, + repairEpoch, + replicaObservationToken, + lastErrorCode, + lastErrorMessage); + } + } + + private final String consensusGroupId; + private final ConcurrentHashMap partitionProgress; + + public RepairProgressTable(String consensusGroupId) { + this.consensusGroupId = consensusGroupId; + this.partitionProgress = new ConcurrentHashMap<>(); + } + + public String getConsensusGroupId() { + return consensusGroupId; + } + + public PartitionProgress getOrCreatePartition(long partitionId) { + return partitionProgress.computeIfAbsent(partitionId, PartitionProgress::new); + } + + public PartitionProgress getPartition(long partitionId) { + return partitionProgress.get(partitionId); + } + + public void markVerified( + long partitionId, + long checkedAt, + long safeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState) { + markVerified( + partitionId, + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + null); + } + + public void markVerified( + long partitionId, + long checkedAt, + long safeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String replicaObservationToken) { + getOrCreatePartition(partitionId) + .markVerified( + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + replicaObservationToken); + } + + public void markMismatch( + long partitionId, + long checkedAt, + long safeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String mismatchScopeRef, + int mismatchLeafCount, + String repairEpoch) { + markMismatch( + partitionId, + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + mismatchScopeRef, + mismatchLeafCount, + repairEpoch, + null); + } + + public void markMismatch( + long partitionId, + long checkedAt, + long safeWatermark, 
+ long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String mismatchScopeRef, + int mismatchLeafCount, + String repairEpoch, + String replicaObservationToken) { + getOrCreatePartition(partitionId) + .markMismatch( + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + mismatchScopeRef, + mismatchLeafCount, + repairEpoch, + replicaObservationToken); + } + + public void markCheckFailed( + long partitionId, + long checkedAt, + long safeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String errorCode, + String errorMessage) { + markCheckFailed( + partitionId, + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + errorCode, + errorMessage, + null); + } + + public void markCheckFailed( + long partitionId, + long checkedAt, + long safeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String errorCode, + String errorMessage, + String replicaObservationToken) { + getOrCreatePartition(partitionId) + .markCheckFailed( + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + errorCode, + errorMessage, + replicaObservationToken); + } + + public void markDirty(long partitionId) { + getOrCreatePartition(partitionId).markDirty(); + } + + public void markRepairRunning(long partitionId, String repairEpoch) { + getOrCreatePartition(partitionId).markRepairRunning(repairEpoch); + } + + public void markRepairSucceeded( + long partitionId, + long checkedAt, + long safeWatermark, + long partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String repairEpoch) { + markRepairSucceeded( + partitionId, + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + repairEpoch, + null); + } + + public void markRepairSucceeded( + long partitionId, + long checkedAt, + long safeWatermark, + long 
partitionMutationEpoch, + long snapshotEpoch, + SnapshotState snapshotState, + String repairEpoch, + String replicaObservationToken) { + getOrCreatePartition(partitionId) + .markRepairSucceeded( + checkedAt, + safeWatermark, + partitionMutationEpoch, + snapshotEpoch, + snapshotState, + repairEpoch, + replicaObservationToken); + } + + public void markRepairFailed( + long partitionId, String repairEpoch, String errorCode, String errorMessage) { + getOrCreatePartition(partitionId).markRepairFailed(repairEpoch, errorCode, errorMessage); + } + + public List getAllPartitions() { + List result = new ArrayList<>(); + for (PartitionProgress progress : partitionProgress.values()) { + result.add(progress.copy()); + } + result.sort((left, right) -> Long.compare(left.getPartitionId(), right.getPartitionId())); + return Collections.unmodifiableList(result); + } + + public Map getAllPartitionProgress() { + return Collections.unmodifiableMap(partitionProgress); + } + + public RepairProgressTable copy() { + RepairProgressTable copy = new RepairProgressTable(consensusGroupId); + partitionProgress.forEach( + (partitionId, progress) -> copy.partitionProgress.put(partitionId, progress.copy())); + return copy; + } + + public void serialize(OutputStream outputStream) throws IOException { + ReadWriteIOUtils.write(consensusGroupId, outputStream); + ReadWriteIOUtils.write(partitionProgress.size(), outputStream); + for (PartitionProgress progress : getAllPartitions()) { + progress.serialize(outputStream); + } + } + + public static RepairProgressTable deserialize(InputStream inputStream) throws IOException { + RepairProgressTable table = new RepairProgressTable(ReadWriteIOUtils.readString(inputStream)); + int size = ReadWriteIOUtils.readInt(inputStream); + for (int i = 0; i < size; i++) { + PartitionProgress progress = PartitionProgress.deserialize(inputStream); + table.partitionProgress.put(progress.getPartitionId(), progress); + } + return table; + } + + public static RepairProgressTable 
deserialize(ByteBuffer byteBuffer) { + RepairProgressTable table = new RepairProgressTable(ReadWriteIOUtils.readString(byteBuffer)); + int size = ReadWriteIOUtils.readInt(byteBuffer); + for (int i = 0; i < size; i++) { + PartitionProgress progress = PartitionProgress.deserialize(byteBuffer); + table.partitionProgress.put(progress.getPartitionId(), progress); + } + return table; + } + + @Override + public boolean equals(Object object) { + if (this == object) { + return true; + } + if (!(object instanceof RepairProgressTable)) { + return false; + } + RepairProgressTable that = (RepairProgressTable) object; + return Objects.equals(consensusGroupId, that.consensusGroupId) + && Objects.equals(getAllPartitions(), that.getAllPartitions()); + } + + @Override + public int hashCode() { + return Objects.hash(consensusGroupId, getAllPartitions()); + } +} diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/column/ColumnHeaderConstant.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/column/ColumnHeaderConstant.java index 7f6fedfc224f7..e8a1d1a3a9cbc 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/column/ColumnHeaderConstant.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/column/ColumnHeaderConstant.java @@ -247,6 +247,20 @@ private ColumnHeaderConstant() { public static final String REGION_ID_TABLE_MODEL = "region_id"; public static final String DATANODE_ID_TABLE_MODEL = "datanode_id"; + public static final String TIME_PARTITION_TABLE_MODEL = "time_partition"; + public static final String CHECK_STATE_TABLE_MODEL = "check_state"; + public static final String REPAIR_STATE_TABLE_MODEL = "repair_state"; + public static final String LAST_CHECKED_AT_TABLE_MODEL = "last_checked_at"; + public static final String LAST_SAFE_WATERMARK_TABLE_MODEL = "last_safe_watermark"; + public static final String PARTITION_MUTATION_EPOCH_TABLE_MODEL = "partition_mutation_epoch"; 
+ public static final String SNAPSHOT_EPOCH_TABLE_MODEL = "snapshot_epoch"; + public static final String SNAPSHOT_STATE_TABLE_MODEL = "snapshot_state"; + public static final String LAST_MISMATCH_AT_TABLE_MODEL = "last_mismatch_at"; + public static final String MISMATCH_SCOPE_REF_TABLE_MODEL = "mismatch_scope_ref"; + public static final String MISMATCH_LEAF_COUNT_TABLE_MODEL = "mismatch_leaf_count"; + public static final String REPAIR_EPOCH_TABLE_MODEL = "repair_epoch"; + public static final String LAST_ERROR_CODE_TABLE_MODEL = "last_error_code"; + public static final String LAST_ERROR_MESSAGE_TABLE_MODEL = "last_error_message"; public static final String SERIES_SLOT_NUM_TABLE_MODEL = "series_slot_num"; public static final String TIME_SLOT_NUM_TABLE_MODEL = "time_slot_num"; public static final String RPC_ADDRESS_TABLE_MODEL = "rpc_address"; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/InformationSchema.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/InformationSchema.java index ac6c8ebab24f2..787ebe75a4988 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/InformationSchema.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/schema/table/InformationSchema.java @@ -38,6 +38,7 @@ public class InformationSchema { public static final String TABLES = "tables"; public static final String COLUMNS = "columns"; public static final String REGIONS = "regions"; + public static final String REPAIR_PROGRESS = "repair_progress"; public static final String PIPES = "pipes"; public static final String PIPE_PLUGINS = "pipe_plugins"; public static final String TOPICS = "topics"; @@ -177,6 +178,51 @@ public class InformationSchema { ColumnHeaderConstant.COMPRESSION_RATIO_TABLE_MODEL, TSDataType.DOUBLE)); schemaTables.put(REGIONS, regionTable); + final TsTable repairProgressTable = new TsTable(REPAIR_PROGRESS); + repairProgressTable.addColumnSchema( + 
new TagColumnSchema(ColumnHeaderConstant.REGION_ID_TABLE_MODEL, TSDataType.INT32)); + repairProgressTable.addColumnSchema( + new TagColumnSchema(ColumnHeaderConstant.TIME_PARTITION_TABLE_MODEL, TSDataType.INT64)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema(ColumnHeaderConstant.CHECK_STATE_TABLE_MODEL, TSDataType.STRING)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.REPAIR_STATE_TABLE_MODEL, TSDataType.STRING)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.LAST_CHECKED_AT_TABLE_MODEL, TSDataType.TIMESTAMP)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.LAST_SAFE_WATERMARK_TABLE_MODEL, TSDataType.INT64)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.PARTITION_MUTATION_EPOCH_TABLE_MODEL, TSDataType.INT64)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.SNAPSHOT_EPOCH_TABLE_MODEL, TSDataType.INT64)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.SNAPSHOT_STATE_TABLE_MODEL, TSDataType.STRING)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.LAST_MISMATCH_AT_TABLE_MODEL, TSDataType.TIMESTAMP)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.MISMATCH_SCOPE_REF_TABLE_MODEL, TSDataType.STRING)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.MISMATCH_LEAF_COUNT_TABLE_MODEL, TSDataType.INT32)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.REPAIR_EPOCH_TABLE_MODEL, TSDataType.STRING)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + ColumnHeaderConstant.LAST_ERROR_CODE_TABLE_MODEL, TSDataType.STRING)); + repairProgressTable.addColumnSchema( + new AttributeColumnSchema( + 
ColumnHeaderConstant.LAST_ERROR_MESSAGE_TABLE_MODEL, TSDataType.STRING)); + schemaTables.put(REPAIR_PROGRESS, repairProgressTable); + final TsTable pipeTable = new TsTable(PIPES); pipeTable.addColumnSchema( new TagColumnSchema( diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/consensus/iotv2/consistency/LogicalMismatchScopeTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/consensus/iotv2/consistency/LogicalMismatchScopeTest.java new file mode 100644 index 0000000000000..0ac65a1d3bcf3 --- /dev/null +++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/consensus/iotv2/consistency/LogicalMismatchScopeTest.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.iotdb.commons.consensus.iotv2.consistency;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class LogicalMismatchScopeTest {
+
+  @Test
+  public void shouldRoundTripLegacyLeafScope() {
+    // Legacy two-field form: "<kind>@<leafId>" entries joined by ','.
+    List<LogicalMismatchScope.Scope> scopes =
+        LogicalMismatchScope.deserialize("LIVE@leaf:1:0,TOMBSTONE@leaf:2:0");
+
+    Assert.assertEquals(
+        Arrays.asList(
+            new LogicalMismatchScope.Scope("LIVE", "leaf:1:0"),
+            new LogicalMismatchScope.Scope("TOMBSTONE", "leaf:2:0")),
+        scopes);
+    Assert.assertEquals("LIVE@leaf:1:0,TOMBSTONE@leaf:2:0", LogicalMismatchScope.serialize(scopes));
+  }
+
+  @Test
+  public void shouldPersistOptionalLogicalKeyRange() {
+    // Scope carrying an optional [start, end] logical key range must survive a ser/de round trip.
+    List<LogicalMismatchScope.Scope> scopes =
+        Collections.singletonList(
+            new LogicalMismatchScope.Scope(
+                "LIVE", "leaf:3:0", "root.db.d1|s1|1|INT64|1", "root.db.d1|s1|9|INT64|9"));
+
+    String serialized = LogicalMismatchScope.serialize(scopes);
+    List<LogicalMismatchScope.Scope> recovered = LogicalMismatchScope.deserialize(serialized);
+
+    Assert.assertEquals(scopes, recovered);
+  }
+
+  @Test
+  public void shouldPersistExactLogicalKeysForMicroRepair() {
+    // Exact per-key list (micro-repair granularity) must also round trip unchanged.
+    List<LogicalMismatchScope.Scope> scopes =
+        Collections.singletonList(
+            new LogicalMismatchScope.Scope(
+                "LIVE",
+                "leaf:7:0",
+                "root.db.d1|s1|1|INT64|1",
+                "root.db.d1|s1|5|INT64|5",
+                Arrays.asList(
+                    "root.db.d1|s1|1|INT64|1",
+                    "root.db.d1|s2|2|INT64|2",
+                    "root.db.d2|s1|5|INT64|5")));
+
+    String serialized = LogicalMismatchScope.serialize(scopes);
+    List<LogicalMismatchScope.Scope> recovered = LogicalMismatchScope.deserialize(serialized);
+
+    Assert.assertEquals(scopes, recovered);
+  }
+
+  @Test
+  public void shouldPersistNonRepairableDirective() {
+    // A FOLLOWER_EXTRA_TOMBSTONE directive marks the scope non-repairable; both the directive
+    // and the repairability flag must survive serialization.
+    List<LogicalMismatchScope.Scope> scopes =
+        Collections.singletonList(
+            new LogicalMismatchScope.Scope(
+                "TOMBSTONE",
+                "leaf:9:0",
+                "root.db.d1|s1|1|INT64|1",
+                "root.db.d1|s1|5|INT64|5",
+                Collections.singletonList("root.db.d1|s1|1|INT64|1"),
+                LogicalMismatchScope.RepairDirective.FOLLOWER_EXTRA_TOMBSTONE));
+
+    String serialized = LogicalMismatchScope.serialize(scopes);
+    List<LogicalMismatchScope.Scope> recovered = LogicalMismatchScope.deserialize(serialized);
+
+    Assert.assertEquals(scopes, recovered);
+    Assert.assertFalse(recovered.get(0).isRepairable());
+    Assert.assertEquals(
+        LogicalMismatchScope.RepairDirective.FOLLOWER_EXTRA_TOMBSTONE,
+        recovered.get(0).getRepairDirective());
+  }
+}
diff --git a/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/consensus/iotv2/consistency/RepairProgressTableTest.java b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/consensus/iotv2/consistency/RepairProgressTableTest.java
new file mode 100644
index 0000000000000..b4b6c9789f8a7
--- /dev/null
+++ b/iotdb-core/node-commons/src/test/java/org/apache/iotdb/commons/consensus/iotv2/consistency/RepairProgressTableTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.commons.consensus.iotv2.consistency;
+
+import org.apache.tsfile.utils.PublicBAOS;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.DataOutputStream;
+import java.nio.ByteBuffer;
+
+public class RepairProgressTableTest {
+
+  @Test
+  public void verifiedPartitionWithSameSnapshotShouldBePrunedFromNextCheck() {
+    RepairProgressTable table = new RepairProgressTable("DataRegion-1");
+
+    table.markVerified(100L, 1000L, 2000L, 3000L, 3000L, RepairProgressTable.SnapshotState.READY);
+
+    // Same mutation/snapshot epochs => skip; any epoch advance => re-check.
+    RepairProgressTable.PartitionProgress progress = table.getPartition(100L);
+    Assert.assertNotNull(progress);
+    Assert.assertFalse(progress.shouldCheck(3000L, 3000L, RepairProgressTable.SnapshotState.READY));
+    Assert.assertTrue(progress.shouldCheck(3001L, 3001L, RepairProgressTable.SnapshotState.READY));
+  }
+
+  @Test
+  public void replicaObservationTokenChangeShouldForceRecheck() {
+    RepairProgressTable table = new RepairProgressTable("DataRegion-1");
+
+    table.markVerified(
+        100L,
+        1000L,
+        2000L,
+        3000L,
+        3000L,
+        RepairProgressTable.SnapshotState.READY,
+        "leader=4|follower=3:snapshot=3000");
+
+    // Epochs unchanged, but a different replica-observation token must invalidate the verdict.
+    RepairProgressTable.PartitionProgress progress = table.getPartition(100L);
+    Assert.assertNotNull(progress);
+    Assert.assertFalse(
+        progress.shouldCheck(
+            3000L,
+            3000L,
+            RepairProgressTable.SnapshotState.READY,
+            "leader=4|follower=3:snapshot=3000"));
+    Assert.assertTrue(
+        progress.shouldCheck(
+            3000L,
+            3000L,
+            RepairProgressTable.SnapshotState.READY,
+            "leader=4|follower=3:snapshot=0"));
+  }
+
+  @Test
+  public void dirtyTransitionShouldClearVerifiedRepairStateAndScope() {
+    RepairProgressTable table = new RepairProgressTable("DataRegion-1");
+
+    table.markMismatch(
+        100L,
+        1000L,
+        2000L,
+        3000L,
+        3000L,
+        RepairProgressTable.SnapshotState.READY,
+        "LIVE@leaf:1:0",
+        1,
+        "leader:1:2000:3000:3000");
+    table.markRepairSucceeded(
+        100L,
+        1100L,
+        2100L,
+        3001L,
+        3001L,
+        RepairProgressTable.SnapshotState.READY,
+        "leader:1:2000:3000:3000");
+    table.markDirty(100L);
+
+    // markDirty must reset repair state to IDLE and drop the recorded mismatch scope.
+    RepairProgressTable.PartitionProgress progress = table.getPartition(100L);
+    Assert.assertEquals(RepairProgressTable.CheckState.DIRTY, progress.getCheckState());
+    Assert.assertEquals(RepairProgressTable.RepairState.IDLE, progress.getRepairState());
+    Assert.assertNull(progress.getMismatchScopeRef());
+  }
+
+  @Test
+  public void serDeShouldPreserveCheckAndRepairState() throws Exception {
+    RepairProgressTable table = new RepairProgressTable("DataRegion-7");
+    table.markMismatch(
+        10L,
+        111L,
+        222L,
+        333L,
+        333L,
+        RepairProgressTable.SnapshotState.READY,
+        "TOMBSTONE@leaf:7:1",
+        2,
+        "leader:7:222:333:333");
+    table.markRepairRunning(10L, "leader:7:222:333:333");
+    table.markRepairFailed(10L, "leader:7:222:333:333", "ERR_CODE", "ERR_MSG");
+
+    RepairProgressTable restored;
+    try (PublicBAOS byteArrayOutputStream = new PublicBAOS();
+        DataOutputStream outputStream = new DataOutputStream(byteArrayOutputStream)) {
+      table.serialize(outputStream);
+      restored =
+          RepairProgressTable.deserialize(
+              ByteBuffer.wrap(byteArrayOutputStream.getBuf(), 0, byteArrayOutputStream.size()));
+    }
+
+    Assert.assertEquals(table, restored);
+  }
+}
diff --git a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift
index 92312ee81a307..29701d1ecea8b 100644
--- a/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift
+++ b/iotdb-protocol/thrift-confignode/src/main/thrift/confignode.thrift
@@ -173,6 +173,12 @@ struct TSetDataNodeStatusReq {
   2: required string status
 }
 
+struct TTriggerRegionConsistencyRepairReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: optional list<i64> partitionFilter  // NOTE(review): element type lost in extraction; time partition ids are i64 elsewhere in this patch — confirm
+  3: optional string repairEpoch
+}
+
 // Database
 struct TDeleteDatabaseReq {
   1: required string prefixPath
@@ -711,6 +717,29 @@ struct TShowConfigNodes4InformationSchemaResp {
   2: optional list configNodesInfoList
 }
 
+struct
TRepairProgressInfo {
+  1: required i32 regionId
+  2: required i64 timePartition
+  3: required string checkState
+  4: required string repairState
+  5: required i64 lastCheckedAt
+  6: required i64 lastSafeWatermark
+  7: required i64 partitionMutationEpoch
+  8: required i64 snapshotEpoch
+  9: required string snapshotState
+  10: required i64 lastMismatchAt
+  11: optional string mismatchScopeRef
+  12: required i32 mismatchLeafCount
+  13: optional string repairEpoch
+  14: optional string lastErrorCode
+  15: optional string lastErrorMessage
+}
+
+struct TShowRepairProgressResp {
+  1: required common.TSStatus status
+  2: optional list<TRepairProgressInfo> repairProgressInfoList
+}
+
 // Show Database
 struct TDatabaseInfo {
   1: required string name
@@ -1753,6 +1782,9 @@ service IConfigNodeRPCService {
 
   common.TSStatus removeRegion(TRemoveRegionReq req)
 
+  /** Trigger replica consistency check and repair for a single DataRegion */
+  common.TSStatus triggerRegionConsistencyRepair(TTriggerRegionConsistencyRepairReq req)
+
   /** Kill query */
   common.TSStatus killQuery(string queryId, i32 dataNodeId, string allowedUsername)
 
@@ -1781,6 +1813,9 @@ service IConfigNodeRPCService {
   /** Show cluster ConfigNodes' information for information schema */
   TShowConfigNodes4InformationSchemaResp showConfigNodes4InformationSchema()
 
+  /** Show replica consistency check / repair progress for information schema */
+  TShowRepairProgressResp showRepairProgress()
+
   /** Show cluster Databases' information */
   TShowDatabaseResp showDatabase(TGetDatabaseReq req)
 
@@ -2055,4 +2090,3 @@ service IConfigNodeRPCService {
 
   common.TSStatus createTableView(TCreateTableViewReq req)
 }
-
diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift
index cca7110f28d40..d469bfb9e56e7 100644
--- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift
+++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift
@@ -428,6 +428,161 @@ struct TLoadResp
 {
   3: optional common.TSStatus status
 }
 
+struct TConsistencyDeletionSummary {
+  1: required string pathPattern
+  2: required i64 timeRangeStart
+  3: required i64 timeRangeEnd
+  4: required binary serializedProgressIndex
+}
+
+struct TPartitionConsistencyEligibility {
+  1: required i64 timePartitionId
+  2: required i64 partitionMutationEpoch
+  3: required i64 snapshotEpoch
+  4: required string snapshotState
+  5: required i64 liveRootXorHash
+  6: required i64 liveRootAddHash
+  7: required i64 tombstoneRootXorHash
+  8: required i64 tombstoneRootAddHash
+}
+
+struct TGetConsistencyEligibilityReq {
+  1: required common.TConsensusGroupId consensusGroupId
+}
+
+struct TGetConsistencyEligibilityResp {
+  1: required common.TSStatus status
+  2: required i64 syncLag
+  3: required i64 safeWatermark
+  4: optional list<TPartitionConsistencyEligibility> partitions
+}
+
+struct TSnapshotSubtreeNode {
+  1: required string parentNodeHandle
+  2: required string nodeHandle
+  3: required string treeKind
+  4: required i32 depth
+  5: required bool leaf
+  6: required i64 xorHash
+  7: required i64 addHash
+  8: required i64 itemCount
+  9: optional string leafId
+  10: optional string keyRangeStart
+  11: optional string keyRangeEnd
+}
+
+struct TGetSnapshotSubtreeReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: required i64 timePartitionId
+  3: required i64 snapshotEpoch
+  4: required string treeKind
+  5: required list<string> nodeHandles
+}
+
+struct TGetSnapshotSubtreeResp {
+  1: required common.TSStatus status
+  2: required i64 timePartitionId
+  3: required i64 snapshotEpoch
+  4: optional bool stale
+  5: optional list<TSnapshotSubtreeNode> nodes
+}
+
+struct TLeafDiffEstimate {
+  1: required i64 timePartitionId
+  2: required i64 snapshotEpoch
+  3: required string treeKind
+  4: required string leafId
+  5: required i64 rowCount
+  6: required i64 tombstoneCount
+  7: required i64 strataEstimate
+  8: optional string keyRangeStart
+  9: optional string keyRangeEnd
+}
+
+struct TEstimateLeafDiffReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: required i64 timePartitionId
+  3: required i64 snapshotEpoch
+  4: required string treeKind
+  5: required string leafId
+}
+
+struct TEstimateLeafDiffResp {
+  1: required common.TSStatus status
+  2: required i64 timePartitionId
+  3: required i64 snapshotEpoch
+  4: optional bool stale
+  5: optional TLeafDiffEstimate leafDiff
+}
+
+struct TLeafDiffEntry {
+  1: required string logicalKey
+  2: required string diffType
+}
+
+struct TDecodeLeafDiffReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: required i64 timePartitionId
+  3: required i64 snapshotEpoch
+  4: required string treeKind
+  5: required string leafId
+}
+
+struct TDecodeLeafDiffResp {
+  1: required common.TSStatus status
+  2: required i64 timePartitionId
+  3: required i64 snapshotEpoch
+  4: optional bool stale
+  5: optional list<TLeafDiffEntry> diffEntries
+}
+
+struct TLogicalRepairLeafSelector {
+  1: required string treeKind
+  2: required string leafId
+}
+
+struct TLogicalRepairBatch {
+  1: required string sessionId
+  2: required string treeKind
+  3: required string leafId
+  4: required i32 seqNo
+  5: required string batchKind
+  6: required binary payload
+}
+
+struct TStreamLogicalRepairReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: required i64 timePartitionId
+  3: required string repairEpoch
+  4: required list<TLogicalRepairLeafSelector> leafSelectors
+}
+
+struct TStreamLogicalRepairResp {
+  1: required common.TSStatus status
+  2: required i64 timePartitionId
+  3: optional bool stale
+  4: optional list<TLogicalRepairBatch> batches
+}
+
+struct TApplyLogicalRepairBatchReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: required i64 timePartitionId
+  3: required string repairEpoch
+  4: required string sessionId
+  5: required string treeKind
+  6: required string leafId
+  7: required i32 seqNo
+  8: required string batchKind
+  9: required binary payload
+}
+
+struct TFinishLogicalRepairSessionReq {
+  1: required common.TConsensusGroupId consensusGroupId
+  2: required i64 timePartitionId
+  3: required string repairEpoch
+  4: required string sessionId
+}
+
 struct TConstructSchemaBlackListReq {
   1: required list schemaRegionIdList
   2: required binary pathPatternTree
@@ -815,6 +970,20 @@ service IDataNodeRPCService {
 
   TLoadResp sendLoadCommand(TLoadCommandReq req);
 
+  TGetConsistencyEligibilityResp getConsistencyEligibility(TGetConsistencyEligibilityReq req);
+
+  TGetSnapshotSubtreeResp getSnapshotSubtree(TGetSnapshotSubtreeReq req);
+
+  TEstimateLeafDiffResp estimateLeafDiff(TEstimateLeafDiffReq req);
+
+  TDecodeLeafDiffResp decodeLeafDiff(TDecodeLeafDiffReq req);
+
+  TStreamLogicalRepairResp streamLogicalRepair(TStreamLogicalRepairReq req);
+
+  common.TSStatus applyLogicalRepairBatch(TApplyLogicalRepairBatchReq req);
+
+  common.TSStatus finishLogicalRepairSession(TFinishLogicalRepairSessionReq req);
+
   common.TSStatus updateAttribute(TAttributeUpdateReq req);
 
@@ -1291,4 +1460,4 @@ service MPPDataExchangeService {
 
   /** Empty rpc, only for connection test */
   common.TSStatus testConnectionEmptyRPC()
-}
\ No newline at end of file
+}