From 55665f78081a46deeabec396f20552b4781bd990 Mon Sep 17 00:00:00 2001 From: libo Date: Sun, 1 Mar 2026 23:58:20 +0800 Subject: [PATCH 01/39] Come true all rpc interfaces in the DataNode, and partial features in the DataPartitionTableIntegrityCheckProcedure. --- DataPartitionTableIntegrityCheck_README.md | 245 ++++++++ .../AsyncDataNodeHeartbeatClientPool.java | 8 + .../client/sync/CnToDnSyncRequestType.java | 5 + .../client/sync/SyncDataNodeClientPool.java | 10 + .../confignode/manager/ProcedureManager.java | 13 + .../load/service/HeartbeatService.java | 3 + .../partition/ConfigNodeProcedureEnv.java | 39 ++ ...PartitionTableIntegrityCheckProcedure.java | 578 ++++++++++++++++++ ...tionTableIntegrityCheckProcedureState.java | 18 + .../procedure/store/ProcedureFactory.java | 6 + .../procedure/store/ProcedureType.java | 5 +- .../iotdb/confignode/service/ConfigNode.java | 9 +- .../org/apache/iotdb/db/conf/IoTDBConfig.java | 20 + .../apache/iotdb/db/conf/IoTDBDescriptor.java | 5 + .../DataPartitionTableGenerator.java | 357 +++++++++++ .../db/protocol/thrift/OperationType.java | 5 +- .../impl/DataNodeInternalRPCServiceImpl.java | 296 ++++++++- .../dataregion/tsfile/TsFileResource.java | 5 + .../tsfile/timeindex/FileTimeIndex.java | 36 ++ .../tsfile/timeindex/ITimeIndex.java | 9 +- .../conf/iotdb-system.properties.template | 15 + .../iotdb/commons/concurrent/ThreadName.java | 2 + .../DataPartitionTableGeneratorState.java | 33 + .../commons/utils/TimePartitionUtils.java | 33 + .../rateLimiter/LeakyBucketRateLimiter.java | 128 ++++ .../src/main/thrift/datanode.thrift | 54 ++ 26 files changed, 1930 insertions(+), 7 deletions(-) create mode 100644 DataPartitionTableIntegrityCheck_README.md create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java 
create mode 100644 iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java create mode 100644 iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java diff --git a/DataPartitionTableIntegrityCheck_README.md b/DataPartitionTableIntegrityCheck_README.md new file mode 100644 index 0000000000000..7fe3eefabb041 --- /dev/null +++ b/DataPartitionTableIntegrityCheck_README.md @@ -0,0 +1,245 @@ +# IoTDB 数据分区表完整性检测功能实现 + +## 功能概述 + +本功能实现了IoTDB ConfigNode重启时的数据分区表完整性检测,能够自动发现并恢复丢失的数据分区信息。 + +## 实现架构 + +### 1. 核心组件 + +#### Procedure实现 +- **DataPartitionTableIntegrityCheckProcedure**: 主要的Procedure实现,负责整个完整性检测流程 +- **ConfigNodeProcedureEnv**: Procedure执行环境,提供ConfigManager访问 + +#### DataNode端实现 +- **DataPartitionTableGenerator**: 扫描tsfile并生成DataPartitionTable的核心组件 +- **RPC接口扩展**: 在DataNode RPC服务中添加了三个新接口 + +#### 配置和注册 +- **ProcedureType枚举扩展**: 添加了新的Procedure类型 +- **ProcedureFactory扩展**: 支持新Procedure的创建和反序列化 +- **启动监听器**: ConfigNode启动时自动触发检测 + +### 2. 执行流程 + +``` +ConfigNode重启 → 检查Leader状态 → 收集最早timeslot → 分析缺失分区 → +请求DN生成表 → 合并分区表 → 写入Raft日志 → 完成 +``` + +## 详细实现 + +### 1. 
Thrift接口定义 (datanode.thrift) + +新增的RPC接口: +```thrift +// 获取最早timeslot信息 +TGetEarliestTimeslotsResp getEarliestTimeslots() + +// 请求生成DataPartitionTable +TGenerateDataPartitionTableResp generateDataPartitionTable() + +// 检查生成状态 +TCheckDataPartitionTableStatusResp checkDataPartitionTableStatus() +``` + +对应的响应结构体: +```thrift +struct TGetEarliestTimeslotsResp { + 1: required common.TSStatus status + 2: optional map databaseToEarliestTimeslot +} + +struct TGenerateDataPartitionTableResp { + 1: required common.TSStatus status + 2: required i32 errorCode + 3: optional string message +} + +struct TCheckDataPartitionTableStatusResp { + 1: required common.TSStatus status + 2: required i32 errorCode + 3: optional string message + 4: optional binary dataPartitionTable +} +``` + +### 2. DataNode实现 + +#### DataPartitionTableGenerator +- **并行扫描**: 使用多线程并行扫描tsfile文件 +- **进度跟踪**: 提供处理进度和状态信息 +- **错误处理**: 统计失败文件并记录错误信息 +- **配置化**: 支持自定义线程数和分区配置 + +#### RPC服务实现 +在`DataNodeInternalRPCServiceImpl`中实现: +- `getEarliestTimeslots()`: 扫描数据目录获取每个数据库的最早timeslot +- `generateDataPartitionTable()`: 启动异步扫描任务 +- `checkDataPartitionTableStatus()`: 检查任务状态并返回结果 + +### 3. ConfigNode Procedure实现 + +#### 状态机设计 +```java +public enum State { + CHECK_LEADER_STATUS, // 检查Leader状态 + COLLECT_EARLIEST_TIMESLOTS, // 收集最早timeslot + ANALYZE_MISSING_PARTITIONS, // 分析缺失分区 + REQUEST_PARTITION_TABLES, // 请求生成分区表 + MERGE_PARTITION_TABLES, // 合并分区表 + WRITE_PARTITION_TABLE_TO_RAFT, // 写入Raft日志 + SUCCESS, // 成功完成 + FAILED // 执行失败 +} +``` + +#### 错误码定义 +```java +public static final int DN_ERROR_CODE_SUCCESS = 0; // 处理成功 +public static final int DN_ERROR_CODE_IN_PROGRESS = 2; // 正在执行 +public static final int DN_ERROR_CODE_FAILED = 1; // 处理失败 +public static final int DN_ERROR_CODE_UNKNOWN = -1; // DN未知状态 +``` + +#### 核心逻辑 +1. **Leader检查**: 只有Leader节点执行检测 +2. **数据收集**: 从所有DataNode收集最早timeslot信息 +3. **缺失分析**: 对比当前分区表,识别缺失的分区 +4. **异步处理**: 向DataNode发送异步扫描请求 +5. **状态轮询**: 定期检查任务状态,支持重试机制 +6. **数据合并**: 合并所有DataNode返回的分区表 +7. 
**Raft写入**: 通过共识层持久化最终分区表 + +### 4. 自动触发机制 + +#### 启动监听器 +```java +public class DataPartitionTableIntegrityCheckListener { + public void onStartupComplete() { + if (isLeader()) { + startIntegrityCheck(); + } + } + + public void onBecomeLeader() { + startIntegrityCheck(); + } +} +``` + +## 关键特性 + +### 1. 原子性保证 +- 每个步骤都是幂等的,支持重试 +- Procedure框架保证状态一致性 +- 失败时可以安全回滚 + +### 2. 容错机制 +- **重试策略**: 最多重试3次 +- **超时处理**: 避免无限等待 +- **部分失败**: 部分DataNode失败时继续处理 + +### 3. 性能优化 +- **并行扫描**: DataNode端使用多线程并行处理 +- **异步执行**: 避免阻塞主流程 +- **进度跟踪**: 提供实时进度信息 + +### 4. 可扩展性 +- **配置化**: 支持自定义线程数和分区配置 +- **模块化**: 各组件独立,易于扩展 +- **接口化**: 清晰的RPC接口定义 + +## 使用方式 + +### 1. 自动触发 +ConfigNode重启时自动检测并执行,无需手动干预。 + +### 2. 手动触发 +可以通过ProcedureExecutor手动提交检测Procedure: +```java +DataPartitionTableIntegrityCheckProcedure procedure = new DataPartitionTableIntegrityCheckProcedure(); +procedureExecutor.submit(procedure); +``` + +## 配置参数 + +### DataNode配置 +- `seriesSlotNum`: 系列分区槽数量 +- `seriesPartitionExecutorClass`: 分区执行器类名 +- `dataDirs`: 数据目录配置 + +### Procedure配置 +- `MAX_RETRY_COUNT`: 最大重试次数 (默认3) +- 重试间隔: 5秒 + +## 监控和日志 + +### 日志级别 +- **INFO**: 关键流程节点信息 +- **DEBUG**: 详细的执行过程 +- **ERROR**: 错误和异常信息 + +### 关键指标 +- 处理文件数量 +- 失败文件数量 +- 执行时间 +- 重试次数 +- DataNode响应状态 + +## 注意事项 + +### 1. 依赖关系 +- 需要ConfigNode为Leader状态 +- 依赖DataNode正常注册和通信 +- 需要共识层正常工作 + +### 2. 资源消耗 +- DataNode扫描会消耗CPU和I/O资源 +- 建议在低峰期执行 +- 大数据集时需要考虑内存使用 + +### 3. 网络带宽 +- DataPartitionTable序列化后可能较大 +- 需要考虑网络传输限制 +- 建议实现增量传输机制 + +## 后续优化建议 + +### 1. 增量扫描 +- 支持增量扫描,只处理新增文件 +- 维护扫描状态,避免重复工作 + +### 2. 分布式协调 +- 实现更智能的负载分配 +- 支持动态调整扫描策略 + +### 3. 缓存优化 +- 缓存扫描结果,避免重复计算 +- 实现智能失效机制 + +### 4. 监控增强 +- 添加更详细的性能指标 +- 实现告警机制 + +## 测试验证 + +### 1. 单元测试 +- 各组件独立测试 +- 边界条件测试 +- 异常场景验证 + +### 2. 集成测试 +- 端到端流程测试 +- 多节点环境验证 +- 故障恢复测试 + +### 3. 
性能测试 +- 大数据集扫描测试 +- 并发性能测试 +- 资源使用监控 + +--- + +本实现提供了完整的IoTDB数据分区表完整性检测解决方案,具备高可用性、容错性和可扩展性,能够在ConfigNode重启时自动发现并恢复丢失的数据分区信息。 diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java index 324e351302787..d32cb5b416934 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java @@ -63,6 +63,14 @@ public void writeAuditLog( } } + public void generateDataPartitionTableHeartbeat(TEndPoint endPoint, TDataNodeHeartbeatReq req, DataNodeHeartbeatHandler handler) { + try { + clientManager.borrowClient(endPoint).generateDataPartitionTableHeartbeat(req, handler); + } catch (Exception ignore) { + // Just ignore + } + } + private static class AsyncDataNodeHeartbeatClientPoolHolder { private static final AsyncDataNodeHeartbeatClientPool INSTANCE = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java index 4055398ddb7ec..790fd637d616a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/CnToDnSyncRequestType.java @@ -37,6 +37,11 @@ public enum CnToDnSyncRequestType { DELETE_OLD_REGION_PEER, RESET_PEER_LIST, + // Data Partition Table Maintenance + COLLECT_EARLIEST_TIMESLOTS, + GENERATE_DATA_PARTITION_TABLE, + GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + // PartitionCache INVALIDATE_PARTITION_CACHE, INVALIDATE_PERMISSION_CACHE, diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java index d63d5a74f6095..84c027e513298 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java @@ -32,6 +32,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TCreateDataRegionReq; import org.apache.iotdb.mpp.rpc.thrift.TCreatePeerReq; import org.apache.iotdb.mpp.rpc.thrift.TCreateSchemaRegionReq; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidatePermissionCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TKillQueryInstanceReq; @@ -139,6 +140,15 @@ private void buildActionMap() { actionMapBuilder.put( CnToDnSyncRequestType.SHOW_APPLIED_CONFIGURATIONS, (req, client) -> client.showAppliedConfigurations()); + actionMapBuilder.put( + CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, + (req, client) -> client.getEarliestTimeslots()); + actionMapBuilder.put( + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, + (req, client) -> client.generateDataPartitionTable((TGenerateDataPartitionTableReq) req)); + actionMapBuilder.put( + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + (req, client) -> client.generateDataPartitionTableHeartbeat()); actionMap = actionMapBuilder.build(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index 0fe3abc79a72b..06f9534d39779 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -67,6 +67,7 @@ import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveConfigNodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveDataNodesProcedure; +import org.apache.iotdb.confignode.procedure.impl.partition.DataPartitionTableIntegrityCheckProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.CreatePipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.DropPipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.runtime.PipeHandleLeaderChangeProcedure; @@ -1374,6 +1375,18 @@ public TSStatus createRegionGroups( } } + /** + * Repairs the lost data partition table: submits a DataPartitionTableIntegrityCheckProcedure and waits until it finishes. + */ + public TSStatus dataPartitionTableIntegrityCheck() { + DataPartitionTableIntegrityCheckProcedure procedure; + synchronized (this) { + procedure = new DataPartitionTableIntegrityCheckProcedure(); + executor.submitProcedure(procedure); + } + return waitingProcedureFinished(procedure); + } + /** * Generate {@link CreateTriggerProcedure} and wait until it finished.
* diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java index 64322da5bbb20..a2b1c3ed66ffd 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java @@ -279,6 +279,9 @@ private void pingRegisteredDataNodes( AsyncDataNodeHeartbeatClientPool.getInstance() .getDataNodeHeartBeat( dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); + AsyncDataNodeHeartbeatClientPool.getInstance() + .generateDataPartitionTableHeartbeat( + dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java new file mode 100644 index 0000000000000..c1ebd7ffccde1 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/ConfigNodeProcedureEnv.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.partition; + +import org.apache.iotdb.confignode.manager.ConfigManager; + +/** + * Environment object for ConfigNode procedures. Provides access to ConfigManager and other + * necessary components. + */ +public class ConfigNodeProcedureEnv { + + private final ConfigManager configManager; + + public ConfigNodeProcedureEnv(ConfigManager configManager) { + this.configManager = configManager; + } + + public ConfigManager getConfigManager() { + return configManager; + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java new file mode 100644 index 0000000000000..860f34ed1d735 --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -0,0 +1,578 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.confignode.procedure.impl.partition; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; +import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; +import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; +import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; +import org.apache.iotdb.confignode.client.sync.SyncDataNodeClientPool; +import org.apache.iotdb.confignode.consensus.request.read.partition.GetDataPartitionPlan; +import org.apache.iotdb.confignode.manager.node.NodeManager; +import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; +import org.apache.iotdb.confignode.procedure.exception.ProcedureException; +import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; +import org.apache.iotdb.confignode.procedure.state.DataPartitionTableIntegrityCheckProcedureState; +import org.apache.iotdb.confignode.rpc.thrift.TTimeSlotList; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; +import org.apache.iotdb.rpc.TSStatusCode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; 
+import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +/** + * Procedure for checking and restoring data partition table integrity. This procedure scans all + * DataNodes to detect missing data partitions and restores the DataPartitionTable on the ConfigNode + * Leader. + */ +public class DataPartitionTableIntegrityCheckProcedure + extends StateMachineProcedure< + ConfigNodeProcedureEnv, DataPartitionTableIntegrityCheckProcedureState> { + + private static final Logger LOG = + LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedure.class); + + /** Error codes for DataNode responses */ + public static final int DN_ERROR_CODE_SUCCESS = 0; + + public static final int DN_ERROR_CODE_IN_PROGRESS = 2; + public static final int DN_ERROR_CODE_FAILED = 1; + public static final int DN_ERROR_CODE_UNKNOWN = -1; + + /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ + private Map earliestTimeslots = new ConcurrentHashMap<>(); + + /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ + private Map dataPartitionTables = new ConcurrentHashMap<>(); + + /** Final merged DataPartitionTable */ + private DataPartitionTable finalDataPartitionTable; + + /** List of DataNodes that need to generate DataPartitionTable */ + private List allDataNodes = new ArrayList<>(); + + private Set lostDataPartitionsOfDatabases; + + NodeManager dataNodeManager; + + /** Current retry attempt */ + private int retryCount = 0; + + private static final int MAX_RETRY_COUNT = 3; + + private static Set skipDnIds; + private static Set failedDnIds; + + private static ScheduledExecutorService heartBeatExecutor; + private static 
final long HEART_BEAT_REQUEST_RATE = 60000; + + public DataPartitionTableIntegrityCheckProcedure() { + super(); + } + + @Override + protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + throws InterruptedException { + try { + // Ensure to get the real-time DataNodes in the current cluster at every step + dataNodeManager = env.getConfigManager().getNodeManager(); + allDataNodes = dataNodeManager.getRegisteredDataNodes(); + + switch (state) { + case COLLECT_EARLIEST_TIMESLOTS: + failedDnIds = new HashSet<>(); + return collectEarliestTimeslots(env); + case ANALYZE_MISSING_PARTITIONS: + return analyzeMissingPartitions(env); + case REQUEST_PARTITION_TABLES: + heartBeatExecutor = Executors.newScheduledThreadPool(allDataNodes.size()); + return requestPartitionTables(env); + case MERGE_PARTITION_TABLES: + return mergePartitionTables(env); + case WRITE_PARTITION_TABLE_TO_RAFT: + return writePartitionTableToRaft(env); + default: + throw new ProcedureException("Unknown state: " + state); + } + } catch (Exception e) { + LOG.error("Error executing state {}: {}", state, e.getMessage(), e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + return Flow.NO_MORE_STATE; + } + } + + @Override + protected void rollbackState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + throws IOException, InterruptedException, ProcedureException { + switch (state) { + case COLLECT_EARLIEST_TIMESLOTS: + case ANALYZE_MISSING_PARTITIONS: + case REQUEST_PARTITION_TABLES: + case MERGE_PARTITION_TABLES: + case WRITE_PARTITION_TABLE_TO_RAFT: + // Cleanup resources + earliestTimeslots.clear(); + dataPartitionTables.clear(); + allDataNodes.clear(); + finalDataPartitionTable = null; + break; + case SUCCESS: + case FAILED: + // No cleanup needed for terminal states + break; + default: + throw new ProcedureException("Unknown state for rollback: " + state); + } + } + + @Override + protected 
DataPartitionTableIntegrityCheckProcedureState getState(int stateId) { + return null; + } + + @Override + protected int getStateId(DataPartitionTableIntegrityCheckProcedureState state) { + return 0; + } + + @Override + protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { + skipDnIds = new HashSet<>(); + return DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS; + } + + /** + * Collect earliest timeslot information from all DataNodes. Each DataNode returns a Map where key is database name and value is the earliest timeslot id. + */ + private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Collecting earliest timeslots from all DataNodes..."); + } + + if (allDataNodes.isEmpty()) { + LOG.error("No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + // Collect earliest timeslots from all DataNodes + Map mergedEarliestTimeslots = new ConcurrentHashMap<>(); + + for (TDataNodeConfiguration dataNode : allDataNodes) { + try { + TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, MAX_RETRY_COUNT); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + LOG.error("Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + continue; + } + + Map nodeTimeslots = resp.getDatabaseToEarliestTimeslot(); + + // Merge with existing timeslots (take minimum) + for (Map.Entry entry : nodeTimeslots.entrySet()) { + mergedEarliestTimeslots.merge(entry.getKey(), 
entry.getValue(), Math::min); + } + + if (LOG.isDebugEnabled()) { + LOG.debug( + "Collected earliest timeslots from the DataNode[id={}]: {}", + dataNode.getLocation().getDataNodeId(), + nodeTimeslots); + } + } catch (Exception e) { + LOG.error( + "Failed to collect earliest timeslots from the DataNode[id={}]: {}", + dataNode.getLocation().getDataNodeId(), + e.getMessage(), + e); + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + } + } + + earliestTimeslots = mergedEarliestTimeslots; + + if (LOG.isDebugEnabled()) { + LOG.info( + "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", + allDataNodes.size(), + earliestTimeslots, + allDataNodes.size() - failedDnIds.size()); + } + + Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); + if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + } else { + setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); + } + return Flow.HAS_MORE_STATE; + } + + /** + * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions of databases need to be repaired. 
+ */ + private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Analyzing missing data partitions..."); + } + + if (earliestTimeslots.isEmpty()) { + LOG.error("No missing data partitions detected, nothing needs to be repaired, terminating procedure"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + // Find all databases that have lost data partition tables + lostDataPartitionsOfDatabases = new HashSet<>(); + + for (Map.Entry entry : earliestTimeslots.entrySet()) { + String database = entry.getKey(); + long earliestTimeslot = entry.getValue(); + + // Get current DataPartitionTable from ConfigManager + Map>>> + dataPartitionTable = getLocalDataPartitionTable(env, database); + + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (dataPartitionTable.isEmpty() || dataPartitionTable.get(database) == null || dataPartitionTable.get(database).isEmpty()) { + LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + continue; + } + + Map>> seriesPartitionMap = dataPartitionTable.get(database); + for (Map.Entry>> + seriesPartitionEntry : seriesPartitionMap.entrySet()) { + Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); + tTimePartitionSlotListMap.keySet().forEach(slot -> { + if (!TimePartitionUtils.satisfyPartitionId(slot.getStartTime(), earliestTimeslot)) { + lostDataPartitionsOfDatabases.add(database); + LOG.warn("Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", database, earliestTimeslot); + } + }); + } + } + + if (lostDataPartitionsOfDatabases.isEmpty()) { + LOG.info("No databases have lost data partitions, terminating procedure"); + return Flow.NO_MORE_STATE; + } + + LOG.info( + "Identified {} databases have lost data partitions, will request DataPartitionTable generation from {} 
DataNodes", + lostDataPartitionsOfDatabases.size(), + allDataNodes.size() - failedDnIds.size()); + setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); + return Flow.HAS_MORE_STATE; + } + + private Map>>> getLocalDataPartitionTable(ConfigNodeProcedureEnv env, String database) { + Map> schemaPartitionTable = env.getConfigManager().getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) + .getSchemaPartitionTable(); + + // Construct request for getting data partition + final Map> partitionSlotsMap = new HashMap<>(); + schemaPartitionTable.forEach( + (key, value) -> { + Map slotListMap = new HashMap<>(); + value + .keySet() + .forEach( + slot -> + slotListMap.put( + slot, new TTimeSlotList(Collections.emptyList(), true, true))); + partitionSlotsMap.put(key, slotListMap); + }); + final GetDataPartitionPlan getDataPartitionPlan = new GetDataPartitionPlan(partitionSlotsMap); + return env.getConfigManager().getDataPartition(getDataPartitionPlan).getDataPartitionTable(); + } + + /** + * Request DataPartitionTable generation from target DataNodes. Each DataNode scans its tsfile + * resources and generates a DataPartitionTable. 
+ */ + private Flow requestPartitionTables(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Requesting DataPartitionTable generation from {} DataNodes...", allDataNodes.size()); + } + + if (allDataNodes.isEmpty()) { + LOG.error("No DataNodes registered, no way to requested DataPartitionTable generation, terminating procedure"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + heartBeatExecutor.scheduleAtFixedRate(this::checkPartitionTableGenerationStatus, 0, HEART_BEAT_REQUEST_RATE, TimeUnit.MILLISECONDS); + + for (TDataNodeConfiguration dataNode : allDataNodes) { + int dataNodeId = dataNode.getLocation().getDataNodeId(); + if (!dataPartitionTables.containsKey(dataNodeId)) { + try { + TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, MAX_RETRY_COUNT); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + LOG.error("Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + continue; + } + + byte[] bytes = resp.getDataPartitionTable(); + DataPartitionTable dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); + dataPartitionTables.put(dataNodeId, dataPartitionTable); + } catch (Exception e) { + failedDnIds.add(dataNode.getLocation().getDataNodeId()); + LOG.error( + "Failed to request DataPartitionTable generation from DataNode[id={}]: {}", + dataNodeId, + e.getMessage(), + e); + } + } + } + + Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> 
dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); + if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + return Flow.HAS_MORE_STATE; + } + + /** + * Check completion status of DataPartitionTable generation tasks. + */ + private void checkPartitionTableGenerationStatus() { + LOG.info("Checking DataPartitionTable generation completion status..."); + + int completeCount = 0; + for (TDataNodeConfiguration dataNode : allDataNodes) { + int dataNodeId = dataNode.getLocation().getDataNodeId(); + + if (!dataPartitionTables.containsKey(dataNodeId)) { + try { + TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, MAX_RETRY_COUNT); + DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getStatus().getCode()); + + switch (state) { + case SUCCESS: + LOG.info("DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); + completeCount++; + break; + case IN_PROGRESS: + LOG.info("DataNode {} still generating DataPartitionTable", dataNodeId); + break; + case FAILED: + LOG.error("DataNode {} failed to generate DataPartitionTable, terminating heart beat", dataNodeId); + completeCount++; + break; + default: + LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getStatus().getCode()); + break; + } + } catch (Exception e) { + LOG.error( + "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + dataNodeId, + e.getMessage(), + e); + completeCount++; + } + } + } + + if 
(completeCount >= allDataNodes.size()) { + heartBeatExecutor.shutdown(); + } + } + + private static void declineThread() { + heartBeatExecutor.shutdown(); + } + + /** + * Merge DataPartitionTables from all DataNodes into a final table. + */ + private Flow mergePartitionTables(ConfigNodeProcedureEnv env) { + if (LOG.isDebugEnabled()) { + LOG.debug("Merging DataPartitionTables from {} DataNodes...", dataPartitionTables.size()); + } + + if (dataPartitionTables.isEmpty()) { + LOG.error("No DataPartitionTables to merge, dataPartitionTables is empty"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } + + try { + finalDataPartitionTable = new DataPartitionTable(); + + // TODO: Implement proper merging logic + // For now, use the first DataPartitionTable as the final one + if (!dataPartitionTables.isEmpty()) { + DataPartitionTable firstTable = dataPartitionTables.values().iterator().next(); + finalDataPartitionTable = firstTable; + + // In a real implementation, you would: + // 1. Merge all series partition slots from all DataNodes + // 2. For each series slot, merge time slot information + // 3. Resolve conflicts by choosing the most recent/complete data + // 4. Ensure consistency across all DataNodes + + LOG.info( + "Merged DataPartitionTable contains {} series partitions", + finalDataPartitionTable.getDataPartitionMap().size()); + } + + LOG.info("DataPartitionTable merge completed successfully"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + return Flow.HAS_MORE_STATE; + + } catch (Exception e) { + LOG.error("Failed to merge DataPartitionTables", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + return Flow.NO_MORE_STATE; + } + } + + /** Write the final DataPartitionTable to raft log. 
*/ + private Flow writePartitionTableToRaft(ConfigNodeProcedureEnv env) { + LOG.info("Writing DataPartitionTable to raft log..."); + + if (finalDataPartitionTable == null) { + LOG.error("No DataPartitionTable to write to raft"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("No DataPartitionTable available for raft write")); + return Flow.NO_MORE_STATE; + } + + try { + // TODO: Implement actual raft log write + // This should create a consensus request to write the DataPartitionTable + // Example: + // WriteDataPartitionTablePlan plan = new + // WriteDataPartitionTablePlan(finalDataPartitionTable); + // env.getConfigManager().getConsensusManager().write(plan); + + // For now, simulate successful write + boolean writeSuccess = true; + + if (writeSuccess) { + LOG.info("DataPartitionTable successfully written to raft log"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.SUCCESS); + return Flow.HAS_MORE_STATE; + } else { + LOG.error("Failed to write DataPartitionTable to raft log"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("Failed to write DataPartitionTable to raft log")); + return Flow.NO_MORE_STATE; + } + + } catch (Exception e) { + LOG.error("Error writing DataPartitionTable to raft log", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + return Flow.NO_MORE_STATE; + } + } + + // @TODO + @Override + public void serialize(DataOutputStream stream) throws IOException { + super.serialize(stream); + + // Serialize earliestTimeslots + stream.writeInt(earliestTimeslots.size()); + for (Map.Entry entry : earliestTimeslots.entrySet()) { + stream.writeUTF(entry.getKey()); + stream.writeLong(entry.getValue()); + } + + // Serialize dataPartitionTables count + stream.writeInt(dataPartitionTables.size()); + // Note: DataPartitionTable serialization would need to be implemented here + + // Serialize targetDataNodes count + stream.writeInt(targetDataNodes.size()); + 
for (TDataNodeConfiguration dataNode : targetDataNodes) { + stream.writeInt(dataNode.getLocation().getDataNodeId()); + } + + // Serialize retryCount + stream.writeInt(retryCount); + } + + // @TODO + @Override + public void deserialize(ByteBuffer byteBuffer) { + super.deserialize(byteBuffer); + + // Deserialize earliestTimeslots + int earliestTimeslotsSize = byteBuffer.getInt(); + earliestTimeslots = new ConcurrentHashMap<>(); + for (int i = 0; i < earliestTimeslotsSize; i++) { + // Must mirror serialize(), which wrote the database name via writeUTF + // (unsigned short length + UTF-8 bytes); reading a single getChar() here + // would corrupt the key and desynchronize every subsequent field. + int dbLen = byteBuffer.getShort() & 0xFFFF; + byte[] dbBytes = new byte[dbLen]; + byteBuffer.get(dbBytes); + String database = new String(dbBytes, java.nio.charset.StandardCharsets.UTF_8); + long timeslot = byteBuffer.getLong(); + earliestTimeslots.put(database, timeslot); + } + + // Deserialize dataPartitionTables count + int dataPartitionTablesSize = byteBuffer.getInt(); + dataPartitionTables = new ConcurrentHashMap<>(); + // Note: DataPartitionTable deserialization would need to be implemented here + + // Deserialize targetDataNodes + int targetDataNodesSize = byteBuffer.getInt(); + targetDataNodes = new ArrayList<>(); + for (int i = 0; i < targetDataNodesSize; i++) { + int dataNodeId = byteBuffer.getInt(); + // Note: TDataNodeLocation reconstruction would need to be implemented here + } + + // Deserialize retryCount + retryCount = byteBuffer.getInt(); + } +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java new file mode 100644 index 0000000000000..7028adf9b4b9a --- /dev/null +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java @@ -0,0 +1,18 @@ +package org.apache.iotdb.confignode.procedure.state; + +public enum DataPartitionTableIntegrityCheckProcedureState { + /** Collect earliest timeslot information from all DataNodes */ + COLLECT_EARLIEST_TIMESLOTS, + /** Analyze missing data partitions */ + 
ANALYZE_MISSING_PARTITIONS, + /** Request DataPartitionTable generation from DataNodes */ + REQUEST_PARTITION_TABLES, + /** Merge DataPartitionTables from all DataNodes */ + MERGE_PARTITION_TABLES, + /** Write final DataPartitionTable to raft log */ + WRITE_PARTITION_TABLE_TO_RAFT, + /** Procedure completed successfully */ + SUCCESS, + /** Procedure failed */ + FAILED +} diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java index dd15558608718..140fffa852ccc 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureFactory.java @@ -26,6 +26,7 @@ import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveConfigNodeProcedure; import org.apache.iotdb.confignode.procedure.impl.node.RemoveDataNodesProcedure; +import org.apache.iotdb.confignode.procedure.impl.partition.DataPartitionTableIntegrityCheckProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.CreatePipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.DropPipePluginProcedure; import org.apache.iotdb.confignode.procedure.impl.pipe.runtime.PipeHandleLeaderChangeProcedure; @@ -404,6 +405,9 @@ public Procedure create(ByteBuffer buffer) throws IOException { case ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE: procedure = new AddNeverFinishSubProcedureProcedure(); break; + case DATA_PARTITION_TABLE_INTEGRITY_CHECK_PROCEDURE: + procedure = new DataPartitionTableIntegrityCheckProcedure(); + break; default: LOGGER.error("Unknown Procedure type: {}", typeCode); throw new IOException("Unknown Procedure type: " + typeCode); @@ -554,6 +558,8 @@ public static ProcedureType 
getProcedureType(final Procedure procedure) { return ProcedureType.NEVER_FINISH_PROCEDURE; } else if (procedure instanceof AddNeverFinishSubProcedureProcedure) { return ProcedureType.ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE; + } else if (procedure instanceof DataPartitionTableIntegrityCheckProcedure) { + return ProcedureType.DATA_PARTITION_TABLE_INTEGRITY_CHECK_PROCEDURE; } throw new UnsupportedOperationException( "Procedure type " + procedure.getClass() + " is not supported"); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java index 820a90f7ebfb9..839c8ace0984d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/store/ProcedureType.java @@ -172,7 +172,10 @@ public enum ProcedureType { @TestOnly NEVER_FINISH_PROCEDURE((short) 30000), @TestOnly - ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE((short) 30001); + ADD_NEVER_FINISH_SUB_PROCEDURE_PROCEDURE((short) 30001), + + /** Data Partition Table Integrity Check */ + DATA_PARTITION_TABLE_INTEGRITY_CHECK_PROCEDURE((short) 1600); private final short typeCode; diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index f20f77095d97a..3befc7f1634f1 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -67,7 +67,6 @@ import org.apache.iotdb.metrics.metricsets.net.NetMetrics; import org.apache.iotdb.metrics.metricsets.system.SystemMetrics; import org.apache.iotdb.rpc.TSStatusCode; - import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -203,6 +202,14 @@ public void active() { } loadSecretKey(); loadHardwareCode(); + + // The data partition table integrity check is only performed when the ConfigNode is the leader node + if (configManager.getConsensusManager().isLeader()) { + TSStatus status = configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error("Data partition table integrity check failed!"); + } + } return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java index 98c15a2d9bf06..2ce50415549e3 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java @@ -1219,6 +1219,10 @@ public class IoTDBConfig { private long maxObjectSizeInByte = 4 * 1024 * 1024 * 1024L; + /* Need use these parameters when repair data partition table */ + private int partitionTableRecoverWorkerNum = 10; + private int partitionTableRecoverMaxReadBytesPerSecond = 1000; + IoTDBConfig() {} public int getMaxLogEntriesNumPerBatch() { @@ -4367,4 +4371,20 @@ public long getMaxObjectSizeInByte() { public void setMaxObjectSizeInByte(long maxObjectSizeInByte) { this.maxObjectSizeInByte = maxObjectSizeInByte; } + + public int getPartitionTableRecoverWorkerNum() { + return partitionTableRecoverWorkerNum; + } + + public void setPartitionTableRecoverWorkerNum(int partitionTableRecoverWorkerNum) { + this.partitionTableRecoverWorkerNum = partitionTableRecoverWorkerNum; + } + + public int getPartitionTableRecoverMaxReadBytesPerSecond() { + return partitionTableRecoverMaxReadBytesPerSecond; + } + + public void setPartitionTableRecoverMaxReadBytesPerSecond(int partitionTableRecoverMaxReadBytesPerSecond) { + this.partitionTableRecoverMaxReadBytesPerSecond = partitionTableRecoverMaxReadBytesPerSecond; 
+ } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index 6730138b2af5c..4c4d7a6928747 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -1139,6 +1139,11 @@ public void loadProperties(TrimProperties properties) throws BadNodeUrlException // update trusted_uri_pattern loadTrustedUriPattern(properties); + conf.setPartitionTableRecoverWorkerNum( + Integer.parseInt(properties.getProperty("partition_table_recover_worker_num", String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); + conf.setPartitionTableRecoverMaxReadBytesPerSecond( + Integer.parseInt(properties.getProperty("partition_table_recover_max_read_bytes_per_second", String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); + conf.setIncludeNullValueInWriteThroughputMetric( Boolean.parseBoolean( properties.getProperty( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java new file mode 100644 index 0000000000000..689a12bd8df89 --- /dev/null +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.db.partition; + +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; +import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.SeriesPartitionTable; +import org.apache.iotdb.commons.partition.executor.SeriesPartitionExecutor; +import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; +import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Generator for DataPartitionTable by scanning tsfile resources. This class scans the data + * directory structure and builds a complete DataPartitionTable based on existing tsfiles. 
+ */ +public class DataPartitionTableGenerator { + + private static final Logger LOG = LoggerFactory.getLogger(DataPartitionTableGenerator.class); + + // Task status + private volatile TaskStatus status = TaskStatus.NOT_STARTED; + private volatile String errorMessage; + private volatile DataPartitionTable dataPartitionTable; + + // Progress tracking + private final AtomicInteger processedFiles = new AtomicInteger(0); + private final AtomicInteger failedFiles = new AtomicInteger(0); + private final AtomicLong totalFiles = new AtomicLong(0); + + // Configuration + private final String[] dataDirectories; + private final ExecutorService executor; + private final int seriesSlotNum; + private final String seriesPartitionExecutorClass; + + private static final int EXECUTOR_MAX_TIMEOUT = 60; + + private static final LeakyBucketRateLimiter limiter = + new LeakyBucketRateLimiter((long) IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverMaxReadBytesPerSecond() * 1024 * 1024); + + private static final String SCAN_FILE_SUFFIX_NAME = ".resource"; + + public DataPartitionTableGenerator( + String dataDirectory, + ExecutorService executor, + int seriesSlotNum, + String seriesPartitionExecutorClass) { + this.dataDirectories = new String[]{dataDirectory}; + this.executor = executor; + this.seriesSlotNum = seriesSlotNum; + this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; + } + + public DataPartitionTableGenerator( + String[] dataDirectories, + ExecutorService executor, + int seriesSlotNum, + String seriesPartitionExecutorClass) { + this.dataDirectories = dataDirectories; + this.executor = executor; + this.seriesSlotNum = seriesSlotNum; + this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; + } + + public enum TaskStatus { + NOT_STARTED, + IN_PROGRESS, + COMPLETED, + FAILED + } + + /** Start generating DataPartitionTable asynchronously. 
*/ + public void startGeneration() { + if (status != TaskStatus.NOT_STARTED) { + throw new IllegalStateException("Task is already started or completed"); + } + + status = TaskStatus.IN_PROGRESS; + + CompletableFuture.runAsync( + () -> { + try { + generateDataPartitionTable(); + status = TaskStatus.COMPLETED; + } catch (Exception e) { + LOG.error("Failed to generate DataPartitionTable", e); + errorMessage = e.getMessage(); + status = TaskStatus.FAILED; + } + }); + } + + /** Generate DataPartitionTable by scanning all resource files. */ + private void generateDataPartitionTable() throws IOException { + LOG.info("Starting DataPartitionTable generation from {} directories", dataDirectories.length); + + List> futures = new ArrayList<>(); + + Map dataPartitionMap = new ConcurrentHashMap<>(); + + try { + // Count total files first for progress tracking + countTotalFiles(); + + // Process all data directories + for (String dataDirectory : dataDirectories) { + LOG.info("Processing data directory: {}", dataDirectory); + + // First layer: database directories + Files.list(Paths.get(dataDirectory)) + .filter(Files::isDirectory) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + LOG.debug("Processing database: {}", databaseName); + + try { + Files.list(dbPath) + .filter(Files::isDirectory) + .forEach( + regionPath -> { + processRegionDirectory( + regionPath, + databaseName, + dataPartitionMap, + executor, + futures); + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", dbPath, e); + failedFiles.incrementAndGet(); + } + }); + } + + // Wait for all tasks to complete + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + + dataPartitionTable = new DataPartitionTable(dataPartitionMap); + + LOG.info( + "DataPartitionTable generation completed. 
Processed: {}, Failed: {}", + processedFiles.get(), + failedFiles.get()); + + } finally { + executor.shutdown(); + try { + if (!executor.awaitTermination(EXECUTOR_MAX_TIMEOUT, TimeUnit.SECONDS)) { + executor.shutdownNow(); + } + } catch (InterruptedException e) { + executor.shutdownNow(); + Thread.currentThread().interrupt(); + } + } + } + + /** Process a region directory. */ + private void processRegionDirectory( + java.nio.file.Path regionPath, + String databaseName, + Map dataPartitionMap, + ExecutorService executor, + List> futures) { + + int regionId; + try { + regionId = Integer.parseInt(regionPath.getFileName().toString()); + LOG.debug("Processing region: {}", regionId); + } catch (NumberFormatException e) { + LOG.error("Invalid region directory: {}", regionPath); + return; + } + + TConsensusGroupId consensusGroupId = new TConsensusGroupId(); + consensusGroupId.setId(regionId); + + // Process time partitions asynchronously + CompletableFuture regionFuture = + CompletableFuture.runAsync( + () -> { + try { + Files.list(regionPath) + .filter(Files::isDirectory) + .forEach( + timeSlotPath -> { + processTimeSlotDirectory( + timeSlotPath, databaseName, consensusGroupId, dataPartitionMap); + }); + } catch (IOException e) { + LOG.error("Failed to list region directory: {}", regionPath, e); + } + }, + executor); + + futures.add(regionFuture); + } + + /** Process a time slot directory. 
*/ + private void processTimeSlotDirectory( + java.nio.file.Path timeSlotPath, + String databaseName, + TConsensusGroupId consensusGroupId, + Map dataPartitionMap) { + + long timeSlotLong; + try { + timeSlotLong = Long.parseLong(timeSlotPath.getFileName().toString()); + LOG.debug("Processing time slot: {}", timeSlotLong); + } catch (NumberFormatException e) { + LOG.error("Invalid time slot directory: {}", timeSlotPath); + return; + } + + try { + // Fourth layer: .tsfile files + Files.walk(timeSlotPath) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach( + tsFilePath -> { + processTsFile( + tsFilePath.toFile(), + consensusGroupId, + timeSlotLong, + dataPartitionMap); + }); + } catch (IOException e) { + LOG.error("Failed to walk time slot directory: {}", timeSlotPath, e); + } + } + + /** Process a single tsfile. */ + private void processTsFile( + File tsFile, + TConsensusGroupId consensusGroupId, + long timeSlotId, + Map dataPartitionMap) { + try { + TsFileResource tsFileResource = new TsFileResource(tsFile.getAbsoluteFile()); + tsFileResource.deserialize(); + + Set devices = tsFileResource.getDevices(limiter); + processedFiles.incrementAndGet(); + + SeriesPartitionExecutor seriesPartitionExecutor = + SeriesPartitionExecutor.getSeriesPartitionExecutor( + seriesPartitionExecutorClass, seriesSlotNum); + + for (org.apache.tsfile.file.metadata.IDeviceID deviceId : devices) { + TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + } + + if (processedFiles.get() % 1000 == 0) { + LOG.info("Processed {} files, current: {}", processedFiles.get(), tsFile.getName()); + } + } catch (IOException e) { + 
failedFiles.incrementAndGet(); + LOG.error("Failed to process tsfile: {} -> {}", tsFile.getAbsolutePath(), e.getMessage()); + } + } + + private static SeriesPartitionTable newSeriesPartitionTable(TConsensusGroupId consensusGroupId, long timeSlotId) { + // Return an empty table: the caller's computeIfAbsent immediately registers the time slot + // itself, so pre-inserting it here would record the consensus group twice for the first slot. + return new SeriesPartitionTable(); + } + + /** Count total files for progress tracking. */ + private void countTotalFiles() throws IOException { + AtomicLong fileCount = new AtomicLong(0); + + for (String dataDirectory : dataDirectories) { + // Files.walk must be closed to release its open directory handles. + try (java.util.stream.Stream<java.nio.file.Path> paths = Files.walk(Paths.get(dataDirectory))) { + paths.filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach(p -> fileCount.incrementAndGet()); + } + } + + totalFiles.set(fileCount.get()); + LOG.info("Found {} resource files to process", totalFiles.get()); + } + + // Getters + public TaskStatus getStatus() { + return status; + } + + public String getErrorMessage() { + return errorMessage; + } + + public DataPartitionTable getDataPartitionTable() { + return dataPartitionTable; + } + + public int getProcessedFiles() { + return processedFiles.get(); + } + + public int getFailedFiles() { + return failedFiles.get(); + } + + public long getTotalFiles() { + return totalFiles.get(); + } + + public double getProgress() { + if (totalFiles.get() == 0) { + return 0.0; + } + return (double) (processedFiles.get() + failedFiles.get()) / totalFiles.get(); + } +} diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java index 9c44de9f5fdca..881e823ef2d67 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/OperationType.java @@ -55,7 +55,10 @@ public enum OperationType { WRITE_AUDIT_LOG("writeAuditLog"), PREPARE_STATEMENT("prepareStatement"), EXECUTE_PREPARED_STATEMENT("executePreparedStatement"), - DEALLOCATE_PREPARED_STATEMENT("deallocatePreparedStatement"); + DEALLOCATE_PREPARED_STATEMENT("deallocatePreparedStatement"), + GET_EARLIEST_TIMESLOTS("getEarliestTimeslots"), + GENERATE_DATA_PARTITION_TABLE("generateDataPartitionTable"), + CHECK_DATA_PARTITION_TABLE_STATUS("checkDataPartitionTableStatus"); private final String name; OperationType(String name) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 42929be741819..cf2178c3569c1 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -19,6 +19,7 @@ package org.apache.iotdb.db.protocol.thrift.impl; +import com.google.common.collect.ImmutableList; import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; @@ -61,8 +62,10 @@ import org.apache.iotdb.commons.consensus.SchemaRegionId; import org.apache.iotdb.commons.consensus.index.ProgressIndex; import org.apache.iotdb.commons.consensus.index.ProgressIndexType; +import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.path.ExtendedPartialPath; import 
org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; @@ -102,6 +105,7 @@ import org.apache.iotdb.db.consensus.SchemaRegionConsensusImpl; import org.apache.iotdb.db.exception.StorageEngineException; import org.apache.iotdb.db.exception.query.QueryProcessException; +import org.apache.iotdb.db.partition.DataPartitionTableGenerator; import org.apache.iotdb.db.pipe.agent.PipeDataNodeAgent; import org.apache.iotdb.db.protocol.client.ConfigNodeInfo; import org.apache.iotdb.db.protocol.client.cn.DnToCnInternalServiceAsyncRequestManager; @@ -260,6 +264,10 @@ import org.apache.iotdb.mpp.rpc.thrift.TFireTriggerReq; import org.apache.iotdb.mpp.rpc.thrift.TFireTriggerResp; import org.apache.iotdb.mpp.rpc.thrift.TFragmentInstanceInfoResp; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableHeartbeatResp; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.mpp.rpc.thrift.TInactiveTriggerInstanceReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateCacheReq; import org.apache.iotdb.mpp.rpc.thrift.TInvalidateColumnCacheReq; @@ -314,9 +322,10 @@ import org.apache.iotdb.service.rpc.thrift.TSInsertRecordReq; import org.apache.iotdb.trigger.api.enums.FailureStrategy; import org.apache.iotdb.trigger.api.enums.TriggerEvent; - -import com.google.common.collect.ImmutableList; import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.transport.TIOStreamTransport; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.exception.NotImplementedException; import org.apache.tsfile.read.common.TimeRange; @@ -331,9 +340,11 @@ import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; +import java.io.File; import java.io.IOException; 
import java.net.URL; import java.nio.ByteBuffer; +import java.nio.file.Files; import java.time.ZoneId; import java.util.ArrayList; import java.util.Arrays; @@ -370,7 +381,6 @@ import static org.apache.iotdb.db.utils.ErrorHandlingUtils.onQueryException; public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface { - private static final Logger LOGGER = LoggerFactory.getLogger(DataNodeInternalRPCServiceImpl.class); @@ -414,6 +424,32 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface private static final String SYSTEM = "system"; + private final ExecutorService findEarliestTimeSlotExecutor = + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName()), + ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); + + private final ExecutorService partitionTableRecoverExecutor = + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), + ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); + + private static final long timeoutMs = 600000; // 600 seconds timeout + public DataNodeInternalRPCServiceImpl() { super(); partitionFetcher = ClusterPartitionFetcher.getInstance(); @@ -3117,4 +3153,258 @@ public TSStatus writeAuditLog(TAuditLogReq req) { public void handleClientExit() { // Do nothing } + + // ==================================================== + // Data Partition 
Table Integrity Check Implementation + // ==================================================== + + private volatile DataPartitionTableGenerator currentGenerator; + private volatile long currentTaskId = 0; + + @Override + public TGetEarliestTimeslotsResp getEarliestTimeslots() { + TGetEarliestTimeslotsResp resp = new TGetEarliestTimeslotsResp(); + + try { + Map earliestTimeslots = new HashMap<>(); + + // Get data directories from configuration + String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); + + for (String dataDir : dataDirs) { + File dir = new File(dataDir); + if (dir.exists() && dir.isDirectory()) { + processDataDirectoryForEarliestTimeslots(dir, earliestTimeslots); + } + } + + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + resp.setDatabaseToEarliestTimeslot(earliestTimeslots); + + LOGGER.info("Retrieved earliest timeslots for {} databases", earliestTimeslots.size()); + + } catch (Exception e) { + LOGGER.error("Failed to get earliest timeslots", e); + resp.setStatus( + onIoTDBException( + e, + OperationType.GET_EARLIEST_TIMESLOTS, + TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); + } + + return resp; + } + + @Override + public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { + TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); + byte[] empty = new byte[0]; + + try { + // Check if there's already a task in the progress + if (currentGenerator != null + && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); + resp.setMessage("DataPartitionTable generation is already in the progress"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + // Get data directories and configuration + String[] dataDirs = 
IoTDBDescriptor.getInstance().getConfig().getDataDirs(); + if (dataDirs.length == 0) { + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("dataDirs parameter are not configured in the iotdb-system.properties"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + // Create generator for all data directories + int seriesSlotNum = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionSlotNum(); + String seriesPartitionExecutorClass = + IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); + + currentGenerator = + new DataPartitionTableGenerator( + dataDirs, partitionTableRecoverExecutor, seriesSlotNum, seriesPartitionExecutorClass); + currentTaskId = System.currentTimeMillis(); + + // Start generation synchronously for now to return the data partition table immediately + currentGenerator.startGeneration(); + + // Wait for completion (with timeout) + long startTime = System.currentTimeMillis(); + + while (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { + if (System.currentTimeMillis() - startTime > timeoutMs) { + resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); + resp.setMessage("DataPartitionTable generation timed out"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + try { + Thread.sleep(100); // Sleep for 100ms + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("DataPartitionTable generation interrupted"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + } + + // Check final status + if (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.COMPLETED) { + DataPartitionTable dataPartitionTable = 
currentGenerator.getDataPartitionTable(); + if (dataPartitionTable != null) { + ByteBuffer result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result.array()); + } + + resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); + resp.setMessage("DataPartitionTable generation completed successfully"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + + LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); + } else { + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage( + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + } + + // Clear current generator + currentGenerator = null; + } catch (Exception e) { + LOGGER.error("Failed to generate DataPartitionTable", e); + resp.setStatus( + onIoTDBException( + e, + OperationType.GENERATE_DATA_PARTITION_TABLE, + TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); + } + + return resp; + } + + @Override + public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() { + TGenerateDataPartitionTableHeartbeatResp resp = new TGenerateDataPartitionTableHeartbeatResp(); + + try { + if (currentGenerator == null) { + resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); + resp.setMessage("No DataPartitionTable generation task found"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + return resp; + } + + DataPartitionTableGenerator.TaskStatus status = currentGenerator.getStatus(); + + switch (status) { + case IN_PROGRESS: + resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); + resp.setMessage( + String.format( + "DataPartitionTable generation in progress: %.1f%%", + currentGenerator.getProgress() * 100)); + 
resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + case COMPLETED: + resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); + resp.setMessage("DataPartitionTable generation completed successfully"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + break; + case FAILED: + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + default: + resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); + resp.setMessage("Unknown task status: " + status); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + } + } catch (Exception e) { + LOGGER.error("Failed to check DataPartitionTable generation status", e); + resp.setStatus( + onIoTDBException( + e, + OperationType.CHECK_DATA_PARTITION_TABLE_STATUS, + TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); + } + + return resp; + } + + /** Process data directory to find the earliest timeslots for each database. */ + private void processDataDirectoryForEarliestTimeslots( + File dataDir, Map earliestTimeslots) { + try { + Files.list(dataDir.toPath()) + .filter(Files::isDirectory) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + + if (earliestTimeslot != Long.MAX_VALUE) { + earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to process data directory: {}", dataDir, e); + } + } + + /** Find the earliest timeslot in a database directory. 
*/ + private long findEarliestTimeslotInDatabase(File databaseDir) { + final AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); + + try { + Files.walk(databaseDir.toPath()) + .filter(Files::isDirectory) + .forEach( + regionPath -> { + findEarliestTimeSlotExecutor.submit(() -> { + try { + Files.list(regionPath) + .filter(Files::isDirectory) + .forEach(timeSlotPath -> { + String timeSlotName = timeSlotPath.getFileName().toString(); + long timeslot = Long.parseLong(timeSlotName); + if (timeslot < earliest.get()) { + earliest.set(timeslot); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to scan {}", regionPath, e); + } + }); + }); + } catch (IOException e) { + LOGGER.error("Failed to walk database directory: {}", databaseDir, e); + } + + return earliest.get(); + } + + /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ + private ByteBuffer serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + TIOStreamTransport tioStreamTransport = new TIOStreamTransport(baos)) { + TProtocol protocol = new TBinaryProtocol(tioStreamTransport); + dataPartitionTable.serialize(baos, protocol); + return ByteBuffer.wrap(baos.toByteArray()); + } catch (Exception e) { + LOGGER.error("Failed to serialize DataPartitionTable", e); + return ByteBuffer.allocate(0); + } + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java index b84cce9e8d21b..d625b753e193b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java @@ -27,6 +27,7 @@ import org.apache.iotdb.commons.pipe.datastructure.resource.PersistentResource; import 
org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TestOnly; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.conf.IoTDBConfig; import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.exception.load.PartitionViolationException; @@ -677,6 +678,10 @@ public Set getDevices() { return timeIndex.getDevices(file.getPath(), this); } + public Set getDevices(LeakyBucketRateLimiter limiter) { + return timeIndex.getDevicesByRateLimiter(file.getPath(), this, limiter); + } + public ArrayDeviceTimeIndex buildDeviceTimeIndex(IDeviceID.Deserializer deserializer) throws IOException { readLock(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index e4a812012a8e3..6cbcc48021f77 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -22,6 +22,7 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; @@ -120,6 +121,41 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc } } + @Override + public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + byte[] buffer = new byte[64 * 1024]; + tsFileResource.readLock(); + try (InputStream inputStream = + FSFactoryProducer.getFSFactory() + 
.getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { + // The first byte is VERSION_NUMBER, second byte is timeIndexType. + byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); + limiter.acquire(bytes.length); + if (bytes[1] == ARRAY_DEVICE_TIME_INDEX_TYPE) { + return ArrayDeviceTimeIndex.getDevices(inputStream); + } else { + return PlainDeviceTimeIndex.getDevices(inputStream); + } + } catch (NoSuchFileException e) { + // deleted by ttl + if (tsFileResource.isDeleted()) { + return Collections.emptySet(); + } else { + logger.error( + "Can't read file {} from disk ", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + throw new RuntimeException( + "Can't read file " + tsFilePath + TsFileResource.RESOURCE_SUFFIX + " from disk"); + } + } catch (Exception e) { + logger.error( + "Failed to get devices from tsfile: {}", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + throw new RuntimeException( + "Failed to get devices from tsfile: " + tsFilePath + TsFileResource.RESOURCE_SUFFIX); + } finally { + tsFileResource.readUnlock(); + } + } + @Override public boolean endTimeEmpty() { return endTime == Long.MIN_VALUE; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index d705a2417d7c6..7b3f047b34d92 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -20,9 +20,9 @@ package org.apache.iotdb.db.storageengine.dataregion.tsfile.timeindex; import org.apache.iotdb.commons.path.PartialPath; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import 
org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; - import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.utils.Pair; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -74,6 +74,13 @@ ITimeIndex deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali */ Set getDevices(String tsFilePath, TsFileResource tsFileResource); + /** + * get devices in TimeIndex that use inputStream + * + * @return device names + */ + Set getDevicesByRateLimiter(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + /** * @return whether end time is empty (Long.MIN_VALUE) */ diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index f4ebae2fb807e..f90f664572553 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -742,6 +742,21 @@ failure_detector_phi_acceptable_pause_in_ms=10000 # Datatype: double(percentage) disk_space_warning_threshold=0.05 +# The number of threads used for parallel scanning in the partition table recovery +# effectiveMode: restart +# Datatype: Integer +partition_table_recover_worker_num=10 + +# Limit the number of files used for parallel processing +# effectiveMode: restart +# Datatype: Integer +#partition_table_recover_process_file_num=1000 + +# Limit the number of bytes read per second from a file +# effectiveMode: restart +# Datatype: Integer +partition_table_recover_max_read_bytes_per_second=10 + #################### ### Memory Control Configuration #################### diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java index 6f9f95ca8fe88..39bc7eebfa92b 100644 --- 
a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/concurrent/ThreadName.java
@@ -202,6 +202,8 @@ public enum ThreadName {
   FILE_TIME_INDEX_RECORD("FileTimeIndexRecord"),
   BINARY_ALLOCATOR_SAMPLE_EVICTOR("BinaryAllocator-SampleEvictor"),
   BINARY_ALLOCATOR_AUTO_RELEASER("BinaryAllocator-Auto-Releaser"),
+  FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL("FindEarliestTimeSlot-Parallel-Pool"),
+  DATA_PARTITION_RECOVER_PARALLEL_POOL("DataPartitionRecover-Parallel-Pool"),

   // the unknown thread name is used for metrics
   UNKNOWN("UNKNOWN");
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java
new file mode 100644
index 0000000000000..0d0d09c182e05
--- /dev/null
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java
@@ -0,0 +1,33 @@
+package org.apache.iotdb.commons.enums;
+
+public enum DataPartitionTableGeneratorState {
+  SUCCESS(0),
+  FAILED(1),
+  IN_PROGRESS(2),
+  UNKNOWN(-1);
+
+  private final int code;
+
+  DataPartitionTableGeneratorState(int code) {
+    this.code = code;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  /**
+   * get DataPartitionTableGeneratorState by code
+   *
+   * @param code code
+   * @return DataPartitionTableGeneratorState
+   */
+  public static DataPartitionTableGeneratorState getStateByCode(int code) {
+    for (DataPartitionTableGeneratorState state : DataPartitionTableGeneratorState.values()) {
+      if (state.code == code) {
+        return state;
+      }
+    }
+    return UNKNOWN;
+  }
+}
diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java
index eb53cdb2798dd..c5dd3e401d13e 100644
--- 
a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java
+++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java
@@ -77,6 +77,12 @@ public static TTimePartitionSlot getTimePartitionSlot(long time) {
     return timePartitionSlot;
   }

+  public static TTimePartitionSlot getTimePartitionSlotByPartitionId(long partitionId) {
+    TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot();
+    timePartitionSlot.setStartTime(getTimeByPartitionId(partitionId));
+    return timePartitionSlot;
+  }
+
   public static long getTimePartitionInterval() {
     return timePartitionInterval;
   }
@@ -112,6 +118,14 @@ public static long getTimePartitionId(long time) {
         : time / timePartitionInterval - 1;
   }

+  public static long getTime(long partitionId) {
+    long time = partitionId * timePartitionInterval;
+    if (time > 0 || time % timePartitionInterval == 0) {
+      return time + timePartitionOrigin;
+    }
+    return ((partitionId + 1) * timePartitionInterval) + timePartitionOrigin;
+  }
+
   public static long getTimePartitionIdWithoutOverflow(long time) {
     BigInteger bigTime = BigInteger.valueOf(time).subtract(bigTimePartitionOrigin);
     BigInteger partitionId =
@@ -122,6 +136,18 @@ public static long getTimePartitionIdWithoutOverflow(long time) {
     return partitionId.longValue();
   }

+  public static long getTimeWithoutOverflow(long partitionId) {
+    BigInteger bigTime = bigTimePartitionInterval.multiply(BigInteger.valueOf(partitionId));
+    if (bigTime.compareTo(BigInteger.ZERO) > 0 || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) {
+      return bigTime.add(bigTimePartitionOrigin).longValue();
+    }
+    return BigInteger.valueOf(partitionId).add(BigInteger.ONE).multiply(bigTimePartitionInterval).add(bigTimePartitionOrigin).longValue();
+  }
+
+  public static long getTimeByPartitionId(long partitionId) {
+    return originMayCauseOverflow ? 
getTimeWithoutOverflow(partitionId) : getTime(partitionId); + } + public static boolean satisfyPartitionId(long startTime, long endTime, long partitionId) { long startPartition = originMayCauseOverflow @@ -134,6 +160,13 @@ public static boolean satisfyPartitionId(long startTime, long endTime, long part return startPartition <= partitionId && endPartition >= partitionId; } + public static boolean satisfyPartitionId(long startTime, long partitionId) { + long endTime = startTime >= timePartitionLowerBoundWithoutOverflow + ? Long.MAX_VALUE + : (startTime + timePartitionInterval - 1); + return satisfyPartitionId(startTime, endTime, partitionId); + } + public static boolean satisfyPartitionStartTime(Filter timeFilter, long partitionStartTime) { if (timeFilter == null) { return true; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java new file mode 100644 index 0000000000000..faff05c6ff69c --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iotdb.commons.utils.rateLimiter; + +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.LockSupport; + +/** + * A global leaky-bucket rate limiter for bytes throughput. + * Features: + * - Strict throughput limiting (no burst) + * - Smooth bandwidth shaping + * - Thread-safe + * - Fair for multi-thread + * - Low contention + */ +public class LeakyBucketRateLimiter { + /** bytes per second */ + private volatile long bytesPerSecond; + + /** start time */ + private final long startTimeNs; + + /** total consumed bytes */ + private final AtomicLong totalBytes = new AtomicLong(0); + + public LeakyBucketRateLimiter(long bytesPerSecond) { + if (bytesPerSecond <= 0) { + throw new IllegalArgumentException("bytesPerSecond must be > 0"); + } + this.bytesPerSecond = bytesPerSecond; + this.startTimeNs = System.nanoTime(); + } + + /** + * Acquire permission for reading bytes. + * + * This method will block if reading too fast. + */ + public void acquire(long bytes) { + if (bytes <= 0) { + return; + } + + long currentTotal = totalBytes.addAndGet(bytes); + + long expectedTimeNs = expectedTimeNs(currentTotal); + long now = System.nanoTime(); + + long sleepNs = expectedTimeNs - now; + + if (sleepNs > 0) { + LockSupport.parkNanos(sleepNs); + } + } + + /** + * Try acquire without blocking. + * + * @return true if allowed immediately + */ + public boolean tryAcquire(long bytes) { + if (bytes <= 0) { + return true; + } + + long currentTotal = totalBytes.addAndGet(bytes); + + long expectedTimeNs = expectedTimeNs(currentTotal); + long now = System.nanoTime(); + + if (expectedTimeNs <= now) { + return true; + } + + // rollback + totalBytes.addAndGet(-bytes); + return false; + } + + /** + * Update rate dynamically. 
+   */
+  public void setRate(long newBytesPerSecond) {
+    if (newBytesPerSecond <= 0) {
+      throw new IllegalArgumentException("bytesPerSecond must be > 0");
+    }
+    this.bytesPerSecond = newBytesPerSecond;
+  }
+
+  /**
+   * Current rate.
+   */
+  public long getRate() {
+    return bytesPerSecond;
+  }
+
+  /**
+   * Total bytes processed.
+   */
+  public long getTotalBytes() {
+    return totalBytes.get();
+  }
+
+  /**
+   * Expected time based on bytes processed.
+   */
+  private long expectedTimeNs(long totalBytes) {
+    return startTimeNs + (totalBytes * 1_000_000_000L) / bytesPerSecond;
+  }
+}
diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift
index cca7110f28d40..54479b2859875 100644
--- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift
+++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift
@@ -678,6 +678,36 @@ struct TAuditLogReq {
   11: required i32 cnId
 }

+/**
+* BEGIN: Data Partition Table Integrity Check Structures
+**/
+
+struct TGetEarliestTimeslotsResp {
+  1: required common.TSStatus status
+  2: optional map<string, i64> databaseToEarliestTimeslot
+}
+
+struct TGenerateDataPartitionTableReq {
+  1: required string database
+}
+
+struct TGenerateDataPartitionTableResp {
+  1: required common.TSStatus status
+  2: required i32 errorCode
+  3: optional string message
+  4: optional binary dataPartitionTable
+}
+
+struct TGenerateDataPartitionTableHeartbeatResp {
+  1: required common.TSStatus status
+  2: required i32 errorCode
+  3: optional string message
+}
+
+/**
+* END: Data Partition Table Integrity Check Structures
+**/
+
 /**
 * BEGIN: Used for EXPLAIN ANALYZE
 **/
@@ -1276,6 +1306,30 @@ service IDataNodeRPCService {
   * Write an audit log entry to the DataNode's AuditEventLogger
   */
   common.TSStatus writeAuditLog(TAuditLogReq req);
+
+  /**
+  * BEGIN: Data Partition Table Integrity Check
+  **/
+
+  /**
+  * Get earliest timeslot information from DataNode
+  * Returns map of database name to 
earliest timeslot id + */ + TGetEarliestTimeslotsResp getEarliestTimeslots() + + /** + * Request DataNode to generate DataPartitionTable by scanning tsfile resources + */ + TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) + + /** + * Check the status of DataPartitionTable generation task + */ + TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() + + /** + * END: Data Partition Table Integrity Check + **/ } service MPPDataExchangeService { From 1fb3ab59c27e124d4425b1d4592b99b1b966712a Mon Sep 17 00:00:00 2001 From: libo Date: Mon, 9 Mar 2026 18:54:03 +0800 Subject: [PATCH 02/39] Debugged and verified all key logic in the procedure. --- .../AsyncDataNodeHeartbeatClientPool.java | 8 - .../confignode/conf/ConfigNodeConfig.java | 10 + .../confignode/conf/ConfigNodeDescriptor.java | 7 + .../load/service/HeartbeatService.java | 3 - .../confignode/manager/node/NodeManager.java | 48 +++ ...PartitionTableIntegrityCheckProcedure.java | 332 +++++++++++------- .../iotdb/confignode/service/ConfigNode.java | 47 ++- .../DataPartitionTableGenerator.java | 100 ++++-- .../impl/DataNodeInternalRPCServiceImpl.java | 77 ++-- .../dataregion/tsfile/TsFileResource.java | 2 +- .../timeindex/ArrayDeviceTimeIndex.java | 6 + .../tsfile/timeindex/FileTimeIndex.java | 1 - .../tsfile/timeindex/ITimeIndex.java | 4 +- .../DataNodeInternalRPCServiceImplTest.java | 50 +++ .../conf/iotdb-system.properties.template | 6 + .../iotdb/commons/ServerCommandLine.java | 3 +- .../commons/utils/TimePartitionUtils.java | 6 - .../src/main/thrift/datanode.thrift | 2 +- 18 files changed, 513 insertions(+), 199 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java index d32cb5b416934..324e351302787 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/async/AsyncDataNodeHeartbeatClientPool.java @@ -63,14 +63,6 @@ public void writeAuditLog( } } - public void generateDataPartitionTableHeartbeat(TEndPoint endPoint, TDataNodeHeartbeatReq req, DataNodeHeartbeatHandler handler) { - try { - clientManager.borrowClient(endPoint).generateDataPartitionTableHeartbeat(req, handler); - } catch (Exception ignore) { - // Just ignore - } - } - private static class AsyncDataNodeHeartbeatClientPoolHolder { private static final AsyncDataNodeHeartbeatClientPool INSTANCE = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index 88e8d76001dc5..1c1485d90a0b8 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -316,6 +316,8 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; + private long partitionTableRecoverWaitAllDnUpTimeout=2000; + public ConfigNodeConfig() { // empty constructor } @@ -1275,4 +1277,12 @@ public long getFailureDetectorPhiAcceptablePauseInMs() { public void setFailureDetectorPhiAcceptablePauseInMs(long failureDetectorPhiAcceptablePauseInMs) { this.failureDetectorPhiAcceptablePauseInMs = failureDetectorPhiAcceptablePauseInMs; } + + public long getPartitionTableRecoverWaitAllDnUpTimeout() { + return partitionTableRecoverWaitAllDnUpTimeout; + } + + public void setPartitionTableRecoverWaitAllDnUpTimeout(long partitionTableRecoverWaitAllDnUpTimeout) { + this.partitionTableRecoverWaitAllDnUpTimeout = partitionTableRecoverWaitAllDnUpTimeout; + } } diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index 0ea7a278732e5..12a1f08b953e5 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -319,6 +319,13 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio "failure_detector_phi_acceptable_pause_in_ms", String.valueOf(conf.getFailureDetectorPhiAcceptablePauseInMs())))); + conf.setPartitionTableRecoverWaitAllDnUpTimeout( + Long.parseLong( + properties.getProperty( + "partition_table_recover_wait_all_dn_up_timeout", + String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout()))) + ); + String leaderDistributionPolicy = properties.getProperty("leader_distribution_policy", conf.getLeaderDistributionPolicy()); if (AbstractLeaderBalancer.GREEDY_POLICY.equals(leaderDistributionPolicy) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java index a2b1c3ed66ffd..64322da5bbb20 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/load/service/HeartbeatService.java @@ -279,9 +279,6 @@ private void pingRegisteredDataNodes( AsyncDataNodeHeartbeatClientPool.getInstance() .getDataNodeHeartBeat( dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); - AsyncDataNodeHeartbeatClientPool.getInstance() - .generateDataPartitionTableHeartbeat( - dataNodeInfo.getLocation().getInternalEndPoint(), heartbeatReq, handler); } } diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java index e3d775259d626..7a7cf3ff13290 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java @@ -352,6 +352,9 @@ public DataSet registerDataNode(TDataNodeRegisterReq req) { // Adjust the maximum RegionGroup number of each Database getClusterSchemaManager().adjustMaxRegionGroupNum(); + // Check if all DataNodes are registered and trigger integrity check if needed + checkAndTriggerIntegrityCheck(); + resp.setStatus(ClusterNodeStartUtils.ACCEPT_NODE_REGISTRATION); resp.setDataNodeId( registerDataNodePlan.getDataNodeConfiguration().getLocation().getDataNodeId()); @@ -1346,4 +1349,49 @@ private TTLManager getTTLManager() { private ExternalServiceManager getServiceManager() { return configManager.getExternalServiceManager(); } + + /** + * Check if all DataNodes are registered and running, then trigger integrity check. + * This method should be called after each DataNode registration. 
+ */ + private void checkAndTriggerIntegrityCheck() { + // Only trigger integrity check if this ConfigNode is the leader + if (!configManager.getConsensusManager().isLeader()) { + return; + } + + // Get all registered DataNodes + List registeredDataNodes = getRegisteredDataNodes(); + + // Check if all registered DataNodes are running + boolean allDataNodesRunning = registeredDataNodes.stream() + .allMatch(dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }); + + if (allDataNodesRunning && !registeredDataNodes.isEmpty()) { + LOGGER.info("All {} DataNodes are registered and running, triggering data partition table integrity check", + registeredDataNodes.size()); + + // Trigger integrity check asynchronously + try { + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + LOGGER.info("Data partition table integrity check procedure submitted successfully"); + } catch (Exception e) { + LOGGER.error("Failed to submit data partition table integrity check procedure", e); + } + } else { + LOGGER.debug("Not all DataNodes are ready yet. 
Registered: {}, Running: {}", + registeredDataNodes.size(), + (int) registeredDataNodes.stream() + .filter(dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }) + .count()); + } + } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 860f34ed1d735..5c06b29418f65 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -21,28 +21,42 @@ import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeConfiguration; +import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.SeriesPartitionTable; import org.apache.iotdb.commons.utils.TimePartitionUtils; import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; import org.apache.iotdb.confignode.client.sync.SyncDataNodeClientPool; import org.apache.iotdb.confignode.consensus.request.read.partition.GetDataPartitionPlan; +import org.apache.iotdb.confignode.consensus.request.write.partition.CreateDataPartitionPlan; import org.apache.iotdb.confignode.manager.node.NodeManager; import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv; import 
org.apache.iotdb.confignode.procedure.exception.ProcedureException; import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure; import org.apache.iotdb.confignode.procedure.state.DataPartitionTableIntegrityCheckProcedureState; import org.apache.iotdb.confignode.rpc.thrift.TTimeSlotList; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableHeartbeatResp; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.rpc.TSStatusCode; +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.transport.TIOStreamTransport; +import org.apache.thrift.transport.TTransport; +import org.apache.tsfile.utils.ReadWriteIOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; @@ -69,46 +83,36 @@ public class DataPartitionTableIntegrityCheckProcedure private static final Logger LOG = LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedure.class); - /** Error codes for DataNode responses */ - public static final int DN_ERROR_CODE_SUCCESS = 0; + private static final int MAX_RETRY_COUNT = 3; + private static final long HEART_BEAT_REQUEST_RATE = 60000; - public static final int DN_ERROR_CODE_IN_PROGRESS = 2; - public static final int DN_ERROR_CODE_FAILED = 1; - public static final int DN_ERROR_CODE_UNKNOWN = -1; + NodeManager dataNodeManager; + private List allDataNodes = new ArrayList<>(); + //============Need serialize BEGIN=============/ /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ 
private Map earliestTimeslots = new ConcurrentHashMap<>(); /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ private Map dataPartitionTables = new ConcurrentHashMap<>(); + private Set lostDataPartitionsOfDatabases = new HashSet<>(); + /** Final merged DataPartitionTable */ private DataPartitionTable finalDataPartitionTable; - /** List of DataNodes that need to generate DataPartitionTable */ - private List allDataNodes = new ArrayList<>(); - - private Set lostDataPartitionsOfDatabases; - - NodeManager dataNodeManager; - - /** Current retry attempt */ - private int retryCount = 0; - - private static final int MAX_RETRY_COUNT = 3; - - private static Set skipDnIds; - private static Set failedDnIds; + private static Set skipDnIds = new HashSet<>(); + private static Set failedDnIds = new HashSet<>(); private static ScheduledExecutorService heartBeatExecutor; - private static final long HEART_BEAT_REQUEST_RATE = 60000; + //============Need serialize END=============/ public DataPartitionTableIntegrityCheckProcedure() { super(); } @Override - protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws InterruptedException { try { // Ensure to get the real-time DataNodes in the current cluster at every step @@ -120,9 +124,10 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIn failedDnIds = new HashSet<>(); return collectEarliestTimeslots(env); case ANALYZE_MISSING_PARTITIONS: + lostDataPartitionsOfDatabases = new HashSet<>(); return analyzeMissingPartitions(env); case REQUEST_PARTITION_TABLES: - heartBeatExecutor = Executors.newScheduledThreadPool(allDataNodes.size()); + heartBeatExecutor = Executors.newScheduledThreadPool(1); return requestPartitionTables(env); case MERGE_PARTITION_TABLES: return 
mergePartitionTables(env); @@ -139,7 +144,7 @@ protected Flow executeFromState(ConfigNodeProcedureEnv env, DataPartitionTableIn } @Override - protected void rollbackState(ConfigNodeProcedureEnv env, DataPartitionTableIntegrityCheckProcedureState state) + protected void rollbackState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws IOException, InterruptedException, ProcedureException { switch (state) { case COLLECT_EARLIEST_TIMESLOTS: @@ -153,23 +158,19 @@ protected void rollbackState(ConfigNodeProcedureEnv env, DataPartitionTableInteg allDataNodes.clear(); finalDataPartitionTable = null; break; - case SUCCESS: - case FAILED: - // No cleanup needed for terminal states - break; default: throw new ProcedureException("Unknown state for rollback: " + state); } } @Override - protected DataPartitionTableIntegrityCheckProcedureState getState(int stateId) { - return null; + protected DataPartitionTableIntegrityCheckProcedureState getState(final int stateId) { + return DataPartitionTableIntegrityCheckProcedureState.values()[stateId]; } @Override - protected int getStateId(DataPartitionTableIntegrityCheckProcedureState state) { - return 0; + protected int getStateId(final DataPartitionTableIntegrityCheckProcedureState state) { + return state.ordinal(); } @Override @@ -182,7 +183,7 @@ protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { * Collect earliest timeslot information from all DataNodes. Each DataNode returns a Map where key is database name and value is the earliest timeslot id. 
*/ - private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { + private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { LOG.debug("Collecting earliest timeslots from all DataNodes..."); } @@ -194,8 +195,6 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { } // Collect earliest timeslots from all DataNodes - Map mergedEarliestTimeslots = new ConcurrentHashMap<>(); - for (TDataNodeConfiguration dataNode : allDataNodes) { try { TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() @@ -207,10 +206,14 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { } Map nodeTimeslots = resp.getDatabaseToEarliestTimeslot(); +// Map nodeTimeslots = new HashMap<>(); +// nodeTimeslots.put("test", 2927L); +// nodeTimeslots.put("root.test", 0L); +// nodeTimeslots.put("root.demo", 0L); // Merge with existing timeslots (take minimum) for (Map.Entry entry : nodeTimeslots.entrySet()) { - mergedEarliestTimeslots.merge(entry.getKey(), entry.getValue(), Math::min); + earliestTimeslots.merge(entry.getKey(), entry.getValue(), Math::min); } if (LOG.isDebugEnabled()) { @@ -229,8 +232,6 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { } } - earliestTimeslots = mergedEarliestTimeslots; - if (LOG.isDebugEnabled()) { LOG.info( "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", @@ -251,7 +252,7 @@ private Flow collectEarliestTimeslots(ConfigNodeProcedureEnv env) { /** * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions of databases need to be repaired. 
*/ - private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { + private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { LOG.debug("Analyzing missing data partitions..."); } @@ -263,23 +264,21 @@ private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { } // Find all databases that have lost data partition tables - lostDataPartitionsOfDatabases = new HashSet<>(); - for (Map.Entry entry : earliestTimeslots.entrySet()) { String database = entry.getKey(); long earliestTimeslot = entry.getValue(); // Get current DataPartitionTable from ConfigManager Map>>> - dataPartitionTable = getLocalDataPartitionTable(env, database); + localDataPartitionTable = getLocalDataPartitionTable(env, database); // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (dataPartitionTable.isEmpty() || dataPartitionTable.get(database) == null || dataPartitionTable.get(database).isEmpty()) { + if (localDataPartitionTable == null || localDataPartitionTable.isEmpty() || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { LOG.error("No data partition table related to database {} was found from the ConfigNode", database); continue; } - Map>> seriesPartitionMap = dataPartitionTable.get(database); + Map>> seriesPartitionMap = localDataPartitionTable.get(database); for (Map.Entry>> seriesPartitionEntry : seriesPartitionMap.entrySet()) { Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); @@ -292,7 +291,9 @@ private Flow analyzeMissingPartitions(ConfigNodeProcedureEnv env) { } } - if (lostDataPartitionsOfDatabases.isEmpty()) { + //@TODO simulate case that lost data partition +// if (lostDataPartitionsOfDatabases.isEmpty()) { + if (!lostDataPartitionsOfDatabases.isEmpty()) { LOG.info("No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; } @@ -305,7 +306,7 @@ private Flow 
analyzeMissingPartitions(ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - private Map>>> getLocalDataPartitionTable(ConfigNodeProcedureEnv env, String database) { + private Map>>> getLocalDataPartitionTable(final ConfigNodeProcedureEnv env, final String database) { Map> schemaPartitionTable = env.getConfigManager().getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) .getSchemaPartitionTable(); @@ -330,7 +331,7 @@ private Map finalDataPartitionMap = new HashMap<>(); + + for (String database : lostDataPartitionsOfDatabases) { + // Get current DataPartitionTable from ConfigManager + Map>>> + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - // TODO: Implement proper merging logic - // For now, use the first DataPartitionTable as the final one - if (!dataPartitionTables.isEmpty()) { - DataPartitionTable firstTable = dataPartitionTables.values().iterator().next(); - finalDataPartitionTable = firstTable; - - // In a real implementation, you would: - // 1. Merge all series partition slots from all DataNodes - // 2. For each series slot, merge time slot information - // 3. Resolve conflicts by choosing the most recent/complete data - // 4. 
Ensure consistency across all DataNodes - - LOG.info( - "Merged DataPartitionTable contains {} series partitions", - finalDataPartitionTable.getDataPartitionMap().size()); + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + continue; + } + + localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { + return; + } + finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); } + finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables.values().forEach(dataPartitionTable -> { + if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot,dnDataPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition(k, tConsensusGroupId); + })); + }); + }); + }); + + finalDataPartitionTable = new 
DataPartitionTable(finalDataPartitionMap); + LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); return Flow.HAS_MORE_STATE; @@ -478,9 +507,17 @@ private Flow mergePartitionTables(ConfigNodeProcedureEnv env) { } /** Write the final DataPartitionTable to raft log. */ - private Flow writePartitionTableToRaft(ConfigNodeProcedureEnv env) { + private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { LOG.info("Writing DataPartitionTable to raft log..."); + if (lostDataPartitionsOfDatabases.isEmpty()) { + LOG.error("No database lost data partition table"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("No database lost data partition table for raft write")); + return Flow.NO_MORE_STATE; + } + if (finalDataPartitionTable == null) { LOG.error("No DataPartitionTable to write to raft"); setFailure( @@ -489,39 +526,35 @@ private Flow writePartitionTableToRaft(ConfigNodeProcedureEnv env) { return Flow.NO_MORE_STATE; } - try { - // TODO: Implement actual raft log write - // This should create a consensus request to write the DataPartitionTable - // Example: - // WriteDataPartitionTablePlan plan = new - // WriteDataPartitionTablePlan(finalDataPartitionTable); - // env.getConfigManager().getConsensusManager().write(plan); - - // For now, simulate successful write - boolean writeSuccess = true; - - if (writeSuccess) { - LOG.info("DataPartitionTable successfully written to raft log"); - setNextState(DataPartitionTableIntegrityCheckProcedureState.SUCCESS); - return Flow.HAS_MORE_STATE; - } else { - LOG.error("Failed to write DataPartitionTable to raft log"); - setFailure( - "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("Failed to write DataPartitionTable to raft log")); - return Flow.NO_MORE_STATE; + int failedCnt = 0; + while (failedCnt < MAX_RETRY_COUNT) { + try { + CreateDataPartitionPlan createPlan = 
new CreateDataPartitionPlan(); + Map assignedDataPartition = new HashMap<>(); + assignedDataPartition.put(lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); + createPlan.setAssignedDataPartition(assignedDataPartition); + TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); + + if (tsStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOG.info("DataPartitionTable successfully written to raft log"); + break; + } else { + LOG.error("Failed to write DataPartitionTable to raft log"); + setFailure( + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("Failed to write DataPartitionTable to raft log")); + } + } catch (Exception e) { + LOG.error("Error writing DataPartitionTable to raft log", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); } - - } catch (Exception e) { - LOG.error("Error writing DataPartitionTable to raft log", e); - setFailure("DataPartitionTableIntegrityCheckProcedure", e); - return Flow.NO_MORE_STATE; + failedCnt++; } + return Flow.NO_MORE_STATE; } - // @TODO @Override - public void serialize(DataOutputStream stream) throws IOException { + public void serialize(final DataOutputStream stream) throws IOException { super.serialize(stream); // Serialize earliestTimeslots @@ -533,21 +566,45 @@ public void serialize(DataOutputStream stream) throws IOException { // Serialize dataPartitionTables count stream.writeInt(dataPartitionTables.size()); - // Note: DataPartitionTable serialization would need to be implemented here + for (Map.Entry entry : dataPartitionTables.entrySet()) { + stream.writeInt(entry.getKey()); + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + entry.getValue().serialize(oos, protocol); + } catch (IOException | TException e) { + 
LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + } + } + + stream.writeInt(lostDataPartitionsOfDatabases.size()); + for (String database : lostDataPartitionsOfDatabases) { + stream.writeUTF(database); + } + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + finalDataPartitionTable.serialize(oos, protocol); + } catch (IOException | TException e) { + LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + } - // Serialize targetDataNodes count - stream.writeInt(targetDataNodes.size()); - for (TDataNodeConfiguration dataNode : targetDataNodes) { - stream.writeInt(dataNode.getLocation().getDataNodeId()); + stream.writeInt(skipDnIds.size()); + for (int skipDnId : skipDnIds) { + stream.writeInt(skipDnId); } - // Serialize retryCount - stream.writeInt(retryCount); + stream.writeInt(failedDnIds.size()); + for (int failedDnId : failedDnIds) { + stream.writeInt(failedDnId); + } } - // @TODO @Override - public void deserialize(ByteBuffer byteBuffer) { + public void deserialize(final ByteBuffer byteBuffer) { super.deserialize(byteBuffer); // Deserialize earliestTimeslots @@ -561,18 +618,57 @@ public void deserialize(ByteBuffer byteBuffer) { // Deserialize dataPartitionTables count int dataPartitionTablesSize = byteBuffer.getInt(); - dataPartitionTables = new ConcurrentHashMap<>(); - // Note: DataPartitionTable deserialization would need to be implemented here + for (int i = 0; i < dataPartitionTablesSize; i++) { + int key = byteBuffer.getInt(); + int size = byteBuffer.getInt(); + byte[] bytes = new byte[size]; + byteBuffer.get(bytes); + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + ObjectInputStream ois = new ObjectInputStream(bais)) { + TTransport transport = new TIOStreamTransport(ois); + TBinaryProtocol protocol = new 
TBinaryProtocol(transport); + + // Deserialize by input stream and protocol + DataPartitionTable value = new DataPartitionTable(); + value.deserialize(ois, protocol); + dataPartitionTables.put(key, value); + } catch (IOException | TException e) { + LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); + throw new RuntimeException(e); + } + } - // Deserialize targetDataNodes - int targetDataNodesSize = byteBuffer.getInt(); - targetDataNodes = new ArrayList<>(); - for (int i = 0; i < targetDataNodesSize; i++) { - int dataNodeId = byteBuffer.getInt(); - // Note: TDataNodeLocation reconstruction would need to be implemented here + int lostDataPartitionsOfDatabasesSize = byteBuffer.getInt(); + for (int i = 0; i < lostDataPartitionsOfDatabasesSize; i++) { + String database = ReadWriteIOUtils.readString(byteBuffer); + lostDataPartitionsOfDatabases.add(database); } - // Deserialize retryCount - retryCount = byteBuffer.getInt(); + // Deserialize finalDataPartitionTable size + int finalDataPartitionTableSize = byteBuffer.getInt(); + byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; + byteBuffer.get(finalDataPartitionTableBytes); + try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); + ObjectInputStream ois = new ObjectInputStream(bais)) { + TTransport transport = new TIOStreamTransport(ois); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + // Deserialize by input stream and protocol + finalDataPartitionTable = new DataPartitionTable(); + finalDataPartitionTable.deserialize(ois, protocol); + } catch (IOException | TException e) { + LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); + throw new RuntimeException(e); + } + + int skipDnIdsSize = byteBuffer.getInt(); + for (int i = 0; i < skipDnIdsSize; i++) { + skipDnIds.add(byteBuffer.getInt()); + } + + int failedDnIdsSize = byteBuffer.getInt(); + for (int i = 0; i < failedDnIdsSize; i++) { + 
failedDnIds.add(byteBuffer.getInt()); + } } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 3befc7f1634f1..9e4836a089cdf 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -24,6 +24,8 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.commons.ServerCommandLine; import org.apache.iotdb.commons.client.ClientManagerMetrics; +import org.apache.iotdb.commons.cluster.NodeStatus; +import org.apache.iotdb.commons.concurrent.IoTDBThreadPoolFactory; import org.apache.iotdb.commons.concurrent.ThreadModule; import org.apache.iotdb.commons.concurrent.ThreadName; import org.apache.iotdb.commons.concurrent.ThreadPoolMetrics; @@ -78,6 +80,10 @@ import java.util.Arrays; import java.util.List; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { @@ -109,6 +115,14 @@ public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { private int exitStatusCode = 0; + private Future dataPartitionTableCheckFuture; + + private ExecutorService dataPartitionTableCheckExecutor = + IoTDBThreadPoolFactory.newSingleThreadExecutor( + "DATA_PARTITION_TABLE_CHECK"); + + private final CountDownLatch latch = new CountDownLatch(1); + public ConfigNode() { super("ConfigNode"); // We do not init anything here, so that we can re-initialize the instance in IT. 
@@ -146,6 +160,11 @@ protected void start() throws IoTDBException { } active(); LOGGER.info("IoTDB started"); + try { + dataPartitionTableCheckFuture.get(); + } catch (ExecutionException | InterruptedException e) { + LOGGER.error("Data partition table check task execute failed", e); + } } @Override @@ -203,13 +222,29 @@ public void active() { loadSecretKey(); loadHardwareCode(); - // The data partition table integrity check is only performed when the ConfigNode is the leader node - if (configManager.getConsensusManager().isLeader()) { - TSStatus status = configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error("Data partition table integrity check failed!"); + dataPartitionTableCheckFuture = dataPartitionTableCheckExecutor.submit(() -> { + LOGGER.info("Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); + Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); + + while (latch.getCount() > 0) { + List dnList = configManager + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus.Running); + if (dnList != null && !dnList.isEmpty()) { + LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); + TSStatus status = + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error("Data partition table integrity check failed!"); + } + latch.countDown(); + } else { + LOGGER.info("No running datanodes found, waiting..."); + Thread.sleep(5000); // 等待5秒后重新检查 + } } - } + return null; + }); return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 689a12bd8df89..56bc17d808b16 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -20,6 +20,7 @@ package org.apache.iotdb.db.partition; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; +import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.partition.DataPartitionTable; @@ -37,6 +38,7 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -68,6 +70,7 @@ public class DataPartitionTableGenerator { // Configuration private final String[] dataDirectories; private final ExecutorService executor; + private final Set databases; private final int seriesSlotNum; private final String seriesPartitionExecutorClass; @@ -76,15 +79,20 @@ public class DataPartitionTableGenerator { private static final LeakyBucketRateLimiter limiter = new LeakyBucketRateLimiter((long) IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverMaxReadBytesPerSecond() * 1024 * 1024); - private static final String SCAN_FILE_SUFFIX_NAME = ".resource"; + public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; + public static final Set IGNORE_DATABASE = new HashSet() {{ + add("root.__audit"); + }}; public DataPartitionTableGenerator( String dataDirectory, ExecutorService executor, + Set databases, int seriesSlotNum, String seriesPartitionExecutorClass) { this.dataDirectories = new String[]{dataDirectory}; this.executor = executor; + this.databases = databases; this.seriesSlotNum = seriesSlotNum; this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; } @@ -92,10 +100,12 @@ public DataPartitionTableGenerator( public DataPartitionTableGenerator( String[] 
dataDirectories, ExecutorService executor, + Set databases, int seriesSlotNum, String seriesPartitionExecutorClass) { this.dataDirectories = dataDirectories; this.executor = executor; + this.databases = databases; this.seriesSlotNum = seriesSlotNum; this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; } @@ -147,28 +157,42 @@ private void generateDataPartitionTable() throws IOException { // First layer: database directories Files.list(Paths.get(dataDirectory)) .filter(Files::isDirectory) - .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - LOG.debug("Processing database: {}", databaseName); - - try { - Files.list(dbPath) + .forEach(sequenceTypePath -> { + try { + Files.list(sequenceTypePath) .filter(Files::isDirectory) - .forEach( - regionPath -> { - processRegionDirectory( - regionPath, - databaseName, - dataPartitionMap, - executor, - futures); - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", dbPath, e); - failedFiles.incrementAndGet(); - } - }); + .forEach(dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Processing database: {}", databaseName); + } + + try { + Files.list(dbPath) + .filter(Files::isDirectory) + .forEach( + regionPath -> { + processRegionDirectory( + regionPath, + databaseName, + dataPartitionMap, + executor, + futures); + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", dbPath, e); + failedFiles.incrementAndGet(); + } + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", sequenceTypePath, e); + failedFiles.incrementAndGet(); + } + }); } // Wait for all tasks to complete @@ -213,6 +237,7 @@ private void processRegionDirectory( TConsensusGroupId consensusGroupId = new TConsensusGroupId(); consensusGroupId.setId(regionId); + 
consensusGroupId.setType(TConsensusGroupType.DataRegion); // Process time partitions asynchronously CompletableFuture regionFuture = @@ -252,7 +277,7 @@ private void processTimeSlotDirectory( } try { - // Fourth layer: .tsfile files + // Fourth layer: .resource files Files.walk(timeSlotPath) .filter(Files::isRegularFile) .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) @@ -313,10 +338,31 @@ private void countTotalFiles() throws IOException { AtomicLong fileCount = new AtomicLong(0); for (String dataDirectory : dataDirectories) { - Files.walk(Paths.get(dataDirectory)) - .filter(Files::isRegularFile) - .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) - .forEach(p -> fileCount.incrementAndGet()); + Files.list(Paths.get(dataDirectory)) + .filter(Files::isDirectory) + .forEach(sequenceTypePath -> { + try { + Files.list(sequenceTypePath) + .filter(Files::isDirectory) + .forEach(dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + try { + Files.walk(dbPath) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach(p -> fileCount.incrementAndGet()); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", dbPath, e); + } + }); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", sequenceTypePath, e); + } + }); } totalFiles.set(fileCount.get()); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index cf2178c3569c1..222025214532e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -324,13 +324,14 @@ import org.apache.iotdb.trigger.api.enums.TriggerEvent; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; -import org.apache.thrift.protocol.TProtocol; import org.apache.thrift.transport.TIOStreamTransport; +import org.apache.thrift.transport.TTransport; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.exception.NotImplementedException; import org.apache.tsfile.read.common.TimeRange; import org.apache.tsfile.read.common.block.TsBlock; import org.apache.tsfile.utils.Pair; +import org.apache.tsfile.utils.PublicBAOS; import org.apache.tsfile.utils.RamUsageEstimator; import org.apache.tsfile.utils.ReadWriteIOUtils; import org.apache.tsfile.write.record.Tablet; @@ -345,6 +346,7 @@ import java.net.URL; import java.nio.ByteBuffer; import java.nio.file.Files; +import java.nio.file.Path; import java.time.ZoneId; import java.util.ArrayList; import java.util.Arrays; @@ -3197,6 +3199,11 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { @Override public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { + String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); + return generateDataPartitionTable(req, dataDirs); + } + + public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req, String[] dataDirs) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); byte[] empty = new byte[0]; @@ -3212,7 +3219,6 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP } // Get data directories and configuration - String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); if (dataDirs.length == 0) { resp.setDataPartitionTable(empty); resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); @@ -3228,7 
+3234,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP currentGenerator = new DataPartitionTableGenerator( - dataDirs, partitionTableRecoverExecutor, seriesSlotNum, seriesPartitionExecutorClass); + dataDirs, partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately @@ -3237,7 +3243,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP // Wait for completion (with timeout) long startTime = System.currentTimeMillis(); - while (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { + while (currentGenerator != null && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { if (System.currentTimeMillis() - startTime > timeoutMs) { resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); resp.setMessage("DataPartitionTable generation timed out"); @@ -3261,8 +3267,8 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP if (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.COMPLETED) { DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); if (dataPartitionTable != null) { - ByteBuffer result = serializeDataPartitionTable(dataPartitionTable); - resp.setDataPartitionTable(result.array()); + byte[] result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result); } resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); @@ -3350,12 +3356,23 @@ private void processDataDirectoryForEarliestTimeslots( Files.list(dataDir.toPath()) .filter(Files::isDirectory) .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + sequenceTypePath -> { + try { + 
Files.list(sequenceTypePath) + .filter(Files::isDirectory) + .forEach(dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (DataPartitionTableGenerator.IGNORE_DATABASE.contains(databaseName)) { + return; + } + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); - if (earliestTimeslot != Long.MAX_VALUE) { - earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + if (earliestTimeslot != Long.MAX_VALUE) { + earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to process data directory: {}", sequenceTypePath.toFile(), e); } }); } catch (IOException e) { @@ -3368,7 +3385,7 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { final AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); try { - Files.walk(databaseDir.toPath()) + Files.list(databaseDir.toPath()) .filter(Files::isDirectory) .forEach( regionPath -> { @@ -3377,10 +3394,18 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { Files.list(regionPath) .filter(Files::isDirectory) .forEach(timeSlotPath -> { - String timeSlotName = timeSlotPath.getFileName().toString(); - long timeslot = Long.parseLong(timeSlotName); - if (timeslot < earliest.get()) { - earliest.set(timeslot); + try { + Optional matchedFile = Files.find(timeSlotPath, 1, (path, attrs) -> attrs.isRegularFile() && path.toString().endsWith(DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME)).findFirst(); + if (!matchedFile.isPresent()) { + return; + } + String timeSlotName = timeSlotPath.getFileName().toString(); + long timeslot = Long.parseLong(timeSlotName); + if (timeslot < earliest.get()) { + earliest.set(timeslot); + } + } catch (IOException e) { + LOGGER.error("Failed to find any {} files in the {} directory", DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, timeSlotPath, e); } }); } catch (IOException e) { @@ -3396,15 +3421,19 @@ private long findEarliestTimeslotInDatabase(File databaseDir) 
{ } /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ - private ByteBuffer serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - TIOStreamTransport tioStreamTransport = new TIOStreamTransport(baos)) { - TProtocol protocol = new TBinaryProtocol(tioStreamTransport); - dataPartitionTable.serialize(baos, protocol); - return ByteBuffer.wrap(baos.toByteArray()); - } catch (Exception e) { + private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { +// try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); +// ObjectOutputStream oos = new ObjectOutputStream(baos)) { + + try (PublicBAOS baos = new PublicBAOS(); + DataOutputStream oos = new DataOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + dataPartitionTable.serialize(oos, protocol); + return baos.getBuf(); + } catch (IOException | TException e) { LOGGER.error("Failed to serialize DataPartitionTable", e); - return ByteBuffer.allocate(0); + return ByteBuffer.allocate(0).array(); } } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java index d625b753e193b..f4a950a72afd6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java @@ -679,7 +679,7 @@ public Set getDevices() { } public Set getDevices(LeakyBucketRateLimiter limiter) { - return timeIndex.getDevicesByRateLimiter(file.getPath(), this, limiter); + return timeIndex.getDevices(file.getPath(), this, limiter); } public ArrayDeviceTimeIndex buildDeviceTimeIndex(IDeviceID.Deserializer deserializer) diff 
--git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java index 8499b6d6b3d3e..71a761a813731 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TimePartitionUtils; +import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; @@ -171,6 +172,11 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc return deviceToIndex.keySet(); } + @Override + public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + return deviceToIndex.keySet(); + } + public Map getDeviceToIndex() { return deviceToIndex; } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index 6cbcc48021f77..a0a725c85d73d 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -123,7 +123,6 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc @Override public Set getDevices(String tsFilePath, TsFileResource tsFileResource, 
LeakyBucketRateLimiter limiter) { - byte[] buffer = new byte[64 * 1024]; tsFileResource.readLock(); try (InputStream inputStream = FSFactoryProducer.getFSFactory() diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index 7b3f047b34d92..5f94703a944ba 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -75,11 +75,11 @@ ITimeIndex deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali Set getDevices(String tsFilePath, TsFileResource tsFileResource); /** - * get devices in TimeIndex that use inputStream + * get devices in TimeIndex and limit files reading rate * * @return device names */ - Set getDevicesByRateLimiter(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); /** * @return whether end time is empty (Long.MIN_VALUE) diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index 622c2c4ebbfe7..fc2c05b75b799 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -28,6 +28,7 @@ import org.apache.iotdb.commons.consensus.DataRegionId; import org.apache.iotdb.commons.consensus.SchemaRegionId; import org.apache.iotdb.commons.exception.MetadataException; +import org.apache.iotdb.commons.partition.DataPartitionTable; import 
org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.consensus.ConsensusFactory; @@ -53,6 +54,9 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; import org.apache.iotdb.db.utils.EnvironmentUtils; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; +import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; +import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.mpp.rpc.thrift.TPlanNode; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeReq; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeResp; @@ -68,6 +72,8 @@ import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; @@ -75,12 +81,16 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; public class DataNodeInternalRPCServiceImplTest { + private static final Logger LOG = + LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); private static final IoTDBConfig conf = IoTDBDescriptor.getInstance().getConfig(); DataNodeInternalRPCServiceImpl dataNodeInternalRPCServiceImpl; private static IConsensus instance; @@ -412,4 +422,44 @@ private List genSchemaRegionPeerList(TRegionReplicaSet regionReplicaSet) { } return peerList; } + + @Test + public void testGetEarliestTimeslots() { + Set lostDataPartitionsOfDatabases = new HashSet<>(); + lostDataPartitionsOfDatabases.add("root.demo"); + + TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); + req.setDatabases(lostDataPartitionsOfDatabases); + + // Use consensus layer to execute request + TGetEarliestTimeslotsResp response = 
+ dataNodeInternalRPCServiceImpl.getEarliestTimeslots(); + + Map result = new HashMap(){{ + put("test", 2927L); + put("root.test", 0L); + put("root.demo", 0L); + }}; + Assert.assertNotSame(response.getDatabaseToEarliestTimeslot(), result); + } + + @Test + public void testGenerateDataPartitionTable() { + Set lostDataPartitionsOfDatabases = new HashSet<>(); + lostDataPartitionsOfDatabases.add("root.demo"); + + TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); + req.setDatabases(lostDataPartitionsOfDatabases); + + // Use consensus layer to execute request + String[] dataDirs = new String[]{"D:\\Users\\libo\\Downloads\\muliti-iotdb\\master-iotdb-source-conf\\data\\datanode\\data"}; + TGenerateDataPartitionTableResp response = + dataNodeInternalRPCServiceImpl.generateDataPartitionTable(req, dataDirs); + + Assert.assertNotSame(response.getDataPartitionTable(), ByteBuffer.allocate(0).array()); + + DataPartitionTable dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(ByteBuffer.wrap(response.getDataPartitionTable())); + Assert.assertEquals(1, dataPartitionTable.getTimeSlotCount()); + } } diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index f90f664572553..ff9066c0dec22 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -757,6 +757,12 @@ partition_table_recover_worker_num=10 # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 +# Set a timeout to wait for all datanodes complete startup, the unit is ms300000 +# effectiveMode: restart +# Datatype: Integer +#partition_table_recover_wait_all_dn_up_timeout=300000 +partition_table_recover_wait_all_dn_up_timeout=2000 + #################### ### Memory Control Configuration 
#################### diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java index b700dbbb6b033..7901f9cc36a1d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java @@ -18,8 +18,6 @@ */ package org.apache.iotdb.commons; -import org.apache.iotdb.commons.exception.IoTDBException; - import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; @@ -28,6 +26,7 @@ import org.apache.commons.cli.OptionGroup; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; +import org.apache.iotdb.commons.exception.IoTDBException; import java.io.PrintWriter; import java.util.HashSet; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index c5dd3e401d13e..d1a550c5ca1c9 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -77,12 +77,6 @@ public static TTimePartitionSlot getTimePartitionSlot(long time) { return timePartitionSlot; } - public static TTimePartitionSlot getTimePartitionSlot(long partitionId) { - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(); - timePartitionSlot.setStartTime(getTimePartitionLowerBound(time)); - return timePartitionSlot; - } - public static long getTimePartitionInterval() { return timePartitionInterval; } diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 
54479b2859875..b248599f59cc4 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -688,7 +688,7 @@ struct TGetEarliestTimeslotsResp { } struct TGenerateDataPartitionTableReq { - 1: required string database + 1: required set databases } struct TGenerateDataPartitionTableResp { From cb5f6014e6a88587cb34c07cf841d731205ea8d4 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 10 Mar 2026 10:53:13 +0800 Subject: [PATCH 03/39] Correct dataPartitionTables and finalDataPartitionTable serialization; Adjust method that record the earliest timeslot id for every database --- ...PartitionTableIntegrityCheckProcedure.java | 73 ++++++++++++------- .../impl/DataNodeInternalRPCServiceImpl.java | 14 ++-- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 5c06b29418f65..76c575a40609d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -206,10 +206,6 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { } Map nodeTimeslots = resp.getDatabaseToEarliestTimeslot(); -// Map nodeTimeslots = new HashMap<>(); -// nodeTimeslots.put("test", 2927L); -// nodeTimeslots.put("root.test", 0L); -// nodeTimeslots.put("root.demo", 0L); // Merge with existing timeslots (take minimum) for (Map.Entry entry : nodeTimeslots.entrySet()) { @@ -388,7 +384,9 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { * Check completion status of 
DataPartitionTable generation tasks. */ private void checkPartitionTableGenerationStatus() { - LOG.info("Checking DataPartitionTable generation completion status..."); + if (LOG.isDebugEnabled()) { + LOG.info("Checking DataPartitionTable generation completion status..."); + } int completeCount = 0; for (TDataNodeConfiguration dataNode : allDataNodes) { @@ -508,7 +506,9 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { /** Write the final DataPartitionTable to raft log. */ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { - LOG.info("Writing DataPartitionTable to raft log..."); + if (LOG.isDebugEnabled()) { + LOG.info("Writing DataPartitionTable to raft log..."); + } if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.error("No database lost data partition table"); @@ -573,8 +573,14 @@ public void serialize(final DataOutputStream stream) throws IOException { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); entry.getValue().serialize(oos, protocol); + + // Write the size and data for byte array after serialize + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); } catch (IOException | TException e) { LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + throw new IOException("Failed to serialize dataPartitionTables", e); } } @@ -583,13 +589,23 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeUTF(database); } - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(oos); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - finalDataPartitionTable.serialize(oos, protocol); - } catch (IOException | TException e) { - LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + if (finalDataPartitionTable != null) { + try 
(ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos)) { + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + finalDataPartitionTable.serialize(oos, protocol); + + // Write the size and data for byte array after serialize + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (IOException | TException e) { + LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); + throw new IOException("Failed to serialize finalDataPartitionTable", e); + } + } else { + stream.writeInt(0); } stream.writeInt(skipDnIds.size()); @@ -618,6 +634,7 @@ public void deserialize(final ByteBuffer byteBuffer) { // Deserialize dataPartitionTables count int dataPartitionTablesSize = byteBuffer.getInt(); + dataPartitionTables = new HashMap<>(); for (int i = 0; i < dataPartitionTablesSize; i++) { int key = byteBuffer.getInt(); int size = byteBuffer.getInt(); @@ -646,19 +663,23 @@ public void deserialize(final ByteBuffer byteBuffer) { // Deserialize finalDataPartitionTable size int finalDataPartitionTableSize = byteBuffer.getInt(); - byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; - byteBuffer.get(finalDataPartitionTableBytes); - try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { - TTransport transport = new TIOStreamTransport(ois); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - - // Deserialize by input stream and protocol - finalDataPartitionTable = new DataPartitionTable(); - finalDataPartitionTable.deserialize(ois, protocol); - } catch (IOException | TException e) { - LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); - throw new RuntimeException(e); + if (finalDataPartitionTableSize > 0) { + byte[] finalDataPartitionTableBytes = new 
byte[finalDataPartitionTableSize]; + byteBuffer.get(finalDataPartitionTableBytes); + try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); + ObjectInputStream ois = new ObjectInputStream(bais)) { + TTransport transport = new TIOStreamTransport(ois); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + // Deserialize by input stream and protocol + finalDataPartitionTable = new DataPartitionTable(); + finalDataPartitionTable.deserialize(ois, protocol); + } catch (IOException | TException e) { + LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); + throw new RuntimeException(e); + } + } else { + finalDataPartitionTable = null; } int skipDnIdsSize = byteBuffer.getInt(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 222025214532e..2b033c2498e4d 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -450,6 +450,8 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), new ThreadPoolExecutor.CallerRunsPolicy()); + private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); + private static final long timeoutMs = 600000; // 600 seconds timeout public DataNodeInternalRPCServiceImpl() { @@ -3365,6 +3367,7 @@ private void processDataDirectoryForEarliestTimeslots( if (DataPartitionTableGenerator.IGNORE_DATABASE.contains(databaseName)) { return; } + databaseEarliestRegionMap.computeIfAbsent(databaseName, key -> Long.MAX_VALUE); long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); if (earliestTimeslot != Long.MAX_VALUE) { 
@@ -3382,7 +3385,7 @@ private void processDataDirectoryForEarliestTimeslots( /** Find the earliest timeslot in a database directory. */ private long findEarliestTimeslotInDatabase(File databaseDir) { - final AtomicLong earliest = new AtomicLong(Long.MAX_VALUE); + String databaseName = databaseDir.getName(); try { Files.list(databaseDir.toPath()) @@ -3401,8 +3404,8 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { } String timeSlotName = timeSlotPath.getFileName().toString(); long timeslot = Long.parseLong(timeSlotName); - if (timeslot < earliest.get()) { - earliest.set(timeslot); + if (timeslot < databaseEarliestRegionMap.get(databaseName)) { + databaseEarliestRegionMap.put(databaseName, timeslot); } } catch (IOException e) { LOGGER.error("Failed to find any {} files in the {} directory", DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, timeSlotPath, e); @@ -3417,14 +3420,11 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { LOGGER.error("Failed to walk database directory: {}", databaseDir, e); } - return earliest.get(); + return databaseEarliestRegionMap.get(databaseName); } /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. 
*/ private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { -// try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); -// ObjectOutputStream oos = new ObjectOutputStream(baos)) { - try (PublicBAOS baos = new PublicBAOS(); DataOutputStream oos = new DataOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); From f19d1d381efb2e86ccc801665c10b7e858151d71 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 10 Mar 2026 11:47:31 +0800 Subject: [PATCH 04/39] Correct heartbeat logic when data partition table is generating; Remove two unit testes only run successful in local environment --- ...PartitionTableIntegrityCheckProcedure.java | 15 ++++--- .../impl/DataNodeInternalRPCServiceImpl.java | 3 -- .../DataNodeInternalRPCServiceImplTest.java | 41 ------------------- 3 files changed, 10 insertions(+), 49 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 76c575a40609d..52fe407be661d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -288,8 +288,8 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } //@TODO simulate case that lost data partition -// if (lostDataPartitionsOfDatabases.isEmpty()) { - if (!lostDataPartitionsOfDatabases.isEmpty()) { + lostDataPartitionsOfDatabases.add("root.demo"); + if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.info("No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; } @@ -345,7 +345,6 @@ private Flow requestPartitionTables(final 
ConfigNodeProcedureEnv env) { if (!dataPartitionTables.containsKey(dataNodeId)) { try { TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - lostDataPartitionsOfDatabases.add("root.demo"); req.setDatabases(lostDataPartitionsOfDatabases); TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), req, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, MAX_RETRY_COUNT); @@ -396,8 +395,12 @@ private void checkPartitionTableGenerationStatus() { try { TGenerateDataPartitionTableHeartbeatResp resp = (TGenerateDataPartitionTableHeartbeatResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, MAX_RETRY_COUNT); - DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getStatus().getCode()); + DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); + if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + continue; + } switch (state) { case SUCCESS: LOG.info("DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); @@ -411,7 +414,7 @@ private void checkPartitionTableGenerationStatus() { completeCount++; break; default: - LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getStatus().getCode()); + LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); break; } } catch (Exception e) { @@ -422,6 +425,8 @@ private void checkPartitionTableGenerationStatus() { e); completeCount++; } + } else 
{ + completeCount++; } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 2b033c2498e4d..d108180029b46 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3285,9 +3285,6 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); } - - // Clear current generator - currentGenerator = null; } catch (Exception e) { LOGGER.error("Failed to generate DataPartitionTable", e); resp.setStatus( diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index fc2c05b75b799..adf276dd8e2a8 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -61,7 +61,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeReq; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeReq; - import org.apache.ratis.util.FileUtils; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.file.metadata.enums.CompressionType; @@ -422,44 +421,4 @@ private List genSchemaRegionPeerList(TRegionReplicaSet regionReplicaSet) { } return peerList; } - - @Test - public void testGetEarliestTimeslots() { - Set lostDataPartitionsOfDatabases = new 
HashSet<>(); - lostDataPartitionsOfDatabases.add("root.demo"); - - TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - req.setDatabases(lostDataPartitionsOfDatabases); - - // Use consensus layer to execute request - TGetEarliestTimeslotsResp response = - dataNodeInternalRPCServiceImpl.getEarliestTimeslots(); - - Map result = new HashMap(){{ - put("test", 2927L); - put("root.test", 0L); - put("root.demo", 0L); - }}; - Assert.assertNotSame(response.getDatabaseToEarliestTimeslot(), result); - } - - @Test - public void testGenerateDataPartitionTable() { - Set lostDataPartitionsOfDatabases = new HashSet<>(); - lostDataPartitionsOfDatabases.add("root.demo"); - - TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - req.setDatabases(lostDataPartitionsOfDatabases); - - // Use consensus layer to execute request - String[] dataDirs = new String[]{"D:\\Users\\libo\\Downloads\\muliti-iotdb\\master-iotdb-source-conf\\data\\datanode\\data"}; - TGenerateDataPartitionTableResp response = - dataNodeInternalRPCServiceImpl.generateDataPartitionTable(req, dataDirs); - - Assert.assertNotSame(response.getDataPartitionTable(), ByteBuffer.allocate(0).array()); - - DataPartitionTable dataPartitionTable = new DataPartitionTable(); - dataPartitionTable.deserialize(ByteBuffer.wrap(response.getDataPartitionTable())); - Assert.assertEquals(1, dataPartitionTable.getTimeSlotCount()); - } } From 49ef823303dfe418d4454c6a21e567e209f8b951 Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 17:52:13 +0800 Subject: [PATCH 05/39] Use StorageEngine.getInstance().getAllDataRegions() to get Data Partition Information instead of scanning data directories in the DataNode; Correct the logic that retry after the step failed; Correct skipDataNodes and failedDataNodes serialization and deserialization. 
--- ...PartitionTableIntegrityCheckProcedure.java | 228 ++++++++++++------ .../DataPartitionTableGenerator.java | 96 ++++++-- .../impl/DataNodeInternalRPCServiceImpl.java | 100 ++++---- 3 files changed, 273 insertions(+), 151 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 52fe407be661d..dbb47019e136d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -69,7 +69,6 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; /** * Procedure for checking and restoring data partition table integrity. 
This procedure scans all @@ -101,8 +100,8 @@ public class DataPartitionTableIntegrityCheckProcedure /** Final merged DataPartitionTable */ private DataPartitionTable finalDataPartitionTable; - private static Set skipDnIds = new HashSet<>(); - private static Set failedDnIds = new HashSet<>(); + private static Set skipDataNodes = new HashSet<>(); + private static Set failedDataNodes = new HashSet<>(); private static ScheduledExecutorService heartBeatExecutor; //============Need serialize END=============/ @@ -121,8 +120,8 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPart switch (state) { case COLLECT_EARLIEST_TIMESLOTS: - failedDnIds = new HashSet<>(); - return collectEarliestTimeslots(env); + failedDataNodes = new HashSet<>(); + return collectEarliestTimeslots(); case ANALYZE_MISSING_PARTITIONS: lostDataPartitionsOfDatabases = new HashSet<>(); return analyzeMissingPartitions(env); @@ -175,7 +174,8 @@ protected int getStateId(final DataPartitionTableIntegrityCheckProcedureState st @Override protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { - skipDnIds = new HashSet<>(); + skipDataNodes = new HashSet<>(); + failedDataNodes = new HashSet<>(); return DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS; } @@ -183,7 +183,7 @@ protected DataPartitionTableIntegrityCheckProcedureState getInitialState() { * Collect earliest timeslot information from all DataNodes. Each DataNode returns a Map where key is database name and value is the earliest timeslot id. 
*/ - private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { + private Flow collectEarliestTimeslots() { if (LOG.isDebugEnabled()) { LOG.debug("Collecting earliest timeslots from all DataNodes..."); } @@ -195,12 +195,13 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { } // Collect earliest timeslots from all DataNodes + allDataNodes.removeAll(skipDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { try { TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, MAX_RETRY_COUNT); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); LOG.error("Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); continue; } @@ -224,7 +225,7 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { dataNode.getLocation().getDataNodeId(), e.getMessage(), e); - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); } } @@ -233,11 +234,10 @@ private Flow collectEarliestTimeslots(final ConfigNodeProcedureEnv env) { "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", allDataNodes.size(), earliestTimeslots, - allDataNodes.size() - failedDnIds.size()); + allDataNodes.size() - failedDataNodes.size()); } - Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); - if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + if (failedDataNodes.size() == allDataNodes.size() && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) 
{ setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); } else { setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); @@ -287,8 +287,6 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } } - //@TODO simulate case that lost data partition - lostDataPartitionsOfDatabases.add("root.demo"); if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.info("No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; @@ -297,7 +295,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { LOG.info( "Identified {} databases have lost data partitions, will request DataPartitionTable generation from {} DataNodes", lostDataPartitionsOfDatabases.size(), - allDataNodes.size() - failedDnIds.size()); + allDataNodes.size() - failedDataNodes.size()); setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); return Flow.HAS_MORE_STATE; } @@ -340,6 +338,8 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { heartBeatExecutor.scheduleAtFixedRate(this::checkPartitionTableGenerationStatus, 0, HEART_BEAT_REQUEST_RATE, TimeUnit.MILLISECONDS); + allDataNodes.removeAll(skipDataNodes); + allDataNodes.removeAll(failedDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { int dataNodeId = dataNode.getLocation().getDataNodeId(); if (!dataPartitionTables.containsKey(dataNodeId)) { @@ -349,7 +349,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), req, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, MAX_RETRY_COUNT); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + 
failedDataNodes.add(dataNode); LOG.error("Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); continue; } @@ -359,7 +359,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); dataPartitionTables.put(dataNodeId, dataPartitionTable); } catch (Exception e) { - failedDnIds.add(dataNode.getLocation().getDataNodeId()); + failedDataNodes.add(dataNode); LOG.error( "Failed to request DataPartitionTable generation from DataNode[id={}]: {}", dataNodeId, @@ -369,8 +369,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { } } - Set allDnIds = allDataNodes.stream().map(dataNodeConfiguration -> dataNodeConfiguration.getLocation().getDataNodeId()).collect(Collectors.toSet()); - if (failedDnIds.size() == allDataNodes.size() && allDnIds.containsAll(failedDnIds)) { + if (failedDataNodes.size() == allDataNodes.size() && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -398,7 +397,7 @@ private void checkPartitionTableGenerationStatus() { DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", dataNode.getLocation().getDataNodeId(), state, resp.getStatus()); continue; } switch (state) { @@ -449,64 +448,70 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - 
try { - Map finalDataPartitionMap = new HashMap<>(); + int failedCnt = 0; + while (failedCnt < MAX_RETRY_COUNT) { + try { + Map finalDataPartitionMap = new HashMap<>(); - for (String database : lostDataPartitionsOfDatabases) { - // Get current DataPartitionTable from ConfigManager - Map>>> - localDataPartitionTableMap = getLocalDataPartitionTable(env, database); + for (String database : lostDataPartitionsOfDatabases) { + // Get current DataPartitionTable from ConfigManager + Map>>> + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.error("No data partition table related to database {} was found from the ConfigNode", database); - continue; - } - - localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { - if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { - return; + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + continue; } - finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); - })); - } - finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables.values().forEach(dataPartitionTable -> { - if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || 
dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot,dnDataPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { return; } + finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); + } - if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); + finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables.values().forEach(dataPartitionTable -> { + if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; } - - // dnDataPartitionTable merged to seriesPartitionTable - dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { - if (seriesPartitionTable == null) { + dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot, dnDataPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { return; } - seriesPartitionTable.putDataPartition(k, tConsensusGroupId); - })); - }); - }); - }); - finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); + } - LOG.info("DataPartitionTable merge completed successfully"); - 
setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); - return Flow.HAS_MORE_STATE; + // dnDataPartitionTable merged to seriesPartitionTable + dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition(k, tConsensusGroupId); + })); + }); + }); + }); - } catch (Exception e) { - LOG.error("Failed to merge DataPartitionTables", e); - setFailure("DataPartitionTableIntegrityCheckProcedure", e); - return Flow.NO_MORE_STATE; + finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + break; + } catch (Exception e) { + LOG.error("Failed to merge DataPartitionTables", e); + setFailure("DataPartitionTableIntegrityCheckProcedure", e); + failedCnt++; + if (failedCnt >= MAX_RETRY_COUNT) { + return Flow.NO_MORE_STATE; + } + } } + + LOG.info("DataPartitionTable merge completed successfully"); + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + return Flow.HAS_MORE_STATE; } /** Write the final DataPartitionTable to raft log. 
*/ @@ -520,7 +525,7 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("No database lost data partition table for raft write")); - return Flow.NO_MORE_STATE; + return getFlow(); } if (finalDataPartitionTable == null) { @@ -528,7 +533,7 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("No DataPartitionTable available for raft write")); - return Flow.NO_MORE_STATE; + return getFlow(); } int failedCnt = 0; @@ -555,7 +560,20 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { } failedCnt++; } - return Flow.NO_MORE_STATE; + + return getFlow(); + } + + private Flow getFlow() { + if (!failedDataNodes.isEmpty()) { + allDataNodes.removeAll(failedDataNodes); + skipDataNodes = new HashSet<>(allDataNodes); + setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); + return Flow.HAS_MORE_STATE; + } else { + skipDataNodes.clear(); + return Flow.NO_MORE_STATE; + } } @Override @@ -613,14 +631,36 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(0); } - stream.writeInt(skipDnIds.size()); - for (int skipDnId : skipDnIds) { - stream.writeInt(skipDnId); + stream.writeInt(skipDataNodes.size()); + for (TDataNodeConfiguration skipDataNode : skipDataNodes) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + TTransport transport = new TIOStreamTransport(baos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + skipDataNode.write(protocol); + + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (TException e) { + LOG.error("Failed to serialize skipDataNode", e); + throw new IOException("Failed to serialize skipDataNode", e); + } } - stream.writeInt(failedDnIds.size()); - for (int failedDnId : 
failedDnIds) { - stream.writeInt(failedDnId); + stream.writeInt(failedDataNodes.size()); + for (TDataNodeConfiguration failedDataNode : failedDataNodes) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { + TTransport transport = new TIOStreamTransport(baos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + failedDataNode.write(protocol); + + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (TException e) { + LOG.error("Failed to serialize failedDataNode", e); + throw new IOException("Failed to serialize failedDataNode", e); + } } } @@ -687,14 +727,44 @@ public void deserialize(final ByteBuffer byteBuffer) { finalDataPartitionTable = null; } - int skipDnIdsSize = byteBuffer.getInt(); - for (int i = 0; i < skipDnIdsSize; i++) { - skipDnIds.add(byteBuffer.getInt()); + skipDataNodes = new HashSet<>(); + int skipDataNodesSize = byteBuffer.getInt(); + for (int i = 0; i < skipDataNodesSize; i++) { + int size = byteBuffer.getInt(); + byte[] bytes = new byte[size]; + byteBuffer.get(bytes); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes)) { + TTransport transport = new TIOStreamTransport(bais); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + TDataNodeConfiguration dataNode = new TDataNodeConfiguration(); + dataNode.read(protocol); + skipDataNodes.add(dataNode); + } catch (TException | IOException e) { + LOG.error("Failed to deserialize skipDataNode", e); + throw new RuntimeException(e); + } } - int failedDnIdsSize = byteBuffer.getInt(); - for (int i = 0; i < failedDnIdsSize; i++) { - failedDnIds.add(byteBuffer.getInt()); + failedDataNodes = new HashSet<>(); + int failedDataNodesSize = byteBuffer.getInt(); + for (int i = 0; i < failedDataNodesSize; i++) { + int size = byteBuffer.getInt(); + byte[] bytes = new byte[size]; + byteBuffer.get(bytes); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes)) { + TTransport transport = new 
TIOStreamTransport(bais); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + TDataNodeConfiguration dataNode = new TDataNodeConfiguration(); + dataNode.read(protocol); + failedDataNodes.add(dataNode); + } catch (TException | IOException e) { + LOG.error("Failed to deserialize failedDataNode", e); + throw new RuntimeException(e); + } } } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 56bc17d808b16..4ce321e9a536c 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -29,7 +29,11 @@ import org.apache.iotdb.commons.utils.TimePartitionUtils; import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.conf.IoTDBDescriptor; +import org.apache.iotdb.db.storageengine.StorageEngine; +import org.apache.iotdb.db.storageengine.dataregion.DataRegion; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,7 +72,7 @@ public class DataPartitionTableGenerator { private final AtomicLong totalFiles = new AtomicLong(0); // Configuration - private final String[] dataDirectories; + private String[] dataDirectories; private final ExecutorService executor; private final Set databases; private final int seriesSlotNum; @@ -84,6 +88,17 @@ public class DataPartitionTableGenerator { add("root.__audit"); }}; + public DataPartitionTableGenerator( + ExecutorService executor, + Set databases, + int seriesSlotNum, + String seriesPartitionExecutorClass) { + this.executor = executor; + this.databases = databases; + 
this.seriesSlotNum = seriesSlotNum; + this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; + } + public DataPartitionTableGenerator( String dataDirectory, ExecutorService executor, @@ -117,25 +132,78 @@ public enum TaskStatus { FAILED } - /** Start generating DataPartitionTable asynchronously. */ - public void startGeneration() { + /** + * Start generating DataPartitionTable asynchronously. + * + */ + public CompletableFuture startGeneration() { if (status != TaskStatus.NOT_STARTED) { throw new IllegalStateException("Task is already started or completed"); } status = TaskStatus.IN_PROGRESS; + return CompletableFuture.runAsync(this::generateDataPartitionTableByMemory); + } + + private void generateDataPartitionTableByMemory() { + Map dataPartitionMap = new ConcurrentHashMap<>(); + List> futures = new ArrayList<>(); + + SeriesPartitionExecutor seriesPartitionExecutor = + SeriesPartitionExecutor.getSeriesPartitionExecutor( + seriesPartitionExecutorClass, seriesSlotNum); + + for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { + CompletableFuture regionFuture = + CompletableFuture.runAsync( + () -> { + TsFileManager tsFileManager = dataRegion.getTsFileManager(); + String databaseName = dataRegion.getDatabaseName(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } - CompletableFuture.runAsync( - () -> { - try { - generateDataPartitionTable(); - status = TaskStatus.COMPLETED; - } catch (Exception e) { - LOG.error("Failed to generate DataPartitionTable", e); - errorMessage = e.getMessage(); - status = TaskStatus.FAILED; - } - }); + tsFileManager.readLock(); + List seqTsFileList = tsFileManager.getTsFileList(true); + List unseqTsFileList = tsFileManager.getTsFileList(false); + tsFileManager.readUnlock(); + + constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); + constructDataPartitionMap(unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); + }, 
+ executor); + futures.add(regionFuture); + } + + // Wait for all tasks to complete + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + + if (dataPartitionMap.isEmpty()) { + LOG.error("Failed to generate DataPartitionTable, dataPartitionMap is empty"); + status = TaskStatus.FAILED; + return; + } + + dataPartitionTable = new DataPartitionTable(dataPartitionMap); + status = TaskStatus.COMPLETED; + } + + private static void constructDataPartitionMap(List seqTsFileList, SeriesPartitionExecutor seriesPartitionExecutor, Map dataPartitionMap) { + for (TsFileResource tsFileResource : seqTsFileList) { + Set devices = tsFileResource.getDevices(limiter); + long timeSlotId = tsFileResource.getTsFileID().timePartitionId; + int regionId = tsFileResource.getTsFileID().regionId; + + TConsensusGroupId consensusGroupId = new TConsensusGroupId(); + consensusGroupId.setId(regionId); + consensusGroupId.setType(TConsensusGroupType.DataRegion); + + for (IDeviceID deviceId : devices) { + TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + } + } } /** Generate DataPartitionTable by scanning all resource files. 
*/ diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index d108180029b46..6e5b615cc1b27 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3201,11 +3201,6 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { @Override public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { - String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); - return generateDataPartitionTable(req, dataDirs); - } - - public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req, String[] dataDirs) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); byte[] empty = new byte[0]; @@ -3220,71 +3215,50 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP return resp; } - // Get data directories and configuration - if (dataDirs.length == 0) { - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("dataDirs parameter are not configured in the iotdb-system.properties"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - return resp; - } - // Create generator for all data directories int seriesSlotNum = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionSlotNum(); String seriesPartitionExecutorClass = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); currentGenerator = - new DataPartitionTableGenerator( - dataDirs, partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); + new 
DataPartitionTableGenerator(partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately - currentGenerator.startGeneration(); + currentGenerator.startGeneration().get(timeoutMs, TimeUnit.MILLISECONDS); + + if (currentGenerator != null) { + switch (currentGenerator.getStatus()) { + case IN_PROGRESS: + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage("DataPartitionTable generation interrupted"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; + case COMPLETED: + DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); + if (dataPartitionTable != null) { + byte[] result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result); + } - // Wait for completion (with timeout) - long startTime = System.currentTimeMillis(); - - while (currentGenerator != null && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { - if (System.currentTimeMillis() - startTime > timeoutMs) { - resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); - resp.setMessage("DataPartitionTable generation timed out"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - return resp; - } - - try { - Thread.sleep(100); // Sleep for 100ms - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("DataPartitionTable generation interrupted"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - return resp; + resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); + resp.setMessage("DataPartitionTable generation completed 
successfully"); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); + break; + default: + resp.setDataPartitionTable(empty); + resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); + resp.setMessage( + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + break; } } - // Check final status - if (currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.COMPLETED) { - DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); - if (dataPartitionTable != null) { - byte[] result = serializeDataPartitionTable(dataPartitionTable); - resp.setDataPartitionTable(result); - } - - resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); - resp.setMessage("DataPartitionTable generation completed successfully"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); - - LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); - } else { - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage( - "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - } + // Clear current generator + currentGenerator = null; } catch (Exception e) { LOGGER.error("Failed to generate DataPartitionTable", e); resp.setStatus( @@ -3383,13 +3357,14 @@ private void processDataDirectoryForEarliestTimeslots( /** Find the earliest timeslot in a database directory. 
*/ private long findEarliestTimeslotInDatabase(File databaseDir) { String databaseName = databaseDir.getName(); + List> futureList = new ArrayList<>(); try { Files.list(databaseDir.toPath()) .filter(Files::isDirectory) .forEach( regionPath -> { - findEarliestTimeSlotExecutor.submit(() -> { + Future future = findEarliestTimeSlotExecutor.submit(() -> { try { Files.list(regionPath) .filter(Files::isDirectory) @@ -3412,11 +3387,20 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { LOGGER.error("Failed to scan {}", regionPath, e); } }); + futureList.add(future); }); } catch (IOException e) { LOGGER.error("Failed to walk database directory: {}", databaseDir, e); } + for (Future future : futureList) { + try { + future.get(); + } catch (InterruptedException | ExecutionException e) { + LOGGER.error("Failed to wait for task completion", e); + Thread.currentThread().interrupt(); + } + } return databaseEarliestRegionMap.get(databaseName); } From 3960a0716952be6f41a8407f8b4ad5f889e75cfc Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 17:58:22 +0800 Subject: [PATCH 06/39] Adjust the default value is 1 min --- .../org/apache/iotdb/confignode/conf/ConfigNodeConfig.java | 2 +- .../assembly/resources/conf/iotdb-system.properties.template | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c549e1347bc7c..c284762fc10c4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout=2000; + private long partitionTableRecoverWaitAllDnUpTimeout=60000; public 
ConfigNodeConfig() { // empty constructor diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index ff9066c0dec22..9dc55d3903261 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -757,11 +757,10 @@ partition_table_recover_worker_num=10 # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 -# Set a timeout to wait for all datanodes complete startup, the unit is ms300000 +# Set a timeout to wait for all datanodes complete startup, the unit is ms # effectiveMode: restart # Datatype: Integer -#partition_table_recover_wait_all_dn_up_timeout=300000 -partition_table_recover_wait_all_dn_up_timeout=2000 +partition_table_recover_wait_all_dn_up_timeout=60000 #################### ### Memory Control Configuration From 98a18c3fdcc8812ccc3bdf03c25473eee655178a Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 17:58:22 +0800 Subject: [PATCH 07/39] Adjust the default value is 1 min --- DataPartitionTableIntegrityCheck_README.md | 245 ------------------ .../confignode/conf/ConfigNodeConfig.java | 2 +- .../conf/iotdb-system.properties.template | 5 +- 3 files changed, 3 insertions(+), 249 deletions(-) delete mode 100644 DataPartitionTableIntegrityCheck_README.md diff --git a/DataPartitionTableIntegrityCheck_README.md b/DataPartitionTableIntegrityCheck_README.md deleted file mode 100644 index 7fe3eefabb041..0000000000000 --- a/DataPartitionTableIntegrityCheck_README.md +++ /dev/null @@ -1,245 +0,0 @@ -# IoTDB 数据分区表完整性检测功能实现 - -## 功能概述 - -本功能实现了IoTDB ConfigNode重启时的数据分区表完整性检测,能够自动发现并恢复丢失的数据分区信息。 - -## 实现架构 - -### 1. 
核心组件 - -#### Procedure实现 -- **DataPartitionTableIntegrityCheckProcedure**: 主要的Procedure实现,负责整个完整性检测流程 -- **ConfigNodeProcedureEnv**: Procedure执行环境,提供ConfigManager访问 - -#### DataNode端实现 -- **DataPartitionTableGenerator**: 扫描tsfile并生成DataPartitionTable的核心组件 -- **RPC接口扩展**: 在DataNode RPC服务中添加了三个新接口 - -#### 配置和注册 -- **ProcedureType枚举扩展**: 添加了新的Procedure类型 -- **ProcedureFactory扩展**: 支持新Procedure的创建和反序列化 -- **启动监听器**: ConfigNode启动时自动触发检测 - -### 2. 执行流程 - -``` -ConfigNode重启 → 检查Leader状态 → 收集最早timeslot → 分析缺失分区 → -请求DN生成表 → 合并分区表 → 写入Raft日志 → 完成 -``` - -## 详细实现 - -### 1. Thrift接口定义 (datanode.thrift) - -新增的RPC接口: -```thrift -// 获取最早timeslot信息 -TGetEarliestTimeslotsResp getEarliestTimeslots() - -// 请求生成DataPartitionTable -TGenerateDataPartitionTableResp generateDataPartitionTable() - -// 检查生成状态 -TCheckDataPartitionTableStatusResp checkDataPartitionTableStatus() -``` - -对应的响应结构体: -```thrift -struct TGetEarliestTimeslotsResp { - 1: required common.TSStatus status - 2: optional map databaseToEarliestTimeslot -} - -struct TGenerateDataPartitionTableResp { - 1: required common.TSStatus status - 2: required i32 errorCode - 3: optional string message -} - -struct TCheckDataPartitionTableStatusResp { - 1: required common.TSStatus status - 2: required i32 errorCode - 3: optional string message - 4: optional binary dataPartitionTable -} -``` - -### 2. DataNode实现 - -#### DataPartitionTableGenerator -- **并行扫描**: 使用多线程并行扫描tsfile文件 -- **进度跟踪**: 提供处理进度和状态信息 -- **错误处理**: 统计失败文件并记录错误信息 -- **配置化**: 支持自定义线程数和分区配置 - -#### RPC服务实现 -在`DataNodeInternalRPCServiceImpl`中实现: -- `getEarliestTimeslots()`: 扫描数据目录获取每个数据库的最早timeslot -- `generateDataPartitionTable()`: 启动异步扫描任务 -- `checkDataPartitionTableStatus()`: 检查任务状态并返回结果 - -### 3. 
ConfigNode Procedure实现 - -#### 状态机设计 -```java -public enum State { - CHECK_LEADER_STATUS, // 检查Leader状态 - COLLECT_EARLIEST_TIMESLOTS, // 收集最早timeslot - ANALYZE_MISSING_PARTITIONS, // 分析缺失分区 - REQUEST_PARTITION_TABLES, // 请求生成分区表 - MERGE_PARTITION_TABLES, // 合并分区表 - WRITE_PARTITION_TABLE_TO_RAFT, // 写入Raft日志 - SUCCESS, // 成功完成 - FAILED // 执行失败 -} -``` - -#### 错误码定义 -```java -public static final int DN_ERROR_CODE_SUCCESS = 0; // 处理成功 -public static final int DN_ERROR_CODE_IN_PROGRESS = 2; // 正在执行 -public static final int DN_ERROR_CODE_FAILED = 1; // 处理失败 -public static final int DN_ERROR_CODE_UNKNOWN = -1; // DN未知状态 -``` - -#### 核心逻辑 -1. **Leader检查**: 只有Leader节点执行检测 -2. **数据收集**: 从所有DataNode收集最早timeslot信息 -3. **缺失分析**: 对比当前分区表,识别缺失的分区 -4. **异步处理**: 向DataNode发送异步扫描请求 -5. **状态轮询**: 定期检查任务状态,支持重试机制 -6. **数据合并**: 合并所有DataNode返回的分区表 -7. **Raft写入**: 通过共识层持久化最终分区表 - -### 4. 自动触发机制 - -#### 启动监听器 -```java -public class DataPartitionTableIntegrityCheckListener { - public void onStartupComplete() { - if (isLeader()) { - startIntegrityCheck(); - } - } - - public void onBecomeLeader() { - startIntegrityCheck(); - } -} -``` - -## 关键特性 - -### 1. 原子性保证 -- 每个步骤都是幂等的,支持重试 -- Procedure框架保证状态一致性 -- 失败时可以安全回滚 - -### 2. 容错机制 -- **重试策略**: 最多重试3次 -- **超时处理**: 避免无限等待 -- **部分失败**: 部分DataNode失败时继续处理 - -### 3. 性能优化 -- **并行扫描**: DataNode端使用多线程并行处理 -- **异步执行**: 避免阻塞主流程 -- **进度跟踪**: 提供实时进度信息 - -### 4. 可扩展性 -- **配置化**: 支持自定义线程数和分区配置 -- **模块化**: 各组件独立,易于扩展 -- **接口化**: 清晰的RPC接口定义 - -## 使用方式 - -### 1. 自动触发 -ConfigNode重启时自动检测并执行,无需手动干预。 - -### 2. 
手动触发 -可以通过ProcedureExecutor手动提交检测Procedure: -```java -DataPartitionTableIntegrityCheckProcedure procedure = new DataPartitionTableIntegrityCheckProcedure(); -procedureExecutor.submit(procedure); -``` - -## 配置参数 - -### DataNode配置 -- `seriesSlotNum`: 系列分区槽数量 -- `seriesPartitionExecutorClass`: 分区执行器类名 -- `dataDirs`: 数据目录配置 - -### Procedure配置 -- `MAX_RETRY_COUNT`: 最大重试次数 (默认3) -- 重试间隔: 5秒 - -## 监控和日志 - -### 日志级别 -- **INFO**: 关键流程节点信息 -- **DEBUG**: 详细的执行过程 -- **ERROR**: 错误和异常信息 - -### 关键指标 -- 处理文件数量 -- 失败文件数量 -- 执行时间 -- 重试次数 -- DataNode响应状态 - -## 注意事项 - -### 1. 依赖关系 -- 需要ConfigNode为Leader状态 -- 依赖DataNode正常注册和通信 -- 需要共识层正常工作 - -### 2. 资源消耗 -- DataNode扫描会消耗CPU和I/O资源 -- 建议在低峰期执行 -- 大数据集时需要考虑内存使用 - -### 3. 网络带宽 -- DataPartitionTable序列化后可能较大 -- 需要考虑网络传输限制 -- 建议实现增量传输机制 - -## 后续优化建议 - -### 1. 增量扫描 -- 支持增量扫描,只处理新增文件 -- 维护扫描状态,避免重复工作 - -### 2. 分布式协调 -- 实现更智能的负载分配 -- 支持动态调整扫描策略 - -### 3. 缓存优化 -- 缓存扫描结果,避免重复计算 -- 实现智能失效机制 - -### 4. 监控增强 -- 添加更详细的性能指标 -- 实现告警机制 - -## 测试验证 - -### 1. 单元测试 -- 各组件独立测试 -- 边界条件测试 -- 异常场景验证 - -### 2. 集成测试 -- 端到端流程测试 -- 多节点环境验证 -- 故障恢复测试 - -### 3. 
性能测试 -- 大数据集扫描测试 -- 并发性能测试 -- 资源使用监控 - ---- - -本实现提供了完整的IoTDB数据分区表完整性检测解决方案,具备高可用性、容错性和可扩展性,能够在ConfigNode重启时自动发现并恢复丢失的数据分区信息。 diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c549e1347bc7c..c284762fc10c4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout=2000; + private long partitionTableRecoverWaitAllDnUpTimeout=60000; public ConfigNodeConfig() { // empty constructor diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index ff9066c0dec22..9dc55d3903261 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -757,11 +757,10 @@ partition_table_recover_worker_num=10 # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 -# Set a timeout to wait for all datanodes complete startup, the unit is ms300000 +# Set a timeout to wait for all datanodes complete startup, the unit is ms # effectiveMode: restart # Datatype: Integer -#partition_table_recover_wait_all_dn_up_timeout=300000 -partition_table_recover_wait_all_dn_up_timeout=2000 +partition_table_recover_wait_all_dn_up_timeout=60000 #################### ### Memory Control Configuration From d0882ee513d2dfafa580eafc7545d14b733666e8 Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 11 Mar 2026 18:12:06 +0800 Subject: [PATCH 08/39] Append a description about the unit --- 
.../resources/conf/iotdb-system.properties.template | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index 9dc55d3903261..c36f35cd5778c 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -747,12 +747,7 @@ disk_space_warning_threshold=0.05 # Datatype: Integer partition_table_recover_worker_num=10 -# Limit the number of files used for parallel processing -# effectiveMode: restart -# Datatype: Integer -#partition_table_recover_process_file_num=1000 - -# Limit the number of bytes read per second from a file +# Limit the number of bytes read per second from a file, the unit is MB # effectiveMode: restart # Datatype: Integer partition_table_recover_max_read_bytes_per_second=10 From 9a53bd25ea0e7507fa81e1ff05bcb2380953273e Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 12 Mar 2026 18:12:23 +0800 Subject: [PATCH 09/39] use the spotless command to format code --- .../client/sync/SyncDataNodeClientPool.java | 12 +- .../confignode/conf/ConfigNodeConfig.java | 5 +- .../confignode/conf/ConfigNodeDescriptor.java | 5 +- .../confignode/manager/ProcedureManager.java | 4 +- .../confignode/manager/node/NodeManager.java | 49 +-- ...PartitionTableIntegrityCheckProcedure.java | 301 ++++++++++++------ .../iotdb/confignode/service/ConfigNode.java | 54 ++-- .../apache/iotdb/db/conf/IoTDBDescriptor.java | 10 +- .../DataPartitionTableGenerator.java | 241 +++++++------- .../impl/DataNodeInternalRPCServiceImpl.java | 137 ++++---- .../timeindex/ArrayDeviceTimeIndex.java | 3 +- .../tsfile/timeindex/FileTimeIndex.java | 15 +- .../tsfile/timeindex/ITimeIndex.java | 4 +- .../DataNodeInternalRPCServiceImplTest.java | 9 +- .../iotdb/commons/ServerCommandLine.java | 3 +- 
.../commons/utils/TimePartitionUtils.java | 12 +- .../rateLimiter/LeakyBucketRateLimiter.java | 27 +- 17 files changed, 525 insertions(+), 366 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java index 84c027e513298..9f5729ef06dfd 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/client/sync/SyncDataNodeClientPool.java @@ -141,14 +141,14 @@ private void buildActionMap() { CnToDnSyncRequestType.SHOW_APPLIED_CONFIGURATIONS, (req, client) -> client.showAppliedConfigurations()); actionMapBuilder.put( - CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, - (req, client) -> client.getEarliestTimeslots()); + CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, + (req, client) -> client.getEarliestTimeslots()); actionMapBuilder.put( - CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, - (req, client) -> client.generateDataPartitionTable((TGenerateDataPartitionTableReq) req)); + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE, + (req, client) -> client.generateDataPartitionTable((TGenerateDataPartitionTableReq) req)); actionMapBuilder.put( - CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, - (req, client) -> client.generateDataPartitionTableHeartbeat()); + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + (req, client) -> client.generateDataPartitionTableHeartbeat()); actionMap = actionMapBuilder.build(); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c284762fc10c4..c682107698a3a 100644 --- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout=60000; + private long partitionTableRecoverWaitAllDnUpTimeout = 60000; public ConfigNodeConfig() { // empty constructor @@ -1293,7 +1293,8 @@ public long getPartitionTableRecoverWaitAllDnUpTimeout() { return partitionTableRecoverWaitAllDnUpTimeout; } - public void setPartitionTableRecoverWaitAllDnUpTimeout(long partitionTableRecoverWaitAllDnUpTimeout) { + public void setPartitionTableRecoverWaitAllDnUpTimeout( + long partitionTableRecoverWaitAllDnUpTimeout) { this.partitionTableRecoverWaitAllDnUpTimeout = partitionTableRecoverWaitAllDnUpTimeout; } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index 17ec570b6d7d0..e7d39fd3bcb87 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -325,9 +325,8 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio conf.setPartitionTableRecoverWaitAllDnUpTimeout( Long.parseLong( properties.getProperty( - "partition_table_recover_wait_all_dn_up_timeout", - String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout()))) - ); + "partition_table_recover_wait_all_dn_up_timeout", + String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout())))); String leaderDistributionPolicy = properties.getProperty("leader_distribution_policy", conf.getLeaderDistributionPolicy()); diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java index bda00014ef204..1a69044d37d3d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/ProcedureManager.java @@ -1375,9 +1375,7 @@ public TSStatus createRegionGroups( } } - /** - * Used to repair the lost data partition table - */ + /** Used to repair the lost data partition table */ public TSStatus dataPartitionTableIntegrityCheck() { DataPartitionTableIntegrityCheckProcedure procedure; synchronized (this) { diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java index 7a7cf3ff13290..fdf8ef89f65d7 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java @@ -1351,8 +1351,8 @@ private ExternalServiceManager getServiceManager() { } /** - * Check if all DataNodes are registered and running, then trigger integrity check. - * This method should be called after each DataNode registration. + * Check if all DataNodes are registered and running, then trigger integrity check. This method + * should be called after each DataNode registration. 
*/ private void checkAndTriggerIntegrityCheck() { // Only trigger integrity check if this ConfigNode is the leader @@ -1362,19 +1362,22 @@ private void checkAndTriggerIntegrityCheck() { // Get all registered DataNodes List registeredDataNodes = getRegisteredDataNodes(); - + // Check if all registered DataNodes are running - boolean allDataNodesRunning = registeredDataNodes.stream() - .allMatch(dataNode -> { - Integer dataNodeId = dataNode.getLocation().getDataNodeId(); - NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); - return status == NodeStatus.Running; - }); + boolean allDataNodesRunning = + registeredDataNodes.stream() + .allMatch( + dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }); if (allDataNodesRunning && !registeredDataNodes.isEmpty()) { - LOGGER.info("All {} DataNodes are registered and running, triggering data partition table integrity check", - registeredDataNodes.size()); - + LOGGER.info( + "All {} DataNodes are registered and running, triggering data partition table integrity check", + registeredDataNodes.size()); + // Trigger integrity check asynchronously try { configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); @@ -1383,15 +1386,19 @@ private void checkAndTriggerIntegrityCheck() { LOGGER.error("Failed to submit data partition table integrity check procedure", e); } } else { - LOGGER.debug("Not all DataNodes are ready yet. Registered: {}, Running: {}", - registeredDataNodes.size(), - (int) registeredDataNodes.stream() - .filter(dataNode -> { - Integer dataNodeId = dataNode.getLocation().getDataNodeId(); - NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); - return status == NodeStatus.Running; - }) - .count()); + LOGGER.debug( + "Not all DataNodes are ready yet. 
Registered: {}, Running: {}", + registeredDataNodes.size(), + (int) + registeredDataNodes.stream() + .filter( + dataNode -> { + Integer dataNodeId = dataNode.getLocation().getDataNodeId(); + NodeStatus status = + getLoadManager().getLoadCache().getNodeStatus(dataNodeId); + return status == NodeStatus.Running; + }) + .count()); } } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index dbb47019e136d..50522c5e66c93 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -24,6 +24,7 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; +import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.partition.SeriesPartitionTable; @@ -88,7 +89,7 @@ public class DataPartitionTableIntegrityCheckProcedure NodeManager dataNodeManager; private List allDataNodes = new ArrayList<>(); - //============Need serialize BEGIN=============/ + // ============Need serialize BEGIN=============/ /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ private Map earliestTimeslots = new ConcurrentHashMap<>(); @@ -104,14 +105,16 @@ public class DataPartitionTableIntegrityCheckProcedure private static Set failedDataNodes = new HashSet<>(); private static ScheduledExecutorService 
heartBeatExecutor; - //============Need serialize END=============/ + + // ============Need serialize END=============/ public DataPartitionTableIntegrityCheckProcedure() { super(); } @Override - protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) + protected Flow executeFromState( + final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws InterruptedException { try { // Ensure to get the real-time DataNodes in the current cluster at every step @@ -143,7 +146,8 @@ protected Flow executeFromState(final ConfigNodeProcedureEnv env, final DataPart } @Override - protected void rollbackState(final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) + protected void rollbackState( + final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state) throws IOException, InterruptedException, ProcedureException { switch (state) { case COLLECT_EARLIEST_TIMESLOTS: @@ -189,7 +193,8 @@ private Flow collectEarliestTimeslots() { } if (allDataNodes.isEmpty()) { - LOG.error("No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); + LOG.error( + "No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -198,11 +203,20 @@ private Flow collectEarliestTimeslots() { allDataNodes.removeAll(skipDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { try { - TGetEarliestTimeslotsResp resp = (TGetEarliestTimeslotsResp) SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, MAX_RETRY_COUNT); + TGetEarliestTimeslotsResp resp = + (TGetEarliestTimeslotsResp) + SyncDataNodeClientPool.getInstance() + 
.sendSyncRequestToDataNodeWithGivenRetry( + dataNode.getLocation().getInternalEndPoint(), + null, + CnToDnSyncRequestType.COLLECT_EARLIEST_TIMESLOTS, + MAX_RETRY_COUNT); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { failedDataNodes.add(dataNode); - LOG.error("Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); + LOG.error( + "Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", + dataNode.getLocation().getDataNodeId(), + resp.getStatus()); continue; } @@ -215,9 +229,9 @@ private Flow collectEarliestTimeslots() { if (LOG.isDebugEnabled()) { LOG.debug( - "Collected earliest timeslots from the DataNode[id={}]: {}", - dataNode.getLocation().getDataNodeId(), - nodeTimeslots); + "Collected earliest timeslots from the DataNode[id={}]: {}", + dataNode.getLocation().getDataNodeId(), + nodeTimeslots); } } catch (Exception e) { LOG.error( @@ -237,7 +251,8 @@ private Flow collectEarliestTimeslots() { allDataNodes.size() - failedDataNodes.size()); } - if (failedDataNodes.size() == allDataNodes.size() && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { + if (failedDataNodes.size() == allDataNodes.size() + && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); } else { setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); @@ -246,7 +261,8 @@ private Flow collectEarliestTimeslots() { } /** - * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions of databases need to be repaired. + * Analyze which data partitions are missing based on earliest timeslots. Identify data partitions + * of databases need to be repaired. 
*/ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { @@ -254,7 +270,8 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } if (earliestTimeslots.isEmpty()) { - LOG.error("No missing data partitions detected, nothing needs to be repaired, terminating procedure"); + LOG.error( + "No missing data partitions detected, nothing needs to be repaired, terminating procedure"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -266,24 +283,39 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { // Get current DataPartitionTable from ConfigManager Map>>> - localDataPartitionTable = getLocalDataPartitionTable(env, database); + localDataPartitionTable = getLocalDataPartitionTable(env, database); // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTable == null || localDataPartitionTable.isEmpty() || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { - LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + if (localDataPartitionTable == null + || localDataPartitionTable.isEmpty() + || localDataPartitionTable.get(database) == null + || localDataPartitionTable.get(database).isEmpty()) { + lostDataPartitionsOfDatabases.add(database); + LOG.error( + "No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", + database); continue; } - Map>> seriesPartitionMap = localDataPartitionTable.get(database); + Map>> + seriesPartitionMap = localDataPartitionTable.get(database); for (Map.Entry>> - seriesPartitionEntry : seriesPartitionMap.entrySet()) { - Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); - tTimePartitionSlotListMap.keySet().forEach(slot -> { - if 
(!TimePartitionUtils.satisfyPartitionId(slot.getStartTime(), earliestTimeslot)) { - lostDataPartitionsOfDatabases.add(database); - LOG.warn("Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", database, earliestTimeslot); - } - }); + seriesPartitionEntry : seriesPartitionMap.entrySet()) { + Map> tTimePartitionSlotListMap = + seriesPartitionEntry.getValue(); + tTimePartitionSlotListMap + .keySet() + .forEach( + slot -> { + if (!TimePartitionUtils.satisfyPartitionId( + slot.getStartTime(), earliestTimeslot)) { + lostDataPartitionsOfDatabases.add(database); + LOG.warn( + "Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", + database, + earliestTimeslot); + } + }); } } @@ -300,23 +332,26 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - private Map>>> getLocalDataPartitionTable(final ConfigNodeProcedureEnv env, final String database) { - Map> schemaPartitionTable = env.getConfigManager().getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) + private Map>>> + getLocalDataPartitionTable(final ConfigNodeProcedureEnv env, final String database) { + Map> schemaPartitionTable = + env.getConfigManager() + .getSchemaPartition(Collections.singletonMap(database, Collections.emptyList())) .getSchemaPartitionTable(); // Construct request for getting data partition final Map> partitionSlotsMap = new HashMap<>(); schemaPartitionTable.forEach( - (key, value) -> { - Map slotListMap = new HashMap<>(); - value - .keySet() - .forEach( - slot -> - slotListMap.put( - slot, new TTimeSlotList(Collections.emptyList(), true, true))); - partitionSlotsMap.put(key, slotListMap); - }); + (key, value) -> { + Map slotListMap = new HashMap<>(); + value + .keySet() + .forEach( + slot -> + slotListMap.put( + slot, new TTimeSlotList(Collections.emptyList(), true, true))); + partitionSlotsMap.put(key, slotListMap); + }); 
final GetDataPartitionPlan getDataPartitionPlan = new GetDataPartitionPlan(partitionSlotsMap); return env.getConfigManager().getDataPartition(getDataPartitionPlan).getDataPartitionTable(); } @@ -327,16 +362,21 @@ private Map(allDataNodes).containsAll(failedDataNodes)) { + if (failedDataNodes.size() == allDataNodes.size() + && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -378,9 +428,7 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - /** - * Check completion status of DataPartitionTable generation tasks. - */ + /** Check completion status of DataPartitionTable generation tasks. */ private void checkPartitionTableGenerationStatus() { if (LOG.isDebugEnabled()) { LOG.info("Checking DataPartitionTable generation completion status..."); @@ -392,36 +440,52 @@ private void checkPartitionTableGenerationStatus() { if (!dataPartitionTables.containsKey(dataNodeId)) { try { - TGenerateDataPartitionTableHeartbeatResp resp = (TGenerateDataPartitionTableHeartbeatResp) SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithGivenRetry(dataNode.getLocation().getInternalEndPoint(), null, CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, MAX_RETRY_COUNT); - DataPartitionTableGeneratorState state = DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); + TGenerateDataPartitionTableHeartbeatResp resp = + (TGenerateDataPartitionTableHeartbeatResp) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry( + dataNode.getLocation().getInternalEndPoint(), + null, + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + MAX_RETRY_COUNT); + DataPartitionTableGeneratorState state = + DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); if (resp.getStatus().getCode() != 
TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOG.error("Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", dataNode.getLocation().getDataNodeId(), state, resp.getStatus()); + LOG.error( + "Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", + dataNode.getLocation().getDataNodeId(), + state, + resp.getStatus()); continue; } switch (state) { case SUCCESS: - LOG.info("DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); + LOG.info( + "DataNode {} completed DataPartitionTable generation, terminating heart beat", + dataNodeId); completeCount++; break; case IN_PROGRESS: LOG.info("DataNode {} still generating DataPartitionTable", dataNodeId); break; case FAILED: - LOG.error("DataNode {} failed to generate DataPartitionTable, terminating heart beat", dataNodeId); + LOG.error( + "DataNode {} failed to generate DataPartitionTable, terminating heart beat", + dataNodeId); completeCount++; break; default: - LOG.error("DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); + LOG.error( + "DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); break; } } catch (Exception e) { LOG.error( - "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", - dataNodeId, - e.getMessage(), - e); + "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + dataNodeId, + e.getMessage(), + e); completeCount++; } } else { @@ -434,9 +498,7 @@ private void checkPartitionTableGenerationStatus() { } } - /** - * Merge DataPartitionTables from all DataNodes into a final table. - */ + /** Merge DataPartitionTables from all DataNodes into a final table. 
*/ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { LOG.info("Merging DataPartitionTables from {} DataNodes...", dataPartitionTables.size()); @@ -456,46 +518,78 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { for (String database : lostDataPartitionsOfDatabases) { // Get current DataPartitionTable from ConfigManager Map>>> - localDataPartitionTableMap = getLocalDataPartitionTable(env, database); + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTableMap == null || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.error("No data partition table related to database {} was found from the ConfigNode", database); + if (localDataPartitionTableMap == null + || localDataPartitionTableMap.isEmpty() + || localDataPartitionTableMap.get(database) == null + || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.error( + "No data partition table related to database {} was found from the ConfigNode", + database); continue; } - localDataPartitionTableMap.values().forEach(map -> map.forEach((tSeriesPartitionSlot, seriesPartitionTableMap) -> { - if (tSeriesPartitionSlot == null || seriesPartitionTableMap == null || seriesPartitionTableMap.isEmpty()) { - return; - } - finalDataPartitionMap.computeIfAbsent(tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); - })); + localDataPartitionTableMap + .values() + .forEach( + map -> + map.forEach( + (tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null + || seriesPartitionTableMap == null + || seriesPartitionTableMap.isEmpty()) { + return; + } + finalDataPartitionMap.computeIfAbsent( + tSeriesPartitionSlot, + k -> new SeriesPartitionTable(seriesPartitionTableMap)); + 
})); } - finalDataPartitionMap.forEach((tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables.values().forEach(dataPartitionTable -> { - if (dataPartitionTable == null || dataPartitionTable.getDataPartitionMap() == null || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable.getDataPartitionMap().forEach((dnSeriesPartitionSlot, dnDataPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null || seriesPartitionTable.getSeriesPartitionMap() == null || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put(tSeriesPartitionSlot, dnDataPartitionTable); - } - - // dnDataPartitionTable merged to seriesPartitionTable - dnDataPartitionTable.getSeriesPartitionMap().forEach((k, v) -> v.forEach(tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition(k, tConsensusGroupId); - })); + finalDataPartitionMap.forEach( + (tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables + .values() + .forEach( + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnDataPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null + || seriesPartitionTable.getSeriesPartitionMap() == null + || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put( + tSeriesPartitionSlot, dnDataPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnDataPartitionTable + .getSeriesPartitionMap() + .forEach( + (k, v) -> + v.forEach( + tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition( 
+ k, tConsensusGroupId); + })); + }); + }); }); - }); - }); finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); break; @@ -523,8 +617,8 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { if (lostDataPartitionsOfDatabases.isEmpty()) { LOG.error("No database lost data partition table"); setFailure( - "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("No database lost data partition table for raft write")); + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("No database lost data partition table for raft write")); return getFlow(); } @@ -541,7 +635,8 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { try { CreateDataPartitionPlan createPlan = new CreateDataPartitionPlan(); Map assignedDataPartition = new HashMap<>(); - assignedDataPartition.put(lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); + assignedDataPartition.put( + lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); createPlan.setAssignedDataPartition(assignedDataPartition); TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); @@ -551,8 +646,8 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { } else { LOG.error("Failed to write DataPartitionTable to raft log"); setFailure( - "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("Failed to write DataPartitionTable to raft log")); + "DataPartitionTableIntegrityCheckProcedure", + new ProcedureException("Failed to write DataPartitionTable to raft log")); } } catch (Exception e) { LOG.error("Error writing DataPartitionTable to raft log", e); @@ -592,7 +687,7 @@ public void serialize(final DataOutputStream stream) throws IOException { for (Map.Entry entry : dataPartitionTables.entrySet()) { stream.writeInt(entry.getKey()); try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = 
new ObjectOutputStream(baos)) { + ObjectOutputStream oos = new ObjectOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); entry.getValue().serialize(oos, protocol); @@ -614,7 +709,7 @@ public void serialize(final DataOutputStream stream) throws IOException { if (finalDataPartitionTable != null) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { + ObjectOutputStream oos = new ObjectOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); finalDataPartitionTable.serialize(oos, protocol); @@ -686,7 +781,7 @@ public void deserialize(final ByteBuffer byteBuffer) { byte[] bytes = new byte[size]; byteBuffer.get(bytes); try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { + ObjectInputStream ois = new ObjectInputStream(bais)) { TTransport transport = new TIOStreamTransport(ois); TBinaryProtocol protocol = new TBinaryProtocol(transport); @@ -712,7 +807,7 @@ public void deserialize(final ByteBuffer byteBuffer) { byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; byteBuffer.get(finalDataPartitionTableBytes); try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { + ObjectInputStream ois = new ObjectInputStream(bais)) { TTransport transport = new TIOStreamTransport(ois); TBinaryProtocol protocol = new TBinaryProtocol(transport); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 9e4836a089cdf..01ae2499a9e80 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -69,6 +69,7 @@ import org.apache.iotdb.metrics.metricsets.net.NetMetrics; import org.apache.iotdb.metrics.metricsets.system.SystemMetrics; import org.apache.iotdb.rpc.TSStatusCode; + import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -118,8 +119,7 @@ public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { private Future dataPartitionTableCheckFuture; private ExecutorService dataPartitionTableCheckExecutor = - IoTDBThreadPoolFactory.newSingleThreadExecutor( - "DATA_PARTITION_TABLE_CHECK"); + IoTDBThreadPoolFactory.newSingleThreadExecutor("DATA_PARTITION_TABLE_CHECK"); private final CountDownLatch latch = new CountDownLatch(1); @@ -222,29 +222,33 @@ public void active() { loadSecretKey(); loadHardwareCode(); - dataPartitionTableCheckFuture = dataPartitionTableCheckExecutor.submit(() -> { - LOGGER.info("Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); - Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); - - while (latch.getCount() > 0) { - List dnList = configManager - .getLoadManager() - .filterDataNodeThroughStatus(NodeStatus.Running); - if (dnList != null && !dnList.isEmpty()) { - LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); - TSStatus status = - configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error("Data partition table integrity check failed!"); - } - latch.countDown(); - } else { - LOGGER.info("No running datanodes found, waiting..."); - Thread.sleep(5000); // 等待5秒后重新检查 - } - } - return null; - }); + dataPartitionTableCheckFuture = + dataPartitionTableCheckExecutor.submit( + () -> { + LOGGER.info( + "Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); + // 
Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); + + while (latch.getCount() > 0) { + List dnList = + configManager + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus.Running); + if (dnList != null && !dnList.isEmpty()) { + LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); + TSStatus status = + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error("Data partition table integrity check failed!"); + } + latch.countDown(); + } else { + LOGGER.info("No running datanodes found, waiting..."); + Thread.sleep(5000); // 等待5秒后重新检查 + } + } + return null; + }); return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index 4c4d7a6928747..a5e89bb250dfb 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -1140,9 +1140,15 @@ public void loadProperties(TrimProperties properties) throws BadNodeUrlException loadTrustedUriPattern(properties); conf.setPartitionTableRecoverWorkerNum( - Integer.parseInt(properties.getProperty("partition_table_recover_worker_num", String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); + Integer.parseInt( + properties.getProperty( + "partition_table_recover_worker_num", + String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); conf.setPartitionTableRecoverMaxReadBytesPerSecond( - Integer.parseInt(properties.getProperty("partition_table_recover_max_read_bytes_per_second", String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); + Integer.parseInt( + properties.getProperty( + "partition_table_recover_max_read_bytes_per_second", + String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); 
conf.setIncludeNullValueInWriteThroughputMetric( Boolean.parseBoolean( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 4ce321e9a536c..4f9a326f05223 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -33,6 +33,7 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -81,18 +82,28 @@ public class DataPartitionTableGenerator { private static final int EXECUTOR_MAX_TIMEOUT = 60; private static final LeakyBucketRateLimiter limiter = - new LeakyBucketRateLimiter((long) IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverMaxReadBytesPerSecond() * 1024 * 1024); + new LeakyBucketRateLimiter( + (long) + IoTDBDescriptor.getInstance() + .getConfig() + .getPartitionTableRecoverMaxReadBytesPerSecond() + * 1024 + * 1024); public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; - public static final Set IGNORE_DATABASE = new HashSet() {{ - add("root.__audit"); - }}; + public static final Set IGNORE_DATABASE = + new HashSet() { + { + add("root.__audit"); + add("root.__system"); + } + }; public DataPartitionTableGenerator( - ExecutorService executor, - Set databases, - int seriesSlotNum, - String seriesPartitionExecutorClass) { + ExecutorService executor, + Set databases, + int seriesSlotNum, + String seriesPartitionExecutorClass) { this.executor = executor; this.databases = databases; this.seriesSlotNum = seriesSlotNum; @@ -105,7 +116,7 @@ public 
DataPartitionTableGenerator( Set databases, int seriesSlotNum, String seriesPartitionExecutorClass) { - this.dataDirectories = new String[]{dataDirectory}; + this.dataDirectories = new String[] {dataDirectory}; this.executor = executor; this.databases = databases; this.seriesSlotNum = seriesSlotNum; @@ -113,11 +124,11 @@ public DataPartitionTableGenerator( } public DataPartitionTableGenerator( - String[] dataDirectories, - ExecutorService executor, - Set databases, - int seriesSlotNum, - String seriesPartitionExecutorClass) { + String[] dataDirectories, + ExecutorService executor, + Set databases, + int seriesSlotNum, + String seriesPartitionExecutorClass) { this.dataDirectories = dataDirectories; this.executor = executor; this.databases = databases; @@ -132,10 +143,7 @@ public enum TaskStatus { FAILED } - /** - * Start generating DataPartitionTable asynchronously. - * - */ + /** Start generating DataPartitionTable asynchronously. */ public CompletableFuture startGeneration() { if (status != TaskStatus.NOT_STARTED) { throw new IllegalStateException("Task is already started or completed"); @@ -150,28 +158,29 @@ private void generateDataPartitionTableByMemory() { List> futures = new ArrayList<>(); SeriesPartitionExecutor seriesPartitionExecutor = - SeriesPartitionExecutor.getSeriesPartitionExecutor( - seriesPartitionExecutorClass, seriesSlotNum); + SeriesPartitionExecutor.getSeriesPartitionExecutor( + seriesPartitionExecutorClass, seriesSlotNum); for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { CompletableFuture regionFuture = - CompletableFuture.runAsync( - () -> { - TsFileManager tsFileManager = dataRegion.getTsFileManager(); - String databaseName = dataRegion.getDatabaseName(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - tsFileManager.readLock(); - List seqTsFileList = tsFileManager.getTsFileList(true); - List unseqTsFileList = tsFileManager.getTsFileList(false); - 
tsFileManager.readUnlock(); - - constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); - constructDataPartitionMap(unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); - }, - executor); + CompletableFuture.runAsync( + () -> { + TsFileManager tsFileManager = dataRegion.getTsFileManager(); + String databaseName = dataRegion.getDatabaseName(); + if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + tsFileManager.readLock(); + List seqTsFileList = tsFileManager.getTsFileList(true); + List unseqTsFileList = tsFileManager.getTsFileList(false); + tsFileManager.readUnlock(); + + constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); + constructDataPartitionMap( + unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); + }, + executor); futures.add(regionFuture); } @@ -188,7 +197,10 @@ private void generateDataPartitionTableByMemory() { status = TaskStatus.COMPLETED; } - private static void constructDataPartitionMap(List seqTsFileList, SeriesPartitionExecutor seriesPartitionExecutor, Map dataPartitionMap) { + private static void constructDataPartitionMap( + List seqTsFileList, + SeriesPartitionExecutor seriesPartitionExecutor, + Map dataPartitionMap) { for (TsFileResource tsFileResource : seqTsFileList) { Set devices = tsFileResource.getDevices(limiter); long timeSlotId = tsFileResource.getTsFileID().timePartitionId; @@ -199,9 +211,14 @@ private static void constructDataPartitionMap(List seqTsFileList consensusGroupId.setType(TConsensusGroupType.DataRegion); for (IDeviceID deviceId : devices) { - TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); - dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, 
consensusGroupId); + TSeriesPartitionSlot seriesSlotId = + seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap + .computeIfAbsent( + seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) + .putDataPartition(timePartitionSlot, consensusGroupId); } } } @@ -221,46 +238,49 @@ private void generateDataPartitionTable() throws IOException { // Process all data directories for (String dataDirectory : dataDirectories) { LOG.info("Processing data directory: {}", dataDirectory); - + // First layer: database directories Files.list(Paths.get(dataDirectory)) .filter(Files::isDirectory) - .forEach(sequenceTypePath -> { - try { - Files.list(sequenceTypePath) + .forEach( + sequenceTypePath -> { + try { + Files.list(sequenceTypePath) .filter(Files::isDirectory) - .forEach(dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - if (LOG.isDebugEnabled()) { - LOG.debug("Processing database: {}", databaseName); - } - - try { - Files.list(dbPath) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) + || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + if (LOG.isDebugEnabled()) { + LOG.debug("Processing database: {}", databaseName); + } + + try { + Files.list(dbPath) .filter(Files::isDirectory) .forEach( - regionPath -> { - processRegionDirectory( - regionPath, - databaseName, - dataPartitionMap, - executor, - futures); - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", dbPath, e); - failedFiles.incrementAndGet(); - } - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", sequenceTypePath, e); - failedFiles.incrementAndGet(); - } - }); + regionPath -> { + 
processRegionDirectory( + regionPath, + databaseName, + dataPartitionMap, + executor, + futures); + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", dbPath, e); + failedFiles.incrementAndGet(); + } + }); + } catch (IOException e) { + LOG.error("Failed to process database directory: {}", sequenceTypePath, e); + failedFiles.incrementAndGet(); + } + }); } // Wait for all tasks to complete @@ -352,10 +372,7 @@ private void processTimeSlotDirectory( .forEach( tsFilePath -> { processTsFile( - tsFilePath.toFile(), - consensusGroupId, - timeSlotLong, - dataPartitionMap); + tsFilePath.toFile(), consensusGroupId, timeSlotLong, dataPartitionMap); }); } catch (IOException e) { LOG.error("Failed to walk time slot directory: {}", timeSlotPath, e); @@ -380,9 +397,14 @@ private void processTsFile( seriesPartitionExecutorClass, seriesSlotNum); for (org.apache.tsfile.file.metadata.IDeviceID deviceId : devices) { - TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); - dataPartitionMap.computeIfAbsent(seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)).putDataPartition(timePartitionSlot, consensusGroupId); + TSeriesPartitionSlot seriesSlotId = + seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap + .computeIfAbsent( + seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) + .putDataPartition(timePartitionSlot, consensusGroupId); } if (processedFiles.get() % 1000 == 0) { @@ -394,9 +416,11 @@ private void processTsFile( } } - private static SeriesPartitionTable newSeriesPartitionTable(TConsensusGroupId consensusGroupId, long timeSlotId) { + private static SeriesPartitionTable newSeriesPartitionTable( 
+ TConsensusGroupId consensusGroupId, long timeSlotId) { SeriesPartitionTable seriesPartitionTable = new SeriesPartitionTable(); - TTimePartitionSlot timePartitionSlot = new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); seriesPartitionTable.putDataPartition(timePartitionSlot, consensusGroupId); return seriesPartitionTable; } @@ -407,29 +431,32 @@ private void countTotalFiles() throws IOException { for (String dataDirectory : dataDirectories) { Files.list(Paths.get(dataDirectory)) - .filter(Files::isDirectory) - .forEach(sequenceTypePath -> { - try { - Files.list(sequenceTypePath) - .filter(Files::isDirectory) - .forEach(dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - try { - Files.walk(dbPath) - .filter(Files::isRegularFile) - .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) - .forEach(p -> fileCount.incrementAndGet()); - } catch (IOException e) { - LOG.error("countTotalFiles failed when scan {}", dbPath, e); - } - }); - } catch (IOException e) { - LOG.error("countTotalFiles failed when scan {}", sequenceTypePath, e); - } + .filter(Files::isDirectory) + .forEach( + sequenceTypePath -> { + try { + Files.list(sequenceTypePath) + .filter(Files::isDirectory) + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (!databases.contains(databaseName) + || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + try { + Files.walk(dbPath) + .filter(Files::isRegularFile) + .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) + .forEach(p -> fileCount.incrementAndGet()); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", dbPath, e); + } + }); + } catch (IOException e) { + LOG.error("countTotalFiles failed when scan {}", 
sequenceTypePath, e); + } }); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 6e5b615cc1b27..ab4f1523516aa 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -19,7 +19,6 @@ package org.apache.iotdb.db.protocol.thrift.impl; -import com.google.common.collect.ImmutableList; import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; @@ -322,6 +321,8 @@ import org.apache.iotdb.service.rpc.thrift.TSInsertRecordReq; import org.apache.iotdb.trigger.api.enums.FailureStrategy; import org.apache.iotdb.trigger.api.enums.TriggerEvent; + +import com.google.common.collect.ImmutableList; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -433,22 +434,22 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface 0L, TimeUnit.SECONDS, new ArrayBlockingQueue<>( - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), new IoTThreadFactory(ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName()), ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), new ThreadPoolExecutor.CallerRunsPolicy()); private final ExecutorService partitionTableRecoverExecutor = - new WrappedThreadPoolExecutor( - 0, - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), - 0L, - TimeUnit.SECONDS, - new ArrayBlockingQueue<>( - 
IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), - new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), - ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), - new ThreadPoolExecutor.CallerRunsPolicy()); + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), + ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); @@ -3200,7 +3201,8 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { } @Override - public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataPartitionTableReq req) { + public TGenerateDataPartitionTableResp generateDataPartitionTable( + TGenerateDataPartitionTableReq req) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); byte[] empty = new byte[0]; @@ -3221,7 +3223,11 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); currentGenerator = - new DataPartitionTableGenerator(partitionTableRecoverExecutor, req.getDatabases(), seriesSlotNum, seriesPartitionExecutorClass); + new DataPartitionTableGenerator( + partitionTableRecoverExecutor, + req.getDatabases(), + seriesSlotNum, + seriesPartitionExecutorClass); currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately @@ -3251,7 +3257,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable(TGenerateDataP resp.setDataPartitionTable(empty); 
resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); resp.setMessage( - "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); break; } @@ -3301,7 +3307,8 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb break; case FAILED: resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); + resp.setMessage( + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); break; default: @@ -3333,20 +3340,24 @@ private void processDataDirectoryForEarliestTimeslots( try { Files.list(sequenceTypePath) .filter(Files::isDirectory) - .forEach(dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (DataPartitionTableGenerator.IGNORE_DATABASE.contains(databaseName)) { - return; - } - databaseEarliestRegionMap.computeIfAbsent(databaseName, key -> Long.MAX_VALUE); - long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + .forEach( + dbPath -> { + String databaseName = dbPath.getFileName().toString(); + if (DataPartitionTableGenerator.IGNORE_DATABASE.contains( + databaseName)) { + return; + } + databaseEarliestRegionMap.computeIfAbsent( + databaseName, key -> Long.MAX_VALUE); + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); - if (earliestTimeslot != Long.MAX_VALUE) { - earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); - } - }); + if (earliestTimeslot != Long.MAX_VALUE) { + earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); + } + }); } catch (IOException e) { - LOGGER.error("Failed to process data directory: {}", sequenceTypePath.toFile(), e); + LOGGER.error( + 
"Failed to process data directory: {}", sequenceTypePath.toFile(), e); } }); } catch (IOException e) { @@ -3363,31 +3374,49 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { Files.list(databaseDir.toPath()) .filter(Files::isDirectory) .forEach( - regionPath -> { - Future future = findEarliestTimeSlotExecutor.submit(() -> { - try { - Files.list(regionPath) - .filter(Files::isDirectory) - .forEach(timeSlotPath -> { - try { - Optional matchedFile = Files.find(timeSlotPath, 1, (path, attrs) -> attrs.isRegularFile() && path.toString().endsWith(DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME)).findFirst(); - if (!matchedFile.isPresent()) { - return; - } - String timeSlotName = timeSlotPath.getFileName().toString(); - long timeslot = Long.parseLong(timeSlotName); - if (timeslot < databaseEarliestRegionMap.get(databaseName)) { - databaseEarliestRegionMap.put(databaseName, timeslot); - } - } catch (IOException e) { - LOGGER.error("Failed to find any {} files in the {} directory", DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, timeSlotPath, e); - } - }); - } catch (IOException e) { - LOGGER.error("Failed to scan {}", regionPath, e); - } - }); - futureList.add(future); + regionPath -> { + Future future = + findEarliestTimeSlotExecutor.submit( + () -> { + try { + Files.list(regionPath) + .filter(Files::isDirectory) + .forEach( + timeSlotPath -> { + try { + Optional matchedFile = + Files.find( + timeSlotPath, + 1, + (path, attrs) -> + attrs.isRegularFile() + && path.toString() + .endsWith( + DataPartitionTableGenerator + .SCAN_FILE_SUFFIX_NAME)) + .findFirst(); + if (!matchedFile.isPresent()) { + return; + } + String timeSlotName = timeSlotPath.getFileName().toString(); + long timeslot = Long.parseLong(timeSlotName); + if (timeslot + < databaseEarliestRegionMap.get(databaseName)) { + databaseEarliestRegionMap.put(databaseName, timeslot); + } + } catch (IOException e) { + LOGGER.error( + "Failed to find any {} files in the {} directory", + 
DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, + timeSlotPath, + e); + } + }); + } catch (IOException e) { + LOGGER.error("Failed to scan {}", regionPath, e); + } + }); + futureList.add(future); }); } catch (IOException e) { LOGGER.error("Failed to walk database directory: {}", databaseDir, e); @@ -3407,7 +3436,7 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { try (PublicBAOS baos = new PublicBAOS(); - DataOutputStream oos = new DataOutputStream(baos)) { + DataOutputStream oos = new DataOutputStream(baos)) { TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); dataPartitionTable.serialize(oos, protocol); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java index 71a761a813731..a3262ddd37a1a 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java @@ -173,7 +173,8 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc } @Override - public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + public Set getDevices( + String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { return deviceToIndex.keySet(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index a0a725c85d73d..059663c5a6aea 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -122,11 +122,12 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc } @Override - public Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + public Set getDevices( + String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { tsFileResource.readLock(); try (InputStream inputStream = - FSFactoryProducer.getFSFactory() - .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { + FSFactoryProducer.getFSFactory() + .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { // The first byte is VERSION_NUMBER, second byte is timeIndexType. 
byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); limiter.acquire(bytes.length); @@ -141,15 +142,15 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc return Collections.emptySet(); } else { logger.error( - "Can't read file {} from disk ", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + "Can't read file {} from disk ", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); throw new RuntimeException( - "Can't read file " + tsFilePath + TsFileResource.RESOURCE_SUFFIX + " from disk"); + "Can't read file " + tsFilePath + TsFileResource.RESOURCE_SUFFIX + " from disk"); } } catch (Exception e) { logger.error( - "Failed to get devices from tsfile: {}", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); + "Failed to get devices from tsfile: {}", tsFilePath + TsFileResource.RESOURCE_SUFFIX, e); throw new RuntimeException( - "Failed to get devices from tsfile: " + tsFilePath + TsFileResource.RESOURCE_SUFFIX); + "Failed to get devices from tsfile: " + tsFilePath + TsFileResource.RESOURCE_SUFFIX); } finally { tsFileResource.readUnlock(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index 5f94703a944ba..400c478df5054 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -23,6 +23,7 @@ import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.utils.Pair; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -79,7 +80,8 @@ ITimeIndex 
deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali * * @return device names */ - Set getDevices(String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + Set getDevices( + String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); /** * @return whether end time is empty (Long.MIN_VALUE) diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index adf276dd8e2a8..066e1bea5bfd7 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -28,7 +28,6 @@ import org.apache.iotdb.commons.consensus.DataRegionId; import org.apache.iotdb.commons.consensus.SchemaRegionId; import org.apache.iotdb.commons.exception.MetadataException; -import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.consensus.ConsensusFactory; @@ -54,13 +53,11 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; import org.apache.iotdb.db.utils.EnvironmentUtils; -import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableReq; -import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; -import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.mpp.rpc.thrift.TPlanNode; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeReq; import org.apache.iotdb.mpp.rpc.thrift.TSendBatchPlanNodeResp; import org.apache.iotdb.mpp.rpc.thrift.TSendSinglePlanNodeReq; + import org.apache.ratis.util.FileUtils; import org.apache.tsfile.enums.TSDataType; import 
org.apache.tsfile.file.metadata.enums.CompressionType; @@ -80,16 +77,14 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.Set; public class DataNodeInternalRPCServiceImplTest { private static final Logger LOG = - LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); + LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); private static final IoTDBConfig conf = IoTDBDescriptor.getInstance().getConfig(); DataNodeInternalRPCServiceImpl dataNodeInternalRPCServiceImpl; private static IConsensus instance; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java index 7901f9cc36a1d..b700dbbb6b033 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/ServerCommandLine.java @@ -18,6 +18,8 @@ */ package org.apache.iotdb.commons; +import org.apache.iotdb.commons.exception.IoTDBException; + import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.DefaultParser; @@ -26,7 +28,6 @@ import org.apache.commons.cli.OptionGroup; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; -import org.apache.iotdb.commons.exception.IoTDBException; import java.io.PrintWriter; import java.util.HashSet; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index d1a550c5ca1c9..4eeddff9db7f9 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ 
b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -132,10 +132,15 @@ public static long getTimePartitionIdWithoutOverflow(long time) { public static long getTimeWithoutOverflow(long partitionId) { BigInteger bigTime = bigTimePartitionInterval.multiply(BigInteger.valueOf(partitionId)); - if (bigTime.compareTo(BigInteger.ZERO) > 0 || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) { + if (bigTime.compareTo(BigInteger.ZERO) > 0 + || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) { return bigTime.add(bigTimePartitionOrigin).longValue(); } - return BigInteger.valueOf(partitionId).add(BigInteger.ONE).multiply(bigTimePartitionInterval).add(bigTimePartitionOrigin).longValue(); + return BigInteger.valueOf(partitionId) + .add(BigInteger.ONE) + .multiply(bigTimePartitionInterval) + .add(bigTimePartitionOrigin) + .longValue(); } public static long getTimeByPartitionId(long partitionId) { @@ -155,7 +160,8 @@ public static boolean satisfyPartitionId(long startTime, long endTime, long part } public static boolean satisfyPartitionId(long startTime, long partitionId) { - long endTime = startTime >= timePartitionLowerBoundWithoutOverflow + long endTime = + startTime >= timePartitionLowerBoundWithoutOverflow ? 
Long.MAX_VALUE : (startTime + timePartitionInterval - 1); return satisfyPartitionId(startTime, endTime, partitionId); diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java index faff05c6ff69c..7af863db614b4 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java @@ -23,13 +23,8 @@ import java.util.concurrent.locks.LockSupport; /** - * A global leaky-bucket rate limiter for bytes throughput. - * Features: - * - Strict throughput limiting (no burst) - * - Smooth bandwidth shaping - * - Thread-safe - * - Fair for multi-thread - * - Low contention + * A global leaky-bucket rate limiter for bytes throughput. Features: - Strict throughput limiting + * (no burst) - Smooth bandwidth shaping - Thread-safe - Fair for multi-thread - Low contention */ public class LeakyBucketRateLimiter { /** bytes per second */ @@ -52,7 +47,7 @@ public LeakyBucketRateLimiter(long bytesPerSecond) { /** * Acquire permission for reading bytes. * - * This method will block if reading too fast. + *

This method will block if reading too fast. */ public void acquire(long bytes) { if (bytes <= 0) { @@ -95,9 +90,7 @@ public boolean tryAcquire(long bytes) { return false; } - /** - * Update rate dynamically. - */ + /** Update rate dynamically. */ public void setRate(long newBytesPerSecond) { if (newBytesPerSecond <= 0) { throw new IllegalArgumentException("bytesPerSecond must be > 0"); @@ -105,23 +98,17 @@ public void setRate(long newBytesPerSecond) { this.bytesPerSecond = newBytesPerSecond; } - /** - * Current rate. - */ + /** Current rate. */ public long getRate() { return bytesPerSecond; } - /** - * Total bytes processed. - */ + /** Total bytes processed. */ public long getTotalBytes() { return totalBytes.get(); } - /** - * Expected time based on bytes processed. - */ + /** Expected time based on bytes processed. */ private long expectedTimeNs(long totalBytes) { return startTimeNs + (totalBytes * 1_000_000_000L) / bytesPerSecond; } From 0448cb0aad9b81ca2ea56f24e6e7b3d3a9ed7204 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 12 Mar 2026 18:29:24 +0800 Subject: [PATCH 10/39] Avoid writing duplicate values --- .../iotdb/commons/partition/SeriesPartitionTable.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java index f46344566dc32..ffb0413bc87e7 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java @@ -73,7 +73,11 @@ public Map> getSeriesPartitionMap() } public void putDataPartition(TTimePartitionSlot timePartitionSlot, TConsensusGroupId groupId) { - seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()).add(groupId); + 
seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()); + List groupList = seriesPartitionMap.get(timePartitionSlot); + if (!groupList.contains(groupId)) { + groupList.add(groupId); + } } /** From 2468cf7689f073cdee5527de49dde0a3a71f35f2 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 12 Mar 2026 23:05:53 +0800 Subject: [PATCH 11/39] Fix bug when get no data partition table in the ConfigNode. --- ...PartitionTableIntegrityCheckProcedure.java | 123 ++++++++++-------- 1 file changed, 67 insertions(+), 56 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 50522c5e66c93..ce433c113be0a 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -291,7 +291,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { lostDataPartitionsOfDatabases.add(database); - LOG.error( + LOG.warn( "No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", database); continue; @@ -510,10 +510,7 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - int failedCnt = 0; - while (failedCnt < MAX_RETRY_COUNT) { - try { - Map finalDataPartitionMap = new HashMap<>(); + Map finalDataPartitionMap = new HashMap<>(); for (String database : lostDataPartitionsOfDatabases) { // Get current DataPartitionTable from ConfigManager @@ -525,8 +522,8 @@ private Flow 
mergePartitionTables(final ConfigNodeProcedureEnv env) { || localDataPartitionTableMap.isEmpty() || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.error( - "No data partition table related to database {} was found from the ConfigNode", + LOG.warn( + "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", database); continue; } @@ -548,60 +545,74 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { })); } - finalDataPartitionMap.forEach( - (tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables + if (finalDataPartitionMap.isEmpty()) { + dataPartitionTables .values() .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap() + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap().forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (dnSeriesPartitionSlot == null + || dnSeriesPartitionTable == null) { + return; + } + finalDataPartitionMap.computeIfAbsent( + dnSeriesPartitionSlot, + k -> dnSeriesPartitionTable); + }); + }); + } else { + finalDataPartitionMap.forEach( + (tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables + .values() .forEach( - (dnSeriesPartitionSlot, dnDataPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null - || seriesPartitionTable.getSeriesPartitionMap() == null - || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put( - tSeriesPartitionSlot, dnDataPartitionTable); - } - - // 
dnDataPartitionTable merged to seriesPartitionTable - dnDataPartitionTable - .getSeriesPartitionMap() - .forEach( - (k, v) -> - v.forEach( - tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition( - k, tConsensusGroupId); - })); - }); - }); - }); + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null + || seriesPartitionTable.getSeriesPartitionMap() == null + || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put( + tSeriesPartitionSlot, dnSeriesPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnSeriesPartitionTable + .getSeriesPartitionMap() + .forEach( + (k, v) -> + v.forEach( + tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition( + k, tConsensusGroupId); + })); + }); + }); + }); + } finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); - break; - } catch (Exception e) { - LOG.error("Failed to merge DataPartitionTables", e); - setFailure("DataPartitionTableIntegrityCheckProcedure", e); - failedCnt++; - if (failedCnt >= MAX_RETRY_COUNT) { - return Flow.NO_MORE_STATE; - } - } - } LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); From 52e3fdec55896bf39c5fd6be9facd3cc315e5f35 Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 13 Mar 2026 11:13:04 +0800 Subject: [PATCH 12/39] Add a license description. 
--- ...tionTableIntegrityCheckProcedureState.java | 25 +++++++++++++++---- .../DataPartitionTableGeneratorState.java | 19 ++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java index 7028adf9b4b9a..2173ea8ef4589 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.iotdb.confignode.procedure.state; public enum DataPartitionTableIntegrityCheckProcedureState { @@ -10,9 +29,5 @@ public enum DataPartitionTableIntegrityCheckProcedureState { /** Merge DataPartitionTables from all DataNodes */ MERGE_PARTITION_TABLES, /** Write final DataPartitionTable to raft log */ - WRITE_PARTITION_TABLE_TO_RAFT, - /** Procedure completed successfully */ - SUCCESS, - /** Procedure failed */ - FAILED + WRITE_PARTITION_TABLE_TO_RAFT } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java index 0d0d09c182e05..a07f6e313cdb2 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.iotdb.commons.enums; public enum DataPartitionTableGeneratorState { From 7c7693d63545c49cb17152a5e60dfcfec031a80a Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 13 Mar 2026 11:52:11 +0800 Subject: [PATCH 13/39] mvn spotless:apply --- ...PartitionTableIntegrityCheckProcedure.java | 210 +++++++++--------- 1 file changed, 106 insertions(+), 104 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index ce433c113be0a..a417e78ec6afd 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -44,6 +44,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.rpc.TSStatusCode; + import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -373,10 +374,12 @@ private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - ScheduledExecutorUtil.safelyScheduleAtFixedRate(heartBeatExecutor, this::checkPartitionTableGenerationStatus, - 0, - HEART_BEAT_REQUEST_RATE, - TimeUnit.MILLISECONDS); + ScheduledExecutorUtil.safelyScheduleAtFixedRate( + heartBeatExecutor, + this::checkPartitionTableGenerationStatus, + 0, + HEART_BEAT_REQUEST_RATE, + TimeUnit.MILLISECONDS); allDataNodes.removeAll(skipDataNodes); allDataNodes.removeAll(failedDataNodes); @@ -510,109 +513,108 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return 
Flow.HAS_MORE_STATE; } - Map finalDataPartitionMap = new HashMap<>(); - - for (String database : lostDataPartitionsOfDatabases) { - // Get current DataPartitionTable from ConfigManager - Map>>> - localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - - // Check if ConfigNode has a data partition that is associated with the earliestTimeslot - if (localDataPartitionTableMap == null - || localDataPartitionTableMap.isEmpty() - || localDataPartitionTableMap.get(database) == null - || localDataPartitionTableMap.get(database).isEmpty()) { - LOG.warn( - "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", - database); - continue; - } + Map finalDataPartitionMap = new HashMap<>(); - localDataPartitionTableMap - .values() - .forEach( - map -> - map.forEach( - (tSeriesPartitionSlot, seriesPartitionTableMap) -> { - if (tSeriesPartitionSlot == null - || seriesPartitionTableMap == null - || seriesPartitionTableMap.isEmpty()) { - return; - } - finalDataPartitionMap.computeIfAbsent( - tSeriesPartitionSlot, - k -> new SeriesPartitionTable(seriesPartitionTableMap)); - })); - } + for (String database : lostDataPartitionsOfDatabases) { + // Get current DataPartitionTable from ConfigManager + Map>>> + localDataPartitionTableMap = getLocalDataPartitionTable(env, database); - if (finalDataPartitionMap.isEmpty()) { - dataPartitionTables - .values() - .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap().forEach( - (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { - if (dnSeriesPartitionSlot == null - || dnSeriesPartitionTable == null) { - return; - } - finalDataPartitionMap.computeIfAbsent( - dnSeriesPartitionSlot, - k -> dnSeriesPartitionTable); - }); - }); - } else { - finalDataPartitionMap.forEach( - 
(tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables - .values() - .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap() - .forEach( - (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null - || seriesPartitionTable.getSeriesPartitionMap() == null - || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put( - tSeriesPartitionSlot, dnSeriesPartitionTable); - } - - // dnDataPartitionTable merged to seriesPartitionTable - dnSeriesPartitionTable - .getSeriesPartitionMap() - .forEach( - (k, v) -> - v.forEach( - tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition( - k, tConsensusGroupId); - })); - }); - }); - }); - } + // Check if ConfigNode has a data partition that is associated with the earliestTimeslot + if (localDataPartitionTableMap == null + || localDataPartitionTableMap.isEmpty() + || localDataPartitionTableMap.get(database) == null + || localDataPartitionTableMap.get(database).isEmpty()) { + LOG.warn( + "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", + database); + continue; + } - finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); + localDataPartitionTableMap + .values() + .forEach( + map -> + map.forEach( + (tSeriesPartitionSlot, seriesPartitionTableMap) -> { + if (tSeriesPartitionSlot == null + || seriesPartitionTableMap == null + || seriesPartitionTableMap.isEmpty()) { + return; + } + finalDataPartitionMap.computeIfAbsent( + tSeriesPartitionSlot, + k -> new SeriesPartitionTable(seriesPartitionTableMap)); + })); + } + + if (finalDataPartitionMap.isEmpty()) { 
+ dataPartitionTables + .values() + .forEach( + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (dnSeriesPartitionSlot == null || dnSeriesPartitionTable == null) { + return; + } + finalDataPartitionMap.computeIfAbsent( + dnSeriesPartitionSlot, k -> dnSeriesPartitionTable); + }); + }); + } else { + finalDataPartitionMap.forEach( + (tSeriesPartitionSlot, seriesPartitionTable) -> { + dataPartitionTables + .values() + .forEach( + dataPartitionTable -> { + if (dataPartitionTable == null + || dataPartitionTable.getDataPartitionMap() == null + || dataPartitionTable.getDataPartitionMap().isEmpty()) { + return; + } + dataPartitionTable + .getDataPartitionMap() + .forEach( + (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { + if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { + return; + } + + if (seriesPartitionTable == null + || seriesPartitionTable.getSeriesPartitionMap() == null + || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { + finalDataPartitionMap.put( + tSeriesPartitionSlot, dnSeriesPartitionTable); + } + + // dnDataPartitionTable merged to seriesPartitionTable + dnSeriesPartitionTable + .getSeriesPartitionMap() + .forEach( + (k, v) -> + v.forEach( + tConsensusGroupId -> { + if (seriesPartitionTable == null) { + return; + } + seriesPartitionTable.putDataPartition( + k, tConsensusGroupId); + })); + }); + }); + }); + } + + finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); From 5092137f371ec3194c36ea336a91507d81451560 Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 13 Mar 2026 17:32:43 +0800 Subject: [PATCH 14/39] Fix 
problems based on review opinions --- .../confignode/conf/ConfigNodeConfig.java | 12 ++-- .../confignode/conf/ConfigNodeDescriptor.java | 4 +- .../confignode/manager/node/NodeManager.java | 55 ------------------- ...PartitionTableIntegrityCheckProcedure.java | 8 +-- .../iotdb/confignode/service/ConfigNode.java | 16 +++--- .../org/apache/iotdb/db/conf/IoTDBConfig.java | 3 +- .../DataPartitionTableGenerator.java | 26 --------- .../impl/DataNodeInternalRPCServiceImpl.java | 49 +++++++++-------- 8 files changed, 47 insertions(+), 126 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java index c682107698a3a..59b318a4b11e9 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeConfig.java @@ -319,7 +319,7 @@ public class ConfigNodeConfig { private long forceWalPeriodForConfigNodeSimpleInMs = 100; - private long partitionTableRecoverWaitAllDnUpTimeout = 60000; + private long partitionTableRecoverWaitAllDnUpTimeoutInMs = 60000; public ConfigNodeConfig() { // empty constructor @@ -1289,12 +1289,12 @@ public void setFailureDetectorPhiAcceptablePauseInMs(long failureDetectorPhiAcce this.failureDetectorPhiAcceptablePauseInMs = failureDetectorPhiAcceptablePauseInMs; } - public long getPartitionTableRecoverWaitAllDnUpTimeout() { - return partitionTableRecoverWaitAllDnUpTimeout; + public long getPartitionTableRecoverWaitAllDnUpTimeoutInMs() { + return partitionTableRecoverWaitAllDnUpTimeoutInMs; } - public void setPartitionTableRecoverWaitAllDnUpTimeout( - long partitionTableRecoverWaitAllDnUpTimeout) { - this.partitionTableRecoverWaitAllDnUpTimeout = partitionTableRecoverWaitAllDnUpTimeout; + public void setPartitionTableRecoverWaitAllDnUpTimeoutInMs( + long 
partitionTableRecoverWaitAllDnUpTimeoutInMs) { + this.partitionTableRecoverWaitAllDnUpTimeoutInMs = partitionTableRecoverWaitAllDnUpTimeoutInMs; } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index e7d39fd3bcb87..6843aced0e511 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -322,11 +322,11 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio "failure_detector_phi_acceptable_pause_in_ms", String.valueOf(conf.getFailureDetectorPhiAcceptablePauseInMs())))); - conf.setPartitionTableRecoverWaitAllDnUpTimeout( + conf.setPartitionTableRecoverWaitAllDnUpTimeoutInMs( Long.parseLong( properties.getProperty( "partition_table_recover_wait_all_dn_up_timeout", - String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeout())))); + String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeoutInMs())))); String leaderDistributionPolicy = properties.getProperty("leader_distribution_policy", conf.getLeaderDistributionPolicy()); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java index fdf8ef89f65d7..e3d775259d626 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/node/NodeManager.java @@ -352,9 +352,6 @@ public DataSet registerDataNode(TDataNodeRegisterReq req) { // Adjust the maximum RegionGroup number of each Database getClusterSchemaManager().adjustMaxRegionGroupNum(); - // Check if all DataNodes are registered and trigger integrity check if 
needed - checkAndTriggerIntegrityCheck(); - resp.setStatus(ClusterNodeStartUtils.ACCEPT_NODE_REGISTRATION); resp.setDataNodeId( registerDataNodePlan.getDataNodeConfiguration().getLocation().getDataNodeId()); @@ -1349,56 +1346,4 @@ private TTLManager getTTLManager() { private ExternalServiceManager getServiceManager() { return configManager.getExternalServiceManager(); } - - /** - * Check if all DataNodes are registered and running, then trigger integrity check. This method - * should be called after each DataNode registration. - */ - private void checkAndTriggerIntegrityCheck() { - // Only trigger integrity check if this ConfigNode is the leader - if (!configManager.getConsensusManager().isLeader()) { - return; - } - - // Get all registered DataNodes - List registeredDataNodes = getRegisteredDataNodes(); - - // Check if all registered DataNodes are running - boolean allDataNodesRunning = - registeredDataNodes.stream() - .allMatch( - dataNode -> { - Integer dataNodeId = dataNode.getLocation().getDataNodeId(); - NodeStatus status = getLoadManager().getLoadCache().getNodeStatus(dataNodeId); - return status == NodeStatus.Running; - }); - - if (allDataNodesRunning && !registeredDataNodes.isEmpty()) { - LOGGER.info( - "All {} DataNodes are registered and running, triggering data partition table integrity check", - registeredDataNodes.size()); - - // Trigger integrity check asynchronously - try { - configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); - LOGGER.info("Data partition table integrity check procedure submitted successfully"); - } catch (Exception e) { - LOGGER.error("Failed to submit data partition table integrity check procedure", e); - } - } else { - LOGGER.debug( - "Not all DataNodes are ready yet. 
Registered: {}, Running: {}", - registeredDataNodes.size(), - (int) - registeredDataNodes.stream() - .filter( - dataNode -> { - Integer dataNodeId = dataNode.getLocation().getDataNodeId(); - NodeStatus status = - getLoadManager().getLoadCache().getNodeStatus(dataNodeId); - return status == NodeStatus.Running; - }) - .count()); - } - } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index a417e78ec6afd..7216396f5abb3 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -103,7 +103,8 @@ public class DataPartitionTableIntegrityCheckProcedure private DataPartitionTable finalDataPartitionTable; private static Set skipDataNodes = new HashSet<>(); - private static Set failedDataNodes = new HashSet<>(); + private static Set failedDataNodes = + Collections.newSetFromMap(new ConcurrentHashMap<>()); private static ScheduledExecutorService heartBeatExecutor; @@ -195,7 +196,7 @@ private Flow collectEarliestTimeslots() { if (allDataNodes.isEmpty()) { LOG.error( - "No DataNodes registered, no way to collect earliest timeslots, terminating procedure"); + "No DataNodes registered, no way to collect earliest timeslots, waiting for them to go up"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -252,8 +253,7 @@ private Flow collectEarliestTimeslots() { allDataNodes.size() - failedDataNodes.size()); } - if (failedDataNodes.size() == allDataNodes.size() - && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { + if 
(failedDataNodes.size() == allDataNodes.size()) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); } else { setNextState(DataPartitionTableIntegrityCheckProcedureState.ANALYZE_MISSING_PARTITIONS); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 01ae2499a9e80..7448a17cc922b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -81,7 +81,6 @@ import java.util.Arrays; import java.util.List; import java.util.Set; -import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -121,8 +120,6 @@ public class ConfigNode extends ServerCommandLine implements ConfigNodeMBean { private ExecutorService dataPartitionTableCheckExecutor = IoTDBThreadPoolFactory.newSingleThreadExecutor("DATA_PARTITION_TABLE_CHECK"); - private final CountDownLatch latch = new CountDownLatch(1); - public ConfigNode() { super("ConfigNode"); // We do not init anything here, so that we can re-initialize the instance in IT. 
@@ -164,6 +161,8 @@ protected void start() throws IoTDBException { dataPartitionTableCheckFuture.get(); } catch (ExecutionException | InterruptedException e) { LOGGER.error("Data partition table check task execute failed", e); + } finally { + dataPartitionTableCheckExecutor.shutdownNow(); } } @@ -226,10 +225,11 @@ public void active() { dataPartitionTableCheckExecutor.submit( () -> { LOGGER.info( - "Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); - // Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeout()); + "[DataPartitionIntegrity] Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); + // @todo + Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeoutInMs()); - while (latch.getCount() > 0) { + while (true) { List dnList = configManager .getLoadManager() @@ -241,10 +241,10 @@ public void active() { if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { LOGGER.error("Data partition table integrity check failed!"); } - latch.countDown(); + break; } else { LOGGER.info("No running datanodes found, waiting..."); - Thread.sleep(5000); // 等待5秒后重新检查 + Thread.sleep(5000); } } return null; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java index 2ce50415549e3..2cf1452088c4e 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java @@ -1221,7 +1221,8 @@ public class IoTDBConfig { /* Need use these parameters when repair data partition table */ private int partitionTableRecoverWorkerNum = 10; - private int partitionTableRecoverMaxReadBytesPerSecond = 1000; + // Rate limit set to 10 MB/s + private int partitionTableRecoverMaxReadBytesPerSecond = 10; IoTDBConfig() {} diff --git 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 4f9a326f05223..88f9130ec0e36 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -110,32 +110,6 @@ public DataPartitionTableGenerator( this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; } - public DataPartitionTableGenerator( - String dataDirectory, - ExecutorService executor, - Set databases, - int seriesSlotNum, - String seriesPartitionExecutorClass) { - this.dataDirectories = new String[] {dataDirectory}; - this.executor = executor; - this.databases = databases; - this.seriesSlotNum = seriesSlotNum; - this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; - } - - public DataPartitionTableGenerator( - String[] dataDirectories, - ExecutorService executor, - Set databases, - int seriesSlotNum, - String seriesPartitionExecutorClass) { - this.dataDirectories = dataDirectories; - this.executor = executor; - this.databases = databases; - this.seriesSlotNum = seriesSlotNum; - this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; - } - public enum TaskStatus { NOT_STARTED, IN_PROGRESS, diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index ab4f1523516aa..f488c367d1509 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -427,30 +427,6 @@ public class DataNodeInternalRPCServiceImpl implements 
IDataNodeRPCService.Iface private static final String SYSTEM = "system"; - private final ExecutorService findEarliestTimeSlotExecutor = - new WrappedThreadPoolExecutor( - 0, - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), - 0L, - TimeUnit.SECONDS, - new ArrayBlockingQueue<>( - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), - new IoTThreadFactory(ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName()), - ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), - new ThreadPoolExecutor.CallerRunsPolicy()); - - private final ExecutorService partitionTableRecoverExecutor = - new WrappedThreadPoolExecutor( - 0, - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), - 0L, - TimeUnit.SECONDS, - new ArrayBlockingQueue<>( - IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), - new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), - ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), - new ThreadPoolExecutor.CallerRunsPolicy()); - private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); private static final long timeoutMs = 600000; // 600 seconds timeout @@ -3222,6 +3198,18 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable( String seriesPartitionExecutorClass = IoTDBDescriptor.getInstance().getConfig().getSeriesPartitionExecutorClass(); + final ExecutorService partitionTableRecoverExecutor = + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName()), + ThreadName.DATA_PARTITION_RECOVER_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); + currentGenerator = new DataPartitionTableGenerator( 
partitionTableRecoverExecutor, @@ -3370,6 +3358,18 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { String databaseName = databaseDir.getName(); List> futureList = new ArrayList<>(); + final ExecutorService findEarliestTimeSlotExecutor = + new WrappedThreadPoolExecutor( + 0, + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum(), + 0L, + TimeUnit.SECONDS, + new ArrayBlockingQueue<>( + IoTDBDescriptor.getInstance().getConfig().getPartitionTableRecoverWorkerNum()), + new IoTThreadFactory(ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName()), + ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), + new ThreadPoolExecutor.CallerRunsPolicy()); + try { Files.list(databaseDir.toPath()) .filter(Files::isDirectory) @@ -3430,6 +3430,7 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { Thread.currentThread().interrupt(); } } + findEarliestTimeSlotExecutor.shutdownNow(); return databaseEarliestRegionMap.get(databaseName); } From 631a1eaaa672d2c8b76806a63df7c1102b5d752e Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 13 Mar 2026 17:55:57 +0800 Subject: [PATCH 15/39] Remove some unuseful functions --- .../DataPartitionTableGenerator.java | 352 ++++-------------- 1 file changed, 69 insertions(+), 283 deletions(-) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 88f9130ec0e36..94545e07831ee 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -38,10 +38,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashSet; import 
java.util.List; @@ -50,7 +46,6 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -73,15 +68,12 @@ public class DataPartitionTableGenerator { private final AtomicLong totalFiles = new AtomicLong(0); // Configuration - private String[] dataDirectories; private final ExecutorService executor; private final Set databases; private final int seriesSlotNum; private final String seriesPartitionExecutorClass; - private static final int EXECUTOR_MAX_TIMEOUT = 60; - - private static final LeakyBucketRateLimiter limiter = + private final LeakyBucketRateLimiter limiter = new LeakyBucketRateLimiter( (long) IoTDBDescriptor.getInstance() @@ -90,7 +82,6 @@ public class DataPartitionTableGenerator { * 1024 * 1024); - public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; public static final Set IGNORE_DATABASE = new HashSet() { { @@ -99,6 +90,8 @@ public class DataPartitionTableGenerator { } }; + public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; + public DataPartitionTableGenerator( ExecutorService executor, Set databases, @@ -135,258 +128,90 @@ private void generateDataPartitionTableByMemory() { SeriesPartitionExecutor.getSeriesPartitionExecutor( seriesPartitionExecutorClass, seriesSlotNum); - for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { - CompletableFuture regionFuture = - CompletableFuture.runAsync( - () -> { - TsFileManager tsFileManager = dataRegion.getTsFileManager(); - String databaseName = dataRegion.getDatabaseName(); - if (!databases.contains(databaseName) || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - tsFileManager.readLock(); - List seqTsFileList = tsFileManager.getTsFileList(true); - List unseqTsFileList = tsFileManager.getTsFileList(false); - tsFileManager.readUnlock(); - - 
constructDataPartitionMap(seqTsFileList, seriesPartitionExecutor, dataPartitionMap); - constructDataPartitionMap( - unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); - }, - executor); - futures.add(regionFuture); - } - - // Wait for all tasks to complete - CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); - - if (dataPartitionMap.isEmpty()) { - LOG.error("Failed to generate DataPartitionTable, dataPartitionMap is empty"); - status = TaskStatus.FAILED; - return; - } - - dataPartitionTable = new DataPartitionTable(dataPartitionMap); - status = TaskStatus.COMPLETED; - } - - private static void constructDataPartitionMap( - List seqTsFileList, - SeriesPartitionExecutor seriesPartitionExecutor, - Map dataPartitionMap) { - for (TsFileResource tsFileResource : seqTsFileList) { - Set devices = tsFileResource.getDevices(limiter); - long timeSlotId = tsFileResource.getTsFileID().timePartitionId; - int regionId = tsFileResource.getTsFileID().regionId; - - TConsensusGroupId consensusGroupId = new TConsensusGroupId(); - consensusGroupId.setId(regionId); - consensusGroupId.setType(TConsensusGroupType.DataRegion); - - for (IDeviceID deviceId : devices) { - TSeriesPartitionSlot seriesSlotId = - seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); - TTimePartitionSlot timePartitionSlot = - new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); - dataPartitionMap - .computeIfAbsent( - seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) - .putDataPartition(timePartitionSlot, consensusGroupId); - } - } - } - - /** Generate DataPartitionTable by scanning all resource files. 
*/ - private void generateDataPartitionTable() throws IOException { - LOG.info("Starting DataPartitionTable generation from {} directories", dataDirectories.length); - - List> futures = new ArrayList<>(); - - Map dataPartitionMap = new ConcurrentHashMap<>(); - try { - // Count total files first for progress tracking - countTotalFiles(); - - // Process all data directories - for (String dataDirectory : dataDirectories) { - LOG.info("Processing data directory: {}", dataDirectory); - - // First layer: database directories - Files.list(Paths.get(dataDirectory)) - .filter(Files::isDirectory) - .forEach( - sequenceTypePath -> { + for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { + CompletableFuture regionFuture = + CompletableFuture.runAsync( + () -> { try { - Files.list(sequenceTypePath) - .filter(Files::isDirectory) - .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (!databases.contains(databaseName) - || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - if (LOG.isDebugEnabled()) { - LOG.debug("Processing database: {}", databaseName); - } - - try { - Files.list(dbPath) - .filter(Files::isDirectory) - .forEach( - regionPath -> { - processRegionDirectory( - regionPath, - databaseName, - dataPartitionMap, - executor, - futures); - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", dbPath, e); - failedFiles.incrementAndGet(); - } - }); - } catch (IOException e) { - LOG.error("Failed to process database directory: {}", sequenceTypePath, e); + TsFileManager tsFileManager = dataRegion.getTsFileManager(); + String databaseName = dataRegion.getDatabaseName(); + if (!databases.contains(databaseName) + || IGNORE_DATABASE.contains(databaseName)) { + return; + } + + tsFileManager.readLock(); + List seqTsFileList = tsFileManager.getTsFileList(true); + List unseqTsFileList = tsFileManager.getTsFileList(false); + tsFileManager.readUnlock(); + + constructDataPartitionMap( + 
seqTsFileList, seriesPartitionExecutor, dataPartitionMap); + constructDataPartitionMap( + unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); + } catch (Exception e) { + LOG.error("Error processing data region: {}", dataRegion.getDatabaseName(), e); failedFiles.incrementAndGet(); + errorMessage = "Failed to process data region: " + e.getMessage(); } - }); + }, + executor); + futures.add(regionFuture); } // Wait for all tasks to complete CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); - dataPartitionTable = new DataPartitionTable(dataPartitionMap); + if (dataPartitionMap.isEmpty()) { + LOG.error("Failed to generate DataPartitionTable, dataPartitionMap is empty"); + status = TaskStatus.FAILED; + errorMessage = "DataPartitionMap is empty after processing resource file"; + return; + } + dataPartitionTable = new DataPartitionTable(dataPartitionMap); + status = TaskStatus.COMPLETED; LOG.info( - "DataPartitionTable generation completed. Processed: {}, Failed: {}", + "DataPartitionTable generation completed successfully. Processed: {}, Failed: {}", processedFiles.get(), failedFiles.get()); - - } finally { - executor.shutdown(); - try { - if (!executor.awaitTermination(EXECUTOR_MAX_TIMEOUT, TimeUnit.SECONDS)) { - executor.shutdownNow(); - } - } catch (InterruptedException e) { - executor.shutdownNow(); - Thread.currentThread().interrupt(); - } - } - } - - /** Process a region directory. 
*/ - private void processRegionDirectory( - java.nio.file.Path regionPath, - String databaseName, - Map dataPartitionMap, - ExecutorService executor, - List> futures) { - - int regionId; - try { - regionId = Integer.parseInt(regionPath.getFileName().toString()); - LOG.debug("Processing region: {}", regionId); - } catch (NumberFormatException e) { - LOG.error("Invalid region directory: {}", regionPath); - return; - } - - TConsensusGroupId consensusGroupId = new TConsensusGroupId(); - consensusGroupId.setId(regionId); - consensusGroupId.setType(TConsensusGroupType.DataRegion); - - // Process time partitions asynchronously - CompletableFuture regionFuture = - CompletableFuture.runAsync( - () -> { - try { - Files.list(regionPath) - .filter(Files::isDirectory) - .forEach( - timeSlotPath -> { - processTimeSlotDirectory( - timeSlotPath, databaseName, consensusGroupId, dataPartitionMap); - }); - } catch (IOException e) { - LOG.error("Failed to list region directory: {}", regionPath, e); - } - }, - executor); - - futures.add(regionFuture); - } - - /** Process a time slot directory. 
*/ - private void processTimeSlotDirectory( - java.nio.file.Path timeSlotPath, - String databaseName, - TConsensusGroupId consensusGroupId, - Map dataPartitionMap) { - - long timeSlotLong; - try { - timeSlotLong = Long.parseLong(timeSlotPath.getFileName().toString()); - LOG.debug("Processing time slot: {}", timeSlotLong); - } catch (NumberFormatException e) { - LOG.error("Invalid time slot directory: {}", timeSlotPath); - return; - } - - try { - // Fourth layer: .resource files - Files.walk(timeSlotPath) - .filter(Files::isRegularFile) - .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) - .forEach( - tsFilePath -> { - processTsFile( - tsFilePath.toFile(), consensusGroupId, timeSlotLong, dataPartitionMap); - }); - } catch (IOException e) { - LOG.error("Failed to walk time slot directory: {}", timeSlotPath, e); + } catch (Exception e) { + LOG.error("Failed to generate DataPartitionTable", e); + status = TaskStatus.FAILED; + errorMessage = "Generation failed: " + e.getMessage(); } } - /** Process a single tsfile. 
*/ - private void processTsFile( - File tsFile, - TConsensusGroupId consensusGroupId, - long timeSlotId, + private void constructDataPartitionMap( + List seqTsFileList, + SeriesPartitionExecutor seriesPartitionExecutor, Map dataPartitionMap) { - try { - TsFileResource tsFileResource = new TsFileResource(tsFile.getAbsoluteFile()); - tsFileResource.deserialize(); - - Set devices = tsFileResource.getDevices(limiter); - processedFiles.incrementAndGet(); - - SeriesPartitionExecutor seriesPartitionExecutor = - SeriesPartitionExecutor.getSeriesPartitionExecutor( - seriesPartitionExecutorClass, seriesSlotNum); - - for (org.apache.tsfile.file.metadata.IDeviceID deviceId : devices) { - TSeriesPartitionSlot seriesSlotId = - seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); - TTimePartitionSlot timePartitionSlot = - new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); - dataPartitionMap - .computeIfAbsent( - seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) - .putDataPartition(timePartitionSlot, consensusGroupId); - } - - if (processedFiles.get() % 1000 == 0) { - LOG.info("Processed {} files, current: {}", processedFiles.get(), tsFile.getName()); + for (TsFileResource tsFileResource : seqTsFileList) { + try { + Set devices = tsFileResource.getDevices(limiter); + long timeSlotId = tsFileResource.getTsFileID().timePartitionId; + int regionId = tsFileResource.getTsFileID().regionId; + + TConsensusGroupId consensusGroupId = new TConsensusGroupId(); + consensusGroupId.setId(regionId); + consensusGroupId.setType(TConsensusGroupType.DataRegion); + + for (IDeviceID deviceId : devices) { + TSeriesPartitionSlot seriesSlotId = + seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); + TTimePartitionSlot timePartitionSlot = + new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + dataPartitionMap + .computeIfAbsent( + seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) + 
.putDataPartition(timePartitionSlot, consensusGroupId); + } + processedFiles.incrementAndGet(); + } catch (Exception e) { + failedFiles.incrementAndGet(); + LOG.error("Failed to process tsfile {}, {}", tsFileResource.getTsFileID(), e.getMessage()); } - } catch (IOException e) { - failedFiles.incrementAndGet(); - LOG.error("Failed to process tsfile: {} -> {}", tsFile.getAbsolutePath(), e.getMessage()); } } @@ -399,45 +224,6 @@ private static SeriesPartitionTable newSeriesPartitionTable( return seriesPartitionTable; } - /** Count total files for progress tracking. */ - private void countTotalFiles() throws IOException { - AtomicLong fileCount = new AtomicLong(0); - - for (String dataDirectory : dataDirectories) { - Files.list(Paths.get(dataDirectory)) - .filter(Files::isDirectory) - .forEach( - sequenceTypePath -> { - try { - Files.list(sequenceTypePath) - .filter(Files::isDirectory) - .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (!databases.contains(databaseName) - || IGNORE_DATABASE.contains(databaseName)) { - return; - } - - try { - Files.walk(dbPath) - .filter(Files::isRegularFile) - .filter(p -> p.toString().endsWith(SCAN_FILE_SUFFIX_NAME)) - .forEach(p -> fileCount.incrementAndGet()); - } catch (IOException e) { - LOG.error("countTotalFiles failed when scan {}", dbPath, e); - } - }); - } catch (IOException e) { - LOG.error("countTotalFiles failed when scan {}", sequenceTypePath, e); - } - }); - } - - totalFiles.set(fileCount.get()); - LOG.info("Found {} resource files to process", totalFiles.get()); - } - // Getters public TaskStatus getStatus() { return status; From 500a0aeedcea145d6c13cfdec685991350cb1f22 Mon Sep 17 00:00:00 2001 From: libo Date: Mon, 16 Mar 2026 10:08:09 +0800 Subject: [PATCH 16/39] Find the earliest time slot in the ConfigNode --- ...PartitionTableIntegrityCheckProcedure.java | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 7216396f5abb3..ed0cad2936669 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -44,7 +44,6 @@ import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.rpc.TSStatusCode; - import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -62,6 +61,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -132,7 +132,7 @@ protected Flow executeFromState( return analyzeMissingPartitions(env); case REQUEST_PARTITION_TABLES: heartBeatExecutor = Executors.newScheduledThreadPool(1); - return requestPartitionTables(env); + return requestPartitionTables(); case MERGE_PARTITION_TABLES: return mergePartitionTables(env); case WRITE_PARTITION_TABLE_TO_RAFT: @@ -304,19 +304,23 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { seriesPartitionEntry : seriesPartitionMap.entrySet()) { Map> tTimePartitionSlotListMap = seriesPartitionEntry.getValue(); - tTimePartitionSlotListMap - .keySet() - .forEach( - slot -> { - if (!TimePartitionUtils.satisfyPartitionId( - slot.getStartTime(), earliestTimeslot)) { - lostDataPartitionsOfDatabases.add(database); - LOG.warn( - "Database {} has lost timeslot {} in its data table 
partition, and this issue needs to be repaired", - database, - earliestTimeslot); - } - }); + + if (tTimePartitionSlotListMap.isEmpty()) { + continue; + } + + TTimePartitionSlot localEarliestSlot = tTimePartitionSlotListMap.keySet() + .stream() + .min(Comparator.comparingLong(TTimePartitionSlot::getStartTime)) + .orElse(null); + + if (!TimePartitionUtils.satisfyPartitionId(localEarliestSlot.getStartTime(), earliestTimeslot)) { + lostDataPartitionsOfDatabases.add(database); + LOG.warn( + "Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", + database, + earliestTimeslot); + } } } @@ -361,7 +365,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { * Request DataPartitionTable generation from target DataNodes. Each DataNode scans its tsfile * resources and generates a DataPartitionTable. */ - private Flow requestPartitionTables(final ConfigNodeProcedureEnv env) { + private Flow requestPartitionTables() { if (LOG.isDebugEnabled()) { LOG.debug( "Requesting DataPartitionTable generation from {} DataNodes...", allDataNodes.size()); From 9355430a716f983744127574514bff30022fea02 Mon Sep 17 00:00:00 2001 From: libo Date: Mon, 16 Mar 2026 14:52:06 +0800 Subject: [PATCH 17/39] Resolve the problem that data partition table generation does not complete due to an RPC timeout --- ...PartitionTableIntegrityCheckProcedure.java | 90 +++++------ ...tionTableIntegrityCheckProcedureState.java | 2 + .../impl/DataNodeInternalRPCServiceImpl.java | 142 +++++++++--------- .../src/main/thrift/datanode.thrift | 2 +- 4 files changed, 117 insertions(+), 119 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index ed0cad2936669..8036d9f96a59b 100644 ---
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -24,7 +24,6 @@ import org.apache.iotdb.common.rpc.thrift.TSStatus; import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; -import org.apache.iotdb.commons.concurrent.threadpool.ScheduledExecutorUtil; import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.partition.DataPartitionTable; import org.apache.iotdb.commons.partition.SeriesPartitionTable; @@ -68,9 +67,6 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; /** * Procedure for checking and restoring data partition table integrity. 
This procedure scans all @@ -105,9 +101,6 @@ public class DataPartitionTableIntegrityCheckProcedure private static Set skipDataNodes = new HashSet<>(); private static Set failedDataNodes = Collections.newSetFromMap(new ConcurrentHashMap<>()); - - private static ScheduledExecutorService heartBeatExecutor; - // ============Need serialize END=============/ public DataPartitionTableIntegrityCheckProcedure() { @@ -131,8 +124,9 @@ protected Flow executeFromState( lostDataPartitionsOfDatabases = new HashSet<>(); return analyzeMissingPartitions(env); case REQUEST_PARTITION_TABLES: - heartBeatExecutor = Executors.newScheduledThreadPool(1); return requestPartitionTables(); + case REQUEST_PARTITION_TABLES_HEART_BEAT: + return requestPartitionTablesHeartBeat(); case MERGE_PARTITION_TABLES: return mergePartitionTables(env); case WRITE_PARTITION_TABLE_TO_RAFT: @@ -378,13 +372,6 @@ private Flow requestPartitionTables() { return Flow.HAS_MORE_STATE; } - ScheduledExecutorUtil.safelyScheduleAtFixedRate( - heartBeatExecutor, - this::checkPartitionTableGenerationStatus, - 0, - HEART_BEAT_REQUEST_RATE, - TimeUnit.MILLISECONDS); - allDataNodes.removeAll(skipDataNodes); allDataNodes.removeAll(failedDataNodes); for (TDataNodeConfiguration dataNode : allDataNodes) { @@ -407,13 +394,7 @@ private Flow requestPartitionTables() { "Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); - continue; } - - byte[] bytes = resp.getDataPartitionTable(); - DataPartitionTable dataPartitionTable = new DataPartitionTable(); - dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); - dataPartitionTables.put(dataNodeId, dataPartitionTable); } catch (Exception e) { failedDataNodes.add(dataNode); LOG.error( @@ -431,12 +412,11 @@ private Flow requestPartitionTables() { return Flow.HAS_MORE_STATE; } - setNextState(DataPartitionTableIntegrityCheckProcedureState.MERGE_PARTITION_TABLES); + 
setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); return Flow.HAS_MORE_STATE; } - /** Check completion status of DataPartitionTable generation tasks. */ - private void checkPartitionTableGenerationStatus() { + private Flow requestPartitionTablesHeartBeat() { if (LOG.isDebugEnabled()) { LOG.info("Checking DataPartitionTable generation completion status..."); } @@ -448,51 +428,50 @@ private void checkPartitionTableGenerationStatus() { if (!dataPartitionTables.containsKey(dataNodeId)) { try { TGenerateDataPartitionTableHeartbeatResp resp = - (TGenerateDataPartitionTableHeartbeatResp) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithGivenRetry( - dataNode.getLocation().getInternalEndPoint(), - null, - CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, - MAX_RETRY_COUNT); + (TGenerateDataPartitionTableHeartbeatResp) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry( + dataNode.getLocation().getInternalEndPoint(), + null, + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + MAX_RETRY_COUNT); DataPartitionTableGeneratorState state = - DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); + DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { LOG.error( - "Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", - dataNode.getLocation().getDataNodeId(), - state, - resp.getStatus()); + "Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", + dataNode.getLocation().getDataNodeId(), + state, + resp.getStatus()); continue; } + switch (state) { case SUCCESS: + byte[] bytes = resp.getDataPartitionTable(); + DataPartitionTable dataPartitionTable = new DataPartitionTable(); + 
dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); + dataPartitionTables.put(dataNodeId, dataPartitionTable); LOG.info( - "DataNode {} completed DataPartitionTable generation, terminating heart beat", - dataNodeId); + "DataNode {} completed DataPartitionTable generation, terminating heart beat", + dataNodeId); completeCount++; break; case IN_PROGRESS: LOG.info("DataNode {} still generating DataPartitionTable", dataNodeId); break; - case FAILED: - LOG.error( - "DataNode {} failed to generate DataPartitionTable, terminating heart beat", - dataNodeId); - completeCount++; - break; default: LOG.error( - "DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); + "DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); break; } } catch (Exception e) { LOG.error( - "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", - dataNodeId, - e.getMessage(), - e); + "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + dataNodeId, + e.getMessage(), + e); completeCount++; } } else { @@ -501,8 +480,18 @@ private void checkPartitionTableGenerationStatus() { } if (completeCount >= allDataNodes.size()) { - heartBeatExecutor.shutdown(); + setNextState(DataPartitionTableIntegrityCheckProcedureState.MERGE_PARTITION_TABLES); + return Flow.HAS_MORE_STATE; } + + try { + Thread.sleep(HEART_BEAT_REQUEST_RATE); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + LOG.error("Error checking DataPartitionTable status due to thread interruption."); + } + setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); + return Flow.HAS_MORE_STATE; } /** Merge DataPartitionTables from all DataNodes into a final table. 
*/ @@ -676,6 +665,7 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { return getFlow(); } + /** Determine whether there are still DataNode nodes with failed execution of a certain step in this round. If such nodes exist, calculate the skipDataNodes and exclude these nodes when requesting the list of DataNode nodes in the cluster for the next round; if no such nodes exist, it means the procedure has been completed */ private Flow getFlow() { if (!failedDataNodes.isEmpty()) { allDataNodes.removeAll(failedDataNodes); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java index 2173ea8ef4589..899ed502b2a88 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java @@ -26,6 +26,8 @@ public enum DataPartitionTableIntegrityCheckProcedureState { ANALYZE_MISSING_PARTITIONS, /** Request DataPartitionTable generation from DataNodes */ REQUEST_PARTITION_TABLES, + /** Round robin get DataPartitionTable generation result from DataNodes */ + REQUEST_PARTITION_TABLES_HEART_BEAT, /** Merge DataPartitionTables from all DataNodes */ MERGE_PARTITION_TABLES, /** Write final DataPartitionTable to raft log */ diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index f488c367d1509..cf93cd5c5d4d2 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -19,6 +19,7 @@ package org.apache.iotdb.db.protocol.thrift.impl; +import com.google.common.collect.ImmutableList; import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; @@ -321,8 +322,6 @@ import org.apache.iotdb.service.rpc.thrift.TSInsertRecordReq; import org.apache.iotdb.trigger.api.enums.FailureStrategy; import org.apache.iotdb.trigger.api.enums.TriggerEvent; - -import com.google.common.collect.ImmutableList; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -344,6 +343,7 @@ import java.io.DataOutputStream; import java.io.File; import java.io.IOException; +import java.lang.reflect.Method; import java.net.URL; import java.nio.ByteBuffer; import java.nio.file.Files; @@ -361,6 +361,7 @@ import java.util.Optional; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -429,7 +430,8 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); - private static final long timeoutMs = 600000; // 600 seconds timeout + // Must be lower than the RPC request timeout, in milliseconds + private static final long timeoutMs = 50000; public DataNodeInternalRPCServiceImpl() { super(); @@ -3140,6 +3142,7 @@ public void handleClientExit() { // ==================================================== private volatile DataPartitionTableGenerator currentGenerator; + private volatile CompletableFuture currentGeneratorFuture; private volatile long currentTaskId = 
0; @Override @@ -3180,13 +3183,11 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { public TGenerateDataPartitionTableResp generateDataPartitionTable( TGenerateDataPartitionTableReq req) { TGenerateDataPartitionTableResp resp = new TGenerateDataPartitionTableResp(); - byte[] empty = new byte[0]; try { // Check if there's already a task in the progress if (currentGenerator != null && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { - resp.setDataPartitionTable(empty); resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); resp.setMessage("DataPartitionTable generation is already in the progress"); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); @@ -3219,40 +3220,8 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable( currentTaskId = System.currentTimeMillis(); // Start generation synchronously for now to return the data partition table immediately - currentGenerator.startGeneration().get(timeoutMs, TimeUnit.MILLISECONDS); - - if (currentGenerator != null) { - switch (currentGenerator.getStatus()) { - case IN_PROGRESS: - resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage("DataPartitionTable generation interrupted"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - break; - case COMPLETED: - DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); - if (dataPartitionTable != null) { - byte[] result = serializeDataPartitionTable(dataPartitionTable); - resp.setDataPartitionTable(result); - } - - resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); - resp.setMessage("DataPartitionTable generation completed successfully"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); - LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); - break; - default: - 
resp.setDataPartitionTable(empty); - resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage( - "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - break; - } - } - - // Clear current generator - currentGenerator = null; + currentGeneratorFuture = currentGenerator.startGeneration(); + parseGenerationStatus(resp); } catch (Exception e) { LOGGER.error("Failed to generate DataPartitionTable", e); resp.setStatus( @@ -3268,8 +3237,10 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable( @Override public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() { TGenerateDataPartitionTableHeartbeatResp resp = new TGenerateDataPartitionTableHeartbeatResp(); - + // Set default value + resp.setDataPartitionTable(new byte[0]); try { + currentGeneratorFuture.get(timeoutMs, TimeUnit.MILLISECONDS); if (currentGenerator == null) { resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); resp.setMessage("No DataPartitionTable generation task found"); @@ -3277,33 +3248,15 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb return resp; } - DataPartitionTableGenerator.TaskStatus status = currentGenerator.getStatus(); - - switch (status) { - case IN_PROGRESS: - resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); - resp.setMessage( - String.format( - "DataPartitionTable generation in progress: %.1f%%", - currentGenerator.getProgress() * 100)); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - break; - case COMPLETED: - resp.setErrorCode(DataPartitionTableGeneratorState.SUCCESS.getCode()); - resp.setMessage("DataPartitionTable generation completed successfully"); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); - break; - case FAILED: - 
resp.setErrorCode(DataPartitionTableGeneratorState.FAILED.getCode()); - resp.setMessage( - "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage()); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - break; - default: - resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); - resp.setMessage("Unknown task status: " + status); - resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); - break; + parseGenerationStatus(resp); + if (currentGenerator.getStatus().equals(DataPartitionTableGenerator.TaskStatus.COMPLETED)) { + DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); + if (dataPartitionTable != null) { + byte[] result = serializeDataPartitionTable(dataPartitionTable); + resp.setDataPartitionTable(result); + // Clear current generator + currentGenerator = null; + } } } catch (Exception e) { LOGGER.error("Failed to check DataPartitionTable generation status", e); @@ -3313,10 +3266,63 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb OperationType.CHECK_DATA_PARTITION_TABLE_STATUS, TSStatusCode.INTERNAL_SERVER_ERROR.getStatusCode())); } - return resp; } + private void parseGenerationStatus(T resp) { + if (resp instanceof TGenerateDataPartitionTableResp) { + handleResponse((TGenerateDataPartitionTableResp) resp); + } else { + handleResponse((TGenerateDataPartitionTableHeartbeatResp) resp); + } + } + + private void handleResponse(TGenerateDataPartitionTableResp resp) { + updateResponse(resp); + } + + private void handleResponse(TGenerateDataPartitionTableHeartbeatResp resp) { + updateResponse(resp); + } + + private void updateResponse(T resp) { + if (currentGenerator == null) return; + + switch (currentGenerator.getStatus()) { + case IN_PROGRESS: + setResponseFields(resp, DataPartitionTableGeneratorState.IN_PROGRESS.getCode(), String.format( + "DataPartitionTable generation in progress: %.1f%%", + currentGenerator.getProgress() 
* 100), RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + break; + case COMPLETED: + setResponseFields(resp, DataPartitionTableGeneratorState.SUCCESS.getCode(), "DataPartitionTable generation completed successfully", RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); + break; + case FAILED: + setResponseFields(resp, DataPartitionTableGeneratorState.FAILED.getCode(), "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage(), RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + LOGGER.info("DataPartitionTable generation failed with task ID: {}", currentTaskId); + break; + default: + setResponseFields(resp, DataPartitionTableGeneratorState.UNKNOWN.getCode(), "Unknown task status: " + currentGenerator.getStatus(), RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + LOGGER.info("DataPartitionTable generation failed with task ID: {}", currentTaskId); + break; + } + } + + private void setResponseFields(T resp, int errorCode, String message, TSStatus status) { + try { + Method setErrorCode = resp.getClass().getMethod("setErrorCode", int.class); + Method setMessage = resp.getClass().getMethod("setMessage", String.class); + Method setStatus = resp.getClass().getMethod("setStatus", TSStatus.class); + + setErrorCode.invoke(resp, errorCode); + setMessage.invoke(resp, message); + setStatus.invoke(resp, status); + } catch (Exception e) { + LOGGER.error("Failed to set response fields", e); + } + } + /** Process data directory to find the earliest timeslots for each database. 
*/ private void processDataDirectoryForEarliestTimeslots( File dataDir, Map earliestTimeslots) { diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index b248599f59cc4..6fa7630b5afdf 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -695,13 +695,13 @@ struct TGenerateDataPartitionTableResp { 1: required common.TSStatus status 2: required i32 errorCode 3: optional string message - 4: optional binary dataPartitionTable } struct TGenerateDataPartitionTableHeartbeatResp { 1: required common.TSStatus status 2: required i32 errorCode 3: optional string message + 4: optional binary dataPartitionTable } /** From 328d85c66ebb9b6eef2bcebf2c435d1a362f2c51 Mon Sep 17 00:00:00 2001 From: libo Date: Mon, 16 Mar 2026 16:13:32 +0800 Subject: [PATCH 18/39] Correct need to redirect the target step. --- .../partition/DataPartitionTableIntegrityCheckProcedure.java | 5 +++-- .../java/org/apache/iotdb/confignode/service/ConfigNode.java | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 8036d9f96a59b..6caac6310cd3d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -98,7 +98,7 @@ public class DataPartitionTableIntegrityCheckProcedure /** Final merged DataPartitionTable */ private DataPartitionTable finalDataPartitionTable; - private static Set skipDataNodes = new 
HashSet<>(); + private static Set skipDataNodes = Collections.newSetFromMap(new ConcurrentHashMap<>()); private static Set failedDataNodes = Collections.newSetFromMap(new ConcurrentHashMap<>()); // ============Need serialize END=============/ @@ -149,6 +149,7 @@ protected void rollbackState( case COLLECT_EARLIEST_TIMESLOTS: case ANALYZE_MISSING_PARTITIONS: case REQUEST_PARTITION_TABLES: + case REQUEST_PARTITION_TABLES_HEART_BEAT: case MERGE_PARTITION_TABLES: case WRITE_PARTITION_TABLE_TO_RAFT: // Cleanup resources @@ -490,7 +491,7 @@ private Flow requestPartitionTablesHeartBeat() { Thread.currentThread().interrupt(); LOG.error("Error checking DataPartitionTable status due to thread interruption."); } - setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); + setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); return Flow.HAS_MORE_STATE; } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 7448a17cc922b..3cec10b91036e 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -69,7 +69,6 @@ import org.apache.iotdb.metrics.metricsets.net.NetMetrics; import org.apache.iotdb.metrics.metricsets.system.SystemMetrics; import org.apache.iotdb.rpc.TSStatusCode; - import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -226,7 +225,6 @@ public void active() { () -> { LOGGER.info( "[DataPartitionIntegrity] Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); - // @todo Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeoutInMs()); while (true) { From c6b77a40039186a0c1e0bd516f6e0cee0608fd40 Mon Sep 17 00:00:00 2001 From: libo Date: Mon, 
16 Mar 2026 18:13:02 +0800 Subject: [PATCH 19/39] Integrate the merge logic into the DataPartitionTable class --- ...PartitionTableIntegrityCheckProcedure.java | 74 +------------------ .../commons/partition/DataPartitionTable.java | 36 ++++++++- .../partition/SeriesPartitionTable.java | 9 +++ 3 files changed, 47 insertions(+), 72 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 6caac6310cd3d..ec4de4488ab53 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -266,10 +266,9 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } if (earliestTimeslots.isEmpty()) { - LOG.error( + LOG.warn( "No missing data partitions detected, nothing needs to be repaired, terminating procedure"); - setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); - return Flow.HAS_MORE_STATE; + return Flow.NO_MORE_STATE; } // Find all databases that have lost data partition tables @@ -542,74 +541,7 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { })); } - if (finalDataPartitionMap.isEmpty()) { - dataPartitionTables - .values() - .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap() - .forEach( - (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { - if (dnSeriesPartitionSlot == null || dnSeriesPartitionTable == null) { - return; - } - 
finalDataPartitionMap.computeIfAbsent( - dnSeriesPartitionSlot, k -> dnSeriesPartitionTable); - }); - }); - } else { - finalDataPartitionMap.forEach( - (tSeriesPartitionSlot, seriesPartitionTable) -> { - dataPartitionTables - .values() - .forEach( - dataPartitionTable -> { - if (dataPartitionTable == null - || dataPartitionTable.getDataPartitionMap() == null - || dataPartitionTable.getDataPartitionMap().isEmpty()) { - return; - } - dataPartitionTable - .getDataPartitionMap() - .forEach( - (dnSeriesPartitionSlot, dnSeriesPartitionTable) -> { - if (!tSeriesPartitionSlot.equals(dnSeriesPartitionSlot)) { - return; - } - - if (seriesPartitionTable == null - || seriesPartitionTable.getSeriesPartitionMap() == null - || seriesPartitionTable.getSeriesPartitionMap().isEmpty()) { - finalDataPartitionMap.put( - tSeriesPartitionSlot, dnSeriesPartitionTable); - } - - // dnDataPartitionTable merged to seriesPartitionTable - dnSeriesPartitionTable - .getSeriesPartitionMap() - .forEach( - (k, v) -> - v.forEach( - tConsensusGroupId -> { - if (seriesPartitionTable == null) { - return; - } - seriesPartitionTable.putDataPartition( - k, tConsensusGroupId); - })); - }); - }); - }); - } - - finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap); - + finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap).merge(dataPartitionTables); LOG.info("DataPartitionTable merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); return Flow.HAS_MORE_STATE; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java index 91346f0c69c85..c210bf90195ad 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java +++ 
b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java @@ -23,7 +23,6 @@ import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.utils.ThriftCommonsSerDeUtils; import org.apache.iotdb.confignode.rpc.thrift.TTimeSlotList; - import org.apache.thrift.TException; import org.apache.thrift.protocol.TProtocol; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -282,6 +281,41 @@ public Set autoCleanPartitionTable( return removedTimePartitionSlots; } + /** + * Merge a complete DataPartitionTable from the partition tables received from multiple DataNodes (supports cross-database merging, which is exactly the logic implemented in the current PR) + * + * @param sourceMap Map + * @return The complete merged partition table + */ + public DataPartitionTable merge(Map sourceMap) { + DataPartitionTable merged = new DataPartitionTable(this.dataPartitionMap); + for (DataPartitionTable table : sourceMap.values()) { + for (Map.Entry entry : table.dataPartitionMap.entrySet()) { + TSeriesPartitionSlot slot = entry.getKey(); + SeriesPartitionTable seriesTable = entry.getValue(); + merged.dataPartitionMap + .computeIfAbsent(slot, k -> new SeriesPartitionTable()) + .merge(seriesTable); + } + } + return merged; + } + + /** + * Support single table merging + * Merge another DataPartitionTable into the current object (used for incremental merging) + */ + public void merge(DataPartitionTable sourcePartitionTable) { + if (sourcePartitionTable == null) { + return; + } + for (Map.Entry entry : sourcePartitionTable.dataPartitionMap.entrySet()) { + this.dataPartitionMap + .computeIfAbsent(entry.getKey(), k -> new SeriesPartitionTable()) + .merge(entry.getValue()); + } + } + public void serialize(OutputStream outputStream, TProtocol protocol) throws IOException, TException { ReadWriteIOUtils.write(dataPartitionMap.size(), outputStream); diff --git 
a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java index ffb0413bc87e7..34be206870f04 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java @@ -274,6 +274,15 @@ public List autoCleanPartitionTable( return removedTimePartitions; } + public void merge(SeriesPartitionTable sourceMap) { + if (sourceMap == null) return; + sourceMap.seriesPartitionMap.forEach((timeSlot, groups) -> { + this.seriesPartitionMap + .computeIfAbsent(timeSlot, k -> new ArrayList<>()) + .addAll(groups); + }); + } + public void serialize(OutputStream outputStream, TProtocol protocol) throws IOException, TException { ReadWriteIOUtils.write(seriesPartitionMap.size(), outputStream); From 0956e071a3c4ff484d11f94f30fcbbce4fa1ef65 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 10:01:02 +0800 Subject: [PATCH 20/39] Support repairing data partition tables of multiple databases --- ...PartitionTableIntegrityCheckProcedure.java | 215 ++++++++++++------ .../DataPartitionTableGenerator.java | 49 ++-- .../impl/DataNodeInternalRPCServiceImpl.java | 70 ++++-- .../commons/partition/DataPartitionTable.java | 8 +- .../DatabaseScopedDataPartitionTable.java | 83 +++++++ .../src/main/thrift/datanode.thrift | 2 +- 6 files changed, 305 insertions(+), 122 deletions(-) create mode 100644 iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 
ec4de4488ab53..1eb1e720ea538 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -26,6 +26,7 @@ import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.enums.DataPartitionTableGeneratorState; import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.DatabaseScopedDataPartitionTable; import org.apache.iotdb.commons.partition.SeriesPartitionTable; import org.apache.iotdb.commons.utils.TimePartitionUtils; import org.apache.iotdb.confignode.client.sync.CnToDnSyncRequestType; @@ -53,10 +54,9 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; @@ -91,12 +91,12 @@ public class DataPartitionTableIntegrityCheckProcedure private Map earliestTimeslots = new ConcurrentHashMap<>(); /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ - private Map dataPartitionTables = new ConcurrentHashMap<>(); + private Map> dataPartitionTables = new ConcurrentHashMap<>(); private Set lostDataPartitionsOfDatabases = new HashSet<>(); /** Final merged DataPartitionTable */ - private DataPartitionTable finalDataPartitionTable; + private Map finalDataPartitionTables; private static Set skipDataNodes = Collections.newSetFromMap(new ConcurrentHashMap<>()); private static Set failedDataNodes = @@ -156,7 +156,7 @@ protected void rollbackState( earliestTimeslots.clear(); dataPartitionTables.clear(); allDataNodes.clear(); - finalDataPartitionTable = null; + 
finalDataPartitionTables = null; break; default: throw new ProcedureException("Unknown state for rollback: " + state); @@ -449,10 +449,9 @@ private Flow requestPartitionTablesHeartBeat() { switch (state) { case SUCCESS: - byte[] bytes = resp.getDataPartitionTable(); - DataPartitionTable dataPartitionTable = new DataPartitionTable(); - dataPartitionTable.deserialize(ByteBuffer.wrap(bytes)); - dataPartitionTables.put(dataNodeId, dataPartitionTable); + List byteBufferList = resp.getDatabaseScopedDataPartitionTables(); + List databaseScopedDataPartitionTableList = deserializeDatabaseScopedTableList(byteBufferList); + dataPartitionTables.put(dataNodeId, databaseScopedDataPartitionTableList); LOG.info( "DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); @@ -539,10 +538,16 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { tSeriesPartitionSlot, k -> new SeriesPartitionTable(seriesPartitionTableMap)); })); + + dataPartitionTables.forEach((k, v) -> v.forEach(databaseScopedDataPartitionTable -> { + if (!databaseScopedDataPartitionTable.getDatabase().equals(database)) { + return; + } + finalDataPartitionTables.put(database, new DataPartitionTable(finalDataPartitionMap).merge(databaseScopedDataPartitionTable.getDataPartitionTable())); + })); } - finalDataPartitionTable = new DataPartitionTable(finalDataPartitionMap).merge(dataPartitionTables); - LOG.info("DataPartitionTable merge completed successfully"); + LOG.info("DataPartitionTables merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); return Flow.HAS_MORE_STATE; } @@ -561,7 +566,7 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { return getFlow(); } - if (finalDataPartitionTable == null) { + if (finalDataPartitionTables.isEmpty()) { LOG.error("No DataPartitionTable to write to raft"); setFailure( "DataPartitionTableIntegrityCheckProcedure", @@ -574,8 +579,9 @@ 
private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { try { CreateDataPartitionPlan createPlan = new CreateDataPartitionPlan(); Map assignedDataPartition = new HashMap<>(); - assignedDataPartition.put( - lostDataPartitionsOfDatabases.stream().findFirst().get(), finalDataPartitionTable); + for (String database : lostDataPartitionsOfDatabases) { + assignedDataPartition.put(database, finalDataPartitionTables.get(database)); + } createPlan.setAssignedDataPartition(assignedDataPartition); TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); @@ -624,21 +630,30 @@ public void serialize(final DataOutputStream stream) throws IOException { // Serialize dataPartitionTables count stream.writeInt(dataPartitionTables.size()); - for (Map.Entry entry : dataPartitionTables.entrySet()) { + for (Map.Entry> entry : dataPartitionTables.entrySet()) { stream.writeInt(entry.getKey()); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(oos); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - entry.getValue().serialize(oos, protocol); - // Write the size and data for byte array after serialize - byte[] data = baos.toByteArray(); - stream.writeInt(data.length); - stream.write(data); - } catch (IOException | TException e) { - LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); - throw new IOException("Failed to serialize dataPartitionTables", e); + List tableList = entry.getValue(); + stream.writeInt(tableList.size()); + + for (DatabaseScopedDataPartitionTable table : tableList) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos)) { + + TTransport transport = new TIOStreamTransport(dos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + table.serialize(dos, protocol); + + byte[] data = baos.toByteArray(); + // 
Length of data written for a single object + stream.writeInt(data.length); + // data written for a single object + stream.write(data); + } catch (IOException | TException e) { + LOG.error("{} serialize failed for dataNodeId: {}", this.getClass().getSimpleName(), entry.getKey(), e); + throw new IOException("Failed to serialize dataPartitionTables", e); + } } } @@ -647,24 +662,31 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeUTF(database); } - if (finalDataPartitionTable != null) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(oos); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - finalDataPartitionTable.serialize(oos, protocol); + if (finalDataPartitionTables != null && !finalDataPartitionTables.isEmpty()) { + stream.writeInt(finalDataPartitionTables.size()); - // Write the size and data for byte array after serialize - byte[] data = baos.toByteArray(); - stream.writeInt(data.length); - stream.write(data); - } catch (IOException | TException e) { - LOG.error("{} serialize failed", this.getClass().getSimpleName(), e); - throw new IOException("Failed to serialize finalDataPartitionTable", e); + for (Map.Entry entry : finalDataPartitionTables.entrySet()) { + stream.writeUTF(entry.getKey()); + + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos)) { + + TTransport transport = new TIOStreamTransport(dos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + entry.getValue().serialize(dos, protocol); + + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (IOException | TException e) { + LOG.error("{} serialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); + throw new IOException("Failed to serialize finalDataPartitionTables", e); + } + } + } else { 
+ stream.writeInt(0); } - } else { - stream.writeInt(0); - } stream.writeInt(skipDataNodes.size()); for (TDataNodeConfiguration skipDataNode : skipDataNodes) { @@ -714,25 +736,35 @@ public void deserialize(final ByteBuffer byteBuffer) { // Deserialize dataPartitionTables count int dataPartitionTablesSize = byteBuffer.getInt(); - dataPartitionTables = new HashMap<>(); + dataPartitionTables = new ConcurrentHashMap<>(); for (int i = 0; i < dataPartitionTablesSize; i++) { - int key = byteBuffer.getInt(); - int size = byteBuffer.getInt(); - byte[] bytes = new byte[size]; - byteBuffer.get(bytes); - try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { - TTransport transport = new TIOStreamTransport(ois); - TBinaryProtocol protocol = new TBinaryProtocol(transport); + int dataNodeId = byteBuffer.getInt(); + int listSize = byteBuffer.getInt(); - // Deserialize by input stream and protocol - DataPartitionTable value = new DataPartitionTable(); - value.deserialize(ois, protocol); - dataPartitionTables.put(key, value); - } catch (IOException | TException e) { - LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); - throw new RuntimeException(e); + List tableList = new ArrayList<>(listSize); + + for (int j = 0; j < listSize; j++) { + int dataSize = byteBuffer.getInt(); + byte[] bytes = new byte[dataSize]; + byteBuffer.get(bytes); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + DataInputStream dis = new DataInputStream(bais)) { + + TTransport transport = new TIOStreamTransport(dis); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + DatabaseScopedDataPartitionTable table = + DatabaseScopedDataPartitionTable.deserialize(dis, protocol); + tableList.add(table); + + } catch (IOException | TException e) { + LOG.error("{} deserialize failed for dataNodeId: {}", this.getClass().getSimpleName(), dataNodeId, e); + throw new RuntimeException("Failed to deserialize 
dataPartitionTables", e); + } } + + dataPartitionTables.put(dataNodeId, tableList); } int lostDataPartitionsOfDatabasesSize = byteBuffer.getInt(); @@ -742,24 +774,31 @@ public void deserialize(final ByteBuffer byteBuffer) { } // Deserialize finalDataPartitionTable size - int finalDataPartitionTableSize = byteBuffer.getInt(); - if (finalDataPartitionTableSize > 0) { - byte[] finalDataPartitionTableBytes = new byte[finalDataPartitionTableSize]; - byteBuffer.get(finalDataPartitionTableBytes); - try (ByteArrayInputStream bais = new ByteArrayInputStream(finalDataPartitionTableBytes); - ObjectInputStream ois = new ObjectInputStream(bais)) { - TTransport transport = new TIOStreamTransport(ois); + int finalDataPartitionTablesSize = byteBuffer.getInt(); + finalDataPartitionTables = new ConcurrentHashMap<>(); + + for (int i = 0; i < finalDataPartitionTablesSize; i++) { + String database = ReadWriteIOUtils.readString(byteBuffer); + + int dataSize = byteBuffer.getInt(); + byte[] bytes = new byte[dataSize]; + byteBuffer.get(bytes); + + try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); + DataInputStream dis = new DataInputStream(bais)) { + + TTransport transport = new TIOStreamTransport(dis); TBinaryProtocol protocol = new TBinaryProtocol(transport); - // Deserialize by input stream and protocol - finalDataPartitionTable = new DataPartitionTable(); - finalDataPartitionTable.deserialize(ois, protocol); + DataPartitionTable dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(dis, protocol); + + finalDataPartitionTables.put(database, dataPartitionTable); + } catch (IOException | TException e) { - LOG.error("{} deserialize failed", this.getClass().getSimpleName(), e); - throw new RuntimeException(e); + LOG.error("{} deserialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); + throw new RuntimeException("Failed to deserialize finalDataPartitionTables", e); } - } else { - finalDataPartitionTable = null; } 
skipDataNodes = new HashSet<>(); @@ -802,4 +841,34 @@ public void deserialize(final ByteBuffer byteBuffer) { } } } + + private List deserializeDatabaseScopedTableList(List dataList) { + if (dataList == null || dataList.isEmpty()) { + return Collections.emptyList(); + } + + List result = new ArrayList<>(dataList.size()); + + for (ByteBuffer data : dataList) { + if (data == null || data.remaining() == 0) { + LOG.warn("Skipping empty ByteBuffer during deserialization"); + continue; + } + + try { + ByteBuffer dataBuffer = data.duplicate(); + + // Directly invoke the static deserialize method + DatabaseScopedDataPartitionTable table = + DatabaseScopedDataPartitionTable.deserialize(dataBuffer); + + result.add(table); + + } catch (Exception e) { + LOG.error("Failed to deserialize DatabaseScopedDataPartitionTable", e); + } + } + + return result; + } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 94545e07831ee..a2a0288167ca6 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -24,6 +24,7 @@ import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.DatabaseScopedDataPartitionTable; import org.apache.iotdb.commons.partition.SeriesPartitionTable; import org.apache.iotdb.commons.partition.executor.SeriesPartitionExecutor; import org.apache.iotdb.commons.utils.TimePartitionUtils; @@ -45,6 +46,7 @@ import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ExecutorService; 
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -60,7 +62,7 @@ public class DataPartitionTableGenerator { // Task status private volatile TaskStatus status = TaskStatus.NOT_STARTED; private volatile String errorMessage; - private volatile DataPartitionTable dataPartitionTable; + private Map databasePartitionTableMap = new ConcurrentHashMap<>(); // Progress tracking private final AtomicInteger processedFiles = new AtomicInteger(0); @@ -103,6 +105,10 @@ public DataPartitionTableGenerator( this.seriesPartitionExecutorClass = seriesPartitionExecutorClass; } + public Map getDatabasePartitionTableMap() { + return databasePartitionTableMap; + } + public enum TaskStatus { NOT_STARTED, IN_PROGRESS, @@ -150,6 +156,23 @@ private void generateDataPartitionTableByMemory() { seqTsFileList, seriesPartitionExecutor, dataPartitionMap); constructDataPartitionMap( unseqTsFileList, seriesPartitionExecutor, dataPartitionMap); + + if (dataPartitionMap.isEmpty()) { + LOG.error("Failed to generate DataPartitionTable, dataPartitionMap is empty"); + status = TaskStatus.FAILED; + errorMessage = "DataPartitionMap is empty after processing resource file"; + return; + } + + DataPartitionTable dataPartitionTable = new DataPartitionTable(dataPartitionMap); + + databasePartitionTableMap.compute(databaseName, (k,v) -> { + if (v == null) { + return new DataPartitionTable(dataPartitionMap); + } + v.merge(dataPartitionTable); + return v; + }); } catch (Exception e) { LOG.error("Error processing data region: {}", dataRegion.getDatabaseName(), e); failedFiles.incrementAndGet(); @@ -163,14 +186,6 @@ private void generateDataPartitionTableByMemory() { // Wait for all tasks to complete CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); - if (dataPartitionMap.isEmpty()) { - LOG.error("Failed to generate DataPartitionTable, dataPartitionMap is empty"); - status = TaskStatus.FAILED; - errorMessage = "DataPartitionMap is empty after 
processing resource file"; - return; - } - - dataPartitionTable = new DataPartitionTable(dataPartitionMap); status = TaskStatus.COMPLETED; LOG.info( "DataPartitionTable generation completed successfully. Processed: {}, Failed: {}", @@ -233,22 +248,6 @@ public String getErrorMessage() { return errorMessage; } - public DataPartitionTable getDataPartitionTable() { - return dataPartitionTable; - } - - public int getProcessedFiles() { - return processedFiles.get(); - } - - public int getFailedFiles() { - return failedFiles.get(); - } - - public long getTotalFiles() { - return totalFiles.get(); - } - public double getProgress() { if (totalFiles.get() == 0) { return 0.0; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index cf93cd5c5d4d2..38c02dd4d22ba 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -66,6 +66,7 @@ import org.apache.iotdb.commons.exception.IllegalPathException; import org.apache.iotdb.commons.exception.MetadataException; import org.apache.iotdb.commons.partition.DataPartitionTable; +import org.apache.iotdb.commons.partition.DatabaseScopedDataPartitionTable; import org.apache.iotdb.commons.path.ExtendedPartialPath; import org.apache.iotdb.commons.path.MeasurementPath; import org.apache.iotdb.commons.path.PartialPath; @@ -328,6 +329,7 @@ import org.apache.thrift.transport.TTransport; import org.apache.tsfile.enums.TSDataType; import org.apache.tsfile.exception.NotImplementedException; +import org.apache.tsfile.external.commons.lang3.StringUtils; import org.apache.tsfile.read.common.TimeRange; import org.apache.tsfile.read.common.block.TsBlock; import org.apache.tsfile.utils.Pair; 
@@ -3238,7 +3240,7 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable( public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() { TGenerateDataPartitionTableHeartbeatResp resp = new TGenerateDataPartitionTableHeartbeatResp(); // Set default value - resp.setDataPartitionTable(new byte[0]); + resp.setDatabaseScopedDataPartitionTables(Collections.emptyList()); try { currentGeneratorFuture.get(timeoutMs, TimeUnit.MILLISECONDS); if (currentGenerator == null) { @@ -3250,10 +3252,25 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb parseGenerationStatus(resp); if (currentGenerator.getStatus().equals(DataPartitionTableGenerator.TaskStatus.COMPLETED)) { - DataPartitionTable dataPartitionTable = currentGenerator.getDataPartitionTable(); - if (dataPartitionTable != null) { - byte[] result = serializeDataPartitionTable(dataPartitionTable); - resp.setDataPartitionTable(result); + boolean success = false; + List databaseScopedDataPartitionTableList = new ArrayList<>(); + Map dataPartitionTableMap = currentGenerator.getDatabasePartitionTableMap(); + if (!dataPartitionTableMap.isEmpty()) { + for (Map.Entry entry : dataPartitionTableMap.entrySet()) { + String database = entry.getKey(); + DataPartitionTable dataPartitionTable = entry.getValue(); + if (StringUtils.isEmpty(database) && dataPartitionTable != null) { + DatabaseScopedDataPartitionTable databaseScopedDataPartitionTable = new DatabaseScopedDataPartitionTable(database, dataPartitionTable); + databaseScopedDataPartitionTableList.add(databaseScopedDataPartitionTable); + success = true; + } + } + } + + if (success) { + List result = serializeDatabaseScopedTableList(databaseScopedDataPartitionTableList); + resp.setDatabaseScopedDataPartitionTables(result); + // Clear current generator currentGenerator = null; } @@ -3406,10 +3423,10 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { } String timeSlotName = 
timeSlotPath.getFileName().toString(); long timeslot = Long.parseLong(timeSlotName); - if (timeslot - < databaseEarliestRegionMap.get(databaseName)) { - databaseEarliestRegionMap.put(databaseName, timeslot); - } + databaseEarliestRegionMap.compute( + databaseName, + (k, v) -> + v == null ? timeslot : Math.min(v, timeslot)); } catch (IOException e) { LOGGER.error( "Failed to find any {} files in the {} directory", @@ -3440,17 +3457,30 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { return databaseEarliestRegionMap.get(databaseName); } - /** Serialize DataPartitionTable to ByteBuffer for RPC transmission. */ - private byte[] serializeDataPartitionTable(DataPartitionTable dataPartitionTable) { - try (PublicBAOS baos = new PublicBAOS(); - DataOutputStream oos = new DataOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(oos); - TBinaryProtocol protocol = new TBinaryProtocol(transport); - dataPartitionTable.serialize(oos, protocol); - return baos.getBuf(); - } catch (IOException | TException e) { - LOGGER.error("Failed to serialize DataPartitionTable", e); - return ByteBuffer.allocate(0).array(); + private List serializeDatabaseScopedTableList(List list) { + if (list == null || list.isEmpty()) { + return Collections.emptyList(); + } + + List result = new ArrayList<>(list.size()); + + for (DatabaseScopedDataPartitionTable table : list) { + try (PublicBAOS baos = new PublicBAOS(); + DataOutputStream oos = new DataOutputStream(baos)) { + + TTransport transport = new TIOStreamTransport(oos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); + + table.serialize(oos, protocol); + + result.add(ByteBuffer.wrap(baos.toByteArray())); + + } catch (IOException | TException e) { + LOGGER.error("Failed to serialize DatabaseScopedDataPartitionTable for database: {}", + table.getDatabase(), e); + } } + + return result; } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java 
b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java index c210bf90195ad..a1286ed0e10ae 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DataPartitionTable.java @@ -305,15 +305,17 @@ public DataPartitionTable merge(Map sourceMap) { * Support single table merging * Merge another DataPartitionTable into the current object (used for incremental merging) */ - public void merge(DataPartitionTable sourcePartitionTable) { + public DataPartitionTable merge(DataPartitionTable sourcePartitionTable) { + DataPartitionTable merged = new DataPartitionTable(this.dataPartitionMap); if (sourcePartitionTable == null) { - return; + return merged; } for (Map.Entry entry : sourcePartitionTable.dataPartitionMap.entrySet()) { - this.dataPartitionMap + merged.dataPartitionMap .computeIfAbsent(entry.getKey(), k -> new SeriesPartitionTable()) .merge(entry.getValue()); } + return merged; } public void serialize(OutputStream outputStream, TProtocol protocol) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java new file mode 100644 index 0000000000000..e81057ed96780 --- /dev/null +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java @@ -0,0 +1,83 @@ +package org.apache.iotdb.commons.partition; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TProtocol; +import org.apache.tsfile.utils.ReadWriteIOUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.util.Objects; + +public class DatabaseScopedDataPartitionTable { + private final String database; + private 
DataPartitionTable dataPartitionTable; + + public DatabaseScopedDataPartitionTable(String database, DataPartitionTable dataPartitionTable) { + this.database = database; + this.dataPartitionTable = dataPartitionTable; + } + + public String getDatabase() { + return database; + } + + public DataPartitionTable getDataPartitionTable() { + return dataPartitionTable; + } + + public void serialize(OutputStream outputStream, TProtocol protocol) + throws IOException, TException { + ReadWriteIOUtils.write(database, outputStream); + + ReadWriteIOUtils.write(dataPartitionTable != null, outputStream); + + if (dataPartitionTable != null) { + dataPartitionTable.serialize(outputStream, protocol); + } + } + + public static DatabaseScopedDataPartitionTable deserialize(ByteBuffer buffer) { + String database = ReadWriteIOUtils.readString(buffer); + + boolean hasDataPartitionTable = ReadWriteIOUtils.readBool(buffer); + + DataPartitionTable dataPartitionTable = null; + if (hasDataPartitionTable) { + dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(buffer); + } + + return new DatabaseScopedDataPartitionTable(database, dataPartitionTable); + } + + public static DatabaseScopedDataPartitionTable deserialize( + InputStream inputStream, TProtocol protocol) throws IOException, TException { + String database = ReadWriteIOUtils.readString(inputStream); + + boolean hasDataPartitionTable = ReadWriteIOUtils.readBool(inputStream); + + DataPartitionTable dataPartitionTable = null; + if (hasDataPartitionTable) { + dataPartitionTable = new DataPartitionTable(); + dataPartitionTable.deserialize(inputStream, protocol); + } + + return new DatabaseScopedDataPartitionTable(database, dataPartitionTable); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + DatabaseScopedDataPartitionTable that = (DatabaseScopedDataPartitionTable) o; + return Objects.equals(database, that.database) && + 
Objects.equals(dataPartitionTable, that.dataPartitionTable); + } + + @Override + public int hashCode() { + return Objects.hash(database, dataPartitionTable); + } +} diff --git a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift index 6fa7630b5afdf..84d4c2a1c0b11 100644 --- a/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift +++ b/iotdb-protocol/thrift-datanode/src/main/thrift/datanode.thrift @@ -701,7 +701,7 @@ struct TGenerateDataPartitionTableHeartbeatResp { 1: required common.TSStatus status 2: required i32 errorCode 3: optional string message - 4: optional binary dataPartitionTable + 4: optional list databaseScopedDataPartitionTables } /** From 62fb27798df0a8a1ecb31038d3c0051606a3bd93 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 11:47:47 +0800 Subject: [PATCH 21/39] Fix bugs --- ...PartitionTableIntegrityCheckProcedure.java | 2 +- .../iotdb/confignode/service/ConfigNode.java | 14 ++++++----- .../DataPartitionTableGenerator.java | 14 +++++++---- .../impl/DataNodeInternalRPCServiceImpl.java | 16 ++++++------- .../tsfile/timeindex/FileTimeIndex.java | 24 +++++++++++-------- .../DataPartitionTableGeneratorState.java | 4 ++-- .../partition/SeriesPartitionTable.java | 10 ++++---- .../commons/utils/TimePartitionUtils.java | 18 +++++++------- .../rateLimiter/LeakyBucketRateLimiter.java | 21 ++++++++++++++-- 9 files changed, 76 insertions(+), 47 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 1eb1e720ea538..ca97568055f86 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -729,7 +729,7 @@ public void deserialize(final ByteBuffer byteBuffer) { int earliestTimeslotsSize = byteBuffer.getInt(); earliestTimeslots = new ConcurrentHashMap<>(); for (int i = 0; i < earliestTimeslotsSize; i++) { - String database = String.valueOf(byteBuffer.getChar()); + String database = ReadWriteIOUtils.readString(byteBuffer); long timeslot = byteBuffer.getLong(); earliestTimeslots.put(database, timeslot); } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 3cec10b91036e..7999afe65ddc8 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -156,12 +156,14 @@ protected void start() throws IoTDBException { } active(); LOGGER.info("IoTDB started"); - try { - dataPartitionTableCheckFuture.get(); - } catch (ExecutionException | InterruptedException e) { - LOGGER.error("Data partition table check task execute failed", e); - } finally { - dataPartitionTableCheckExecutor.shutdownNow(); + if (dataPartitionTableCheckFuture != null) { + try { + dataPartitionTableCheckFuture.get(); + } catch (ExecutionException | InterruptedException e) { + LOGGER.error("Data partition table check task execute failed", e); + } finally { + dataPartitionTableCheckExecutor.shutdownNow(); + } } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index a2a0288167ca6..93417bae91f7f 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -24,7 +24,6 @@ import org.apache.iotdb.common.rpc.thrift.TSeriesPartitionSlot; import org.apache.iotdb.common.rpc.thrift.TTimePartitionSlot; import org.apache.iotdb.commons.partition.DataPartitionTable; -import org.apache.iotdb.commons.partition.DatabaseScopedDataPartitionTable; import org.apache.iotdb.commons.partition.SeriesPartitionTable; import org.apache.iotdb.commons.partition.executor.SeriesPartitionExecutor; import org.apache.iotdb.commons.utils.TimePartitionUtils; @@ -34,7 +33,6 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; - import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,7 +44,6 @@ import java.util.Set; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -123,7 +120,16 @@ public CompletableFuture startGeneration() { } status = TaskStatus.IN_PROGRESS; - return CompletableFuture.runAsync(this::generateDataPartitionTableByMemory); + return CompletableFuture.runAsync(() -> { + try { + generateDataPartitionTableByMemory(); + } catch (Throwable t) { + status = TaskStatus.FAILED; + errorMessage = "Failed to generate DataPartitionTable: " + t.getMessage(); + LOG.error("Failed to generate DataPartitionTable asynchronously", t); + throw t; + } + }, executor); } private void generateDataPartitionTableByMemory() { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 38c02dd4d22ba..456bc1bc3f437 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3343,13 +3343,13 @@ private void setResponseFields(T resp, int errorCode, String message, TSStat /** Process data directory to find the earliest timeslots for each database. */ private void processDataDirectoryForEarliestTimeslots( File dataDir, Map earliestTimeslots) { - try { - Files.list(dataDir.toPath()) + try (Stream sequenceTypePaths = Files.list(dataDir.toPath())) { + sequenceTypePaths .filter(Files::isDirectory) .forEach( sequenceTypePath -> { - try { - Files.list(sequenceTypePath) + try (Stream dbPaths = Files.list(sequenceTypePath)) { + dbPaths .filter(Files::isDirectory) .forEach( dbPath -> { @@ -3393,16 +3393,16 @@ private long findEarliestTimeslotInDatabase(File databaseDir) { ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), new ThreadPoolExecutor.CallerRunsPolicy()); - try { - Files.list(databaseDir.toPath()) + try (Stream databasePaths = Files.list(databaseDir.toPath())) { + databasePaths .filter(Files::isDirectory) .forEach( regionPath -> { Future future = findEarliestTimeSlotExecutor.submit( () -> { - try { - Files.list(regionPath) + try (Stream regionPaths = Files.list(regionPath)) { + regionPaths .filter(Files::isDirectory) .forEach( timeSlotPath -> { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index 059663c5a6aea..b79ffc578e83f 100644 --- 
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -125,16 +125,20 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc public Set getDevices( String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { tsFileResource.readLock(); - try (InputStream inputStream = - FSFactoryProducer.getFSFactory() - .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { - // The first byte is VERSION_NUMBER, second byte is timeIndexType. - byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); - limiter.acquire(bytes.length); - if (bytes[1] == ARRAY_DEVICE_TIME_INDEX_TYPE) { - return ArrayDeviceTimeIndex.getDevices(inputStream); - } else { - return PlainDeviceTimeIndex.getDevices(inputStream); + try { + limiter.acquire(tsFileResource.getTsFileSize()); + + try (InputStream inputStream = + FSFactoryProducer.getFSFactory() + .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { + // The first byte is VERSION_NUMBER, second byte is timeIndexType. 
+ byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); + + if (bytes[1] == ARRAY_DEVICE_TIME_INDEX_TYPE) { + return ArrayDeviceTimeIndex.getDevices(inputStream); + } else { + return PlainDeviceTimeIndex.getDevices(inputStream); + } } } catch (NoSuchFileException e) { // deleted by ttl diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java index a07f6e313cdb2..93cca687799fc 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/enums/DataPartitionTableGeneratorState.java @@ -36,10 +36,10 @@ public int getCode() { } /** - * get DataNodeRemoveState by code + * get DataPartitionTableGeneratorState by code * * @param code code - * @return DataNodeRemoveState + * @return DataPartitionTableGeneratorState */ public static DataPartitionTableGeneratorState getStateByCode(int code) { for (DataPartitionTableGeneratorState state : DataPartitionTableGeneratorState.values()) { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java index 34be206870f04..1c1350b0cbbe4 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java @@ -73,10 +73,12 @@ public Map> getSeriesPartitionMap() } public void putDataPartition(TTimePartitionSlot timePartitionSlot, TConsensusGroupId groupId) { - seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()); - List groupList = seriesPartitionMap.get(timePartitionSlot); - if (!groupList.contains(groupId)) { - 
groupList.add(groupId); + List groupList = + seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()); + synchronized (groupList) { + if (!groupList.contains(groupId)) { + groupList.add(groupId); + } } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index 4eeddff9db7f9..341b111852061 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -130,17 +130,15 @@ public static long getTimePartitionIdWithoutOverflow(long time) { return partitionId.longValue(); } + /** Since bigTimePartitionInterval.multiply(partitionId) is always an exact multiple of + * bigTimePartitionInterval, the previous conditional logic was redundant and the else + * branch was unreachable. We directly compute the time without risk of overflow here. 
+ */ public static long getTimeWithoutOverflow(long partitionId) { - BigInteger bigTime = bigTimePartitionInterval.multiply(BigInteger.valueOf(partitionId)); - if (bigTime.compareTo(BigInteger.ZERO) > 0 - || bigTime.remainder(bigTimePartitionInterval).equals(BigInteger.ZERO)) { - return bigTime.add(bigTimePartitionOrigin).longValue(); - } - return BigInteger.valueOf(partitionId) - .add(BigInteger.ONE) - .multiply(bigTimePartitionInterval) - .add(bigTimePartitionOrigin) - .longValue(); + return bigTimePartitionInterval + .multiply(BigInteger.valueOf(partitionId)) + .add(bigTimePartitionOrigin) + .longValue(); } public static long getTimeByPartitionId(long partitionId) { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java index 7af863db614b4..54551ec0a0298 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java @@ -108,8 +108,25 @@ public long getTotalBytes() { return totalBytes.get(); } - /** Expected time based on bytes processed. */ + /** + * Calculate the expected time using double (double can easily hold nanoseconds on the order of 10^18), then perform clamping and convert to long. + * Advantages: Extremely simple, zero exceptions thrown, and double precision is sufficient (nanosecond-level errors are negligible). + * Disadvantages: In extreme cases (when totalBytes is close to 2^63), double loses precision in the trailing digits. However, in IoTDB's actual scenarios, bytesPerSecond is typically between 10MB/s and 1GB/s, so this situation will not occur. 
+ */ private long expectedTimeNs(long totalBytes) { - return startTimeNs + (totalBytes * 1_000_000_000L) / bytesPerSecond; + if (totalBytes <= 0) { + return startTimeNs; + } + + // Use double for calculations to avoid overflow in long multiplication + double seconds = (double) totalBytes / bytesPerSecond; + double elapsedNsDouble = seconds * 1_000_000_000.0; + + if (elapsedNsDouble > Long.MAX_VALUE - startTimeNs) { + // clamp + return Long.MAX_VALUE; + } + + return startTimeNs + (long) elapsedNsDouble; } } From b47f0fbe1f1bbd280d70349cf99047644e83ae54 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 14:15:55 +0800 Subject: [PATCH 22/39] Fix --- ...PartitionTableIntegrityCheckProcedure.java | 20 ++++++++++++------- .../DataPartitionTableGenerator.java | 11 +--------- .../impl/DataNodeInternalRPCServiceImpl.java | 2 +- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index ca97568055f86..c625dfd663db0 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -128,6 +128,7 @@ protected Flow executeFromState( case REQUEST_PARTITION_TABLES_HEART_BEAT: return requestPartitionTablesHeartBeat(); case MERGE_PARTITION_TABLES: + finalDataPartitionTables = new HashMap<>(); return mergePartitionTables(env); case WRITE_PARTITION_TABLE_TO_RAFT: return writePartitionTableToRaft(env); @@ -145,20 +146,26 @@ protected Flow executeFromState( protected void rollbackState( final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState 
state) throws IOException, InterruptedException, ProcedureException { + // Cleanup resources switch (state) { case COLLECT_EARLIEST_TIMESLOTS: + earliestTimeslots.clear(); + break; case ANALYZE_MISSING_PARTITIONS: + lostDataPartitionsOfDatabases.clear(); + break; case REQUEST_PARTITION_TABLES: case REQUEST_PARTITION_TABLES_HEART_BEAT: - case MERGE_PARTITION_TABLES: - case WRITE_PARTITION_TABLE_TO_RAFT: - // Cleanup resources - earliestTimeslots.clear(); dataPartitionTables.clear(); - allDataNodes.clear(); - finalDataPartitionTables = null; + break; + case MERGE_PARTITION_TABLES: + finalDataPartitionTables.clear(); break; default: + allDataNodes.clear(); + earliestTimeslots.clear(); + dataPartitionTables.clear(); + finalDataPartitionTables.clear(); throw new ProcedureException("Unknown state for rollback: " + state); } } @@ -858,7 +865,6 @@ private List deserializeDatabaseScopedTableLis try { ByteBuffer dataBuffer = data.duplicate(); - // 直接调用静态deserialize方法 DatabaseScopedDataPartitionTable table = DatabaseScopedDataPartitionTable.deserialize(dataBuffer); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 93417bae91f7f..989bdd017a474 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -120,16 +120,7 @@ public CompletableFuture startGeneration() { } status = TaskStatus.IN_PROGRESS; - return CompletableFuture.runAsync(() -> { - try { - generateDataPartitionTableByMemory(); - } catch (Throwable t) { - status = TaskStatus.FAILED; - errorMessage = "Failed to generate DataPartitionTable: " + t.getMessage(); - LOG.error("Failed to generate DataPartitionTable asynchronously", t); - throw t; - } - }, executor); + return 
CompletableFuture.runAsync(this::generateDataPartitionTableByMemory); } private void generateDataPartitionTableByMemory() { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 456bc1bc3f437..e2cb543209382 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3259,7 +3259,7 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb for (Map.Entry entry : dataPartitionTableMap.entrySet()) { String database = entry.getKey(); DataPartitionTable dataPartitionTable = entry.getValue(); - if (StringUtils.isEmpty(database) && dataPartitionTable != null) { + if (!StringUtils.isEmpty(database) && dataPartitionTable != null) { DatabaseScopedDataPartitionTable databaseScopedDataPartitionTable = new DatabaseScopedDataPartitionTable(database, dataPartitionTable); databaseScopedDataPartitionTableList.add(databaseScopedDataPartitionTable); success = true; From 4929a9136d98db765837bf32057a1e4dacbc92c6 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 14:44:44 +0800 Subject: [PATCH 23/39] Compare startTime between ConfigNode and DataNode, if the larger one is from ConfigNode, the data partition table is lost.
--- .../partition/DataPartitionTableIntegrityCheckProcedure.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index c625dfd663db0..68fabfcb3a877 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -315,7 +315,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { .min(Comparator.comparingLong(TTimePartitionSlot::getStartTime)) .orElse(null); - if (!TimePartitionUtils.satisfyPartitionId(localEarliestSlot.getStartTime(), earliestTimeslot)) { + if (localEarliestSlot.getStartTime() > TimePartitionUtils.getTimeByPartitionId(earliestTimeslot)) { lostDataPartitionsOfDatabases.add(database); LOG.warn( "Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", @@ -496,7 +496,7 @@ private Flow requestPartitionTablesHeartBeat() { Thread.currentThread().interrupt(); LOG.error("Error checking DataPartitionTable status due to thread interruption."); } - setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); + setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); return Flow.HAS_MORE_STATE; } From 6dd58ab8b1bead66750699a819c10c0d58a98dcf Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 14:46:26 +0800 Subject: [PATCH 24/39] Change to 10 seconds. 
--- .../partition/DataPartitionTableIntegrityCheckProcedure.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 68fabfcb3a877..ade49460ade4d 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -81,7 +81,7 @@ public class DataPartitionTableIntegrityCheckProcedure LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedure.class); private static final int MAX_RETRY_COUNT = 3; - private static final long HEART_BEAT_REQUEST_RATE = 60000; + private static final long HEART_BEAT_REQUEST_RATE = 10000; NodeManager dataNodeManager; private List allDataNodes = new ArrayList<>(); From 5a652a99839d49dc5329710fb0c2aab33766feab Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 14:54:34 +0800 Subject: [PATCH 25/39] Correct the input parameter name --- .../src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java index 2cf1452088c4e..102f7e733cd6d 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java @@ -4385,7 +4385,7 @@ public int getPartitionTableRecoverMaxReadBytesPerSecond() { return partitionTableRecoverMaxReadBytesPerSecond; } - public void setPartitionTableRecoverMaxReadBytesPerSecond(int 
partitionTableRecoverWorkerNum) { - this.partitionTableRecoverWorkerNum = partitionTableRecoverWorkerNum; + public void setPartitionTableRecoverMaxReadBytesPerSecond(int partitionTableRecoverMaxReadBytesPerSecond) { + this.partitionTableRecoverMaxReadBytesPerSecond = partitionTableRecoverMaxReadBytesPerSecond; } } From 83cb061e6355bb277a4793f0f1d95da8383c0c68 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 15:23:17 +0800 Subject: [PATCH 26/39] Changed to the local variable. --- .../thrift/impl/DataNodeInternalRPCServiceImpl.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index e2cb543209382..5a089685bfd41 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -430,8 +430,6 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface private static final String SYSTEM = "system"; - private Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); - // Must be lower than the RPC request timeout, in milliseconds private static final long timeoutMs = 50000; @@ -3343,6 +3341,7 @@ private void setResponseFields(T resp, int errorCode, String message, TSStat /** Process data directory to find the earliest timeslots for each database. 
*/ private void processDataDirectoryForEarliestTimeslots( File dataDir, Map earliestTimeslots) { + Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); try (Stream sequenceTypePaths = Files.list(dataDir.toPath())) { sequenceTypePaths .filter(Files::isDirectory) @@ -3360,7 +3359,7 @@ private void processDataDirectoryForEarliestTimeslots( } databaseEarliestRegionMap.computeIfAbsent( databaseName, key -> Long.MAX_VALUE); - long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile()); + long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile(), databaseEarliestRegionMap); if (earliestTimeslot != Long.MAX_VALUE) { earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); @@ -3377,7 +3376,7 @@ private void processDataDirectoryForEarliestTimeslots( } /** Find the earliest timeslot in a database directory. */ - private long findEarliestTimeslotInDatabase(File databaseDir) { + private long findEarliestTimeslotInDatabase(File databaseDir, Map databaseEarliestRegionMap) { String databaseName = databaseDir.getName(); List> futureList = new ArrayList<>(); From 57e80440b3e2a78c4a4072f2f85b8798ff90cafb Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 16:15:31 +0800 Subject: [PATCH 27/39] Fix all opinions --- ...PartitionTableIntegrityCheckProcedure.java | 80 +++++++++---------- .../org/apache/iotdb/db/conf/IoTDBConfig.java | 10 +-- .../apache/iotdb/db/conf/IoTDBDescriptor.java | 4 +- .../DataPartitionTableGenerator.java | 2 +- .../impl/DataNodeInternalRPCServiceImpl.java | 15 +++- .../DataNodeInternalRPCServiceImplTest.java | 2 - .../commons/utils/TimePartitionUtils.java | 8 -- 7 files changed, 59 insertions(+), 62 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 
ade49460ade4d..3b1f574589092 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -136,7 +136,7 @@ protected Flow executeFromState( throw new ProcedureException("Unknown state: " + state); } } catch (Exception e) { - LOG.error("Error executing state {}: {}", state, e.getMessage(), e); + LOG.error("[DataPartitionIntegrity] Error executing state {}: {}", state, e.getMessage(), e); setFailure("DataPartitionTableIntegrityCheckProcedure", e); return Flow.NO_MORE_STATE; } @@ -198,7 +198,7 @@ private Flow collectEarliestTimeslots() { if (allDataNodes.isEmpty()) { LOG.error( - "No DataNodes registered, no way to collect earliest timeslots, waiting for them to go up"); + "[DataPartitionIntegrity] No DataNodes registered, no way to collect earliest timeslots, waiting for them to go up"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -218,7 +218,7 @@ private Flow collectEarliestTimeslots() { if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { failedDataNodes.add(dataNode); LOG.error( - "Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", + "[DataPartitionIntegrity] Failed to collected earliest timeslots from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); continue; @@ -239,7 +239,7 @@ private Flow collectEarliestTimeslots() { } } catch (Exception e) { LOG.error( - "Failed to collect earliest timeslots from the DataNode[id={}]: {}", + "[DataPartitionIntegrity] Failed to collect earliest timeslots from the DataNode[id={}]: {}", dataNode.getLocation().getDataNodeId(), e.getMessage(), e); @@ -248,7 +248,7 @@ private Flow collectEarliestTimeslots() { 
} if (LOG.isDebugEnabled()) { - LOG.info( + LOG.debug( "Collected earliest timeslots from {} DataNodes: {}, the number of successful DataNodes is {}", allDataNodes.size(), earliestTimeslots, @@ -274,7 +274,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (earliestTimeslots.isEmpty()) { LOG.warn( - "No missing data partitions detected, nothing needs to be repaired, terminating procedure"); + "[DataPartitionIntegrity] No missing data partitions detected, nothing needs to be repaired, terminating procedure"); return Flow.NO_MORE_STATE; } @@ -294,7 +294,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { || localDataPartitionTable.get(database).isEmpty()) { lostDataPartitionsOfDatabases.add(database); LOG.warn( - "No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", + "[DataPartitionIntegrity] No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", database); continue; } @@ -318,7 +318,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (localEarliestSlot.getStartTime() > TimePartitionUtils.getTimeByPartitionId(earliestTimeslot)) { lostDataPartitionsOfDatabases.add(database); LOG.warn( - "Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", + "[DataPartitionIntegrity] Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", database, earliestTimeslot); } @@ -326,12 +326,12 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } if (lostDataPartitionsOfDatabases.isEmpty()) { - LOG.info("No databases have lost data partitions, terminating procedure"); + LOG.info("[DataPartitionIntegrity] No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; } LOG.info( - "Identified {} databases have lost data partitions, will 
request DataPartitionTable generation from {} DataNodes", + "[DataPartitionIntegrity] Identified {} databases have lost data partitions, will request DataPartitionTable generation from {} DataNodes", lostDataPartitionsOfDatabases.size(), allDataNodes.size() - failedDataNodes.size()); setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); @@ -374,7 +374,7 @@ private Flow requestPartitionTables() { if (allDataNodes.isEmpty()) { LOG.error( - "No DataNodes registered, no way to requested DataPartitionTable generation, terminating procedure"); + "[DataPartitionIntegrity] No DataNodes registered, no way to requested DataPartitionTable generation, terminating procedure"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -398,14 +398,14 @@ private Flow requestPartitionTables() { if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { failedDataNodes.add(dataNode); LOG.error( - "Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", + "[DataPartitionIntegrity] Failed to request DataPartitionTable generation from the DataNode[id={}], response status is {}", dataNode.getLocation().getDataNodeId(), resp.getStatus()); } } catch (Exception e) { failedDataNodes.add(dataNode); LOG.error( - "Failed to request DataPartitionTable generation from DataNode[id={}]: {}", + "[DataPartitionIntegrity] Failed to request DataPartitionTable generation from DataNode[id={}]: {}", dataNodeId, e.getMessage(), e); @@ -425,7 +425,7 @@ private Flow requestPartitionTables() { private Flow requestPartitionTablesHeartBeat() { if (LOG.isDebugEnabled()) { - LOG.info("Checking DataPartitionTable generation completion status..."); + LOG.debug("Checking DataPartitionTable generation completion status..."); } int completeCount = 0; @@ -447,7 +447,7 @@ private Flow requestPartitionTablesHeartBeat() { if (resp.getStatus().getCode() != 
TSStatusCode.SUCCESS_STATUS.getStatusCode()) { LOG.error( - "Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", + "[DataPartitionIntegrity] Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", dataNode.getLocation().getDataNodeId(), state, resp.getStatus()); @@ -460,21 +460,21 @@ private Flow requestPartitionTablesHeartBeat() { List databaseScopedDataPartitionTableList = deserializeDatabaseScopedTableList(byteBufferList); dataPartitionTables.put(dataNodeId, databaseScopedDataPartitionTableList); LOG.info( - "DataNode {} completed DataPartitionTable generation, terminating heart beat", + "[DataPartitionIntegrity] DataNode {} completed DataPartitionTable generation, terminating heart beat", dataNodeId); completeCount++; break; case IN_PROGRESS: - LOG.info("DataNode {} still generating DataPartitionTable", dataNodeId); + LOG.info("[DataPartitionIntegrity] DataNode {} still generating DataPartitionTable", dataNodeId); break; default: LOG.error( - "DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); + "[DataPartitionIntegrity] DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); break; } } catch (Exception e) { LOG.error( - "Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + "[DataPartitionIntegrity] Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", dataNodeId, e.getMessage(), e); @@ -494,7 +494,7 @@ private Flow requestPartitionTablesHeartBeat() { Thread.sleep(HEART_BEAT_REQUEST_RATE); } catch (InterruptedException e) { Thread.currentThread().interrupt(); - LOG.error("Error checking DataPartitionTable status due to thread interruption."); + LOG.error("[DataPartitionIntegrity] Error checking DataPartitionTable status due to thread interruption."); } 
setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); return Flow.HAS_MORE_STATE; @@ -503,11 +503,11 @@ private Flow requestPartitionTablesHeartBeat() { /** Merge DataPartitionTables from all DataNodes into a final table. */ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { - LOG.info("Merging DataPartitionTables from {} DataNodes...", dataPartitionTables.size()); + LOG.debug("Merging DataPartitionTables from {} DataNodes...", dataPartitionTables.size()); } if (dataPartitionTables.isEmpty()) { - LOG.error("No DataPartitionTables to merge, dataPartitionTables is empty"); + LOG.error("[DataPartitionIntegrity] No DataPartitionTables to merge, dataPartitionTables is empty"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -525,7 +525,7 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { || localDataPartitionTableMap.get(database) == null || localDataPartitionTableMap.get(database).isEmpty()) { LOG.warn( - "No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", + "[DataPartitionIntegrity] No data partition table related to database {} was found from the ConfigNode, use data partition table of DataNode directly", database); continue; } @@ -554,7 +554,7 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { })); } - LOG.info("DataPartitionTables merge completed successfully"); + LOG.info("[DataPartitionIntegrity] DataPartitionTables merge completed successfully"); setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); return Flow.HAS_MORE_STATE; } @@ -562,11 +562,11 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { /** Write the final DataPartitionTable to raft log. 
*/ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { - LOG.info("Writing DataPartitionTable to raft log..."); + LOG.debug("Writing DataPartitionTable to raft log..."); } if (lostDataPartitionsOfDatabases.isEmpty()) { - LOG.error("No database lost data partition table"); + LOG.error("[DataPartitionIntegrity] No database lost data partition table"); setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("No database lost data partition table for raft write")); @@ -574,7 +574,7 @@ } if (finalDataPartitionTables.isEmpty()) { - LOG.error("No DataPartitionTable to write to raft"); + LOG.error("[DataPartitionIntegrity] No DataPartitionTable to write to raft"); setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("No DataPartitionTable available for raft write")); @@ -593,16 +593,16 @@ TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); if (tsStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOG.info("DataPartitionTable successfully written to raft log"); + LOG.info("[DataPartitionIntegrity] DataPartitionTable successfully written to raft log"); break; } else { - LOG.error("Failed to write DataPartitionTable to raft log"); + LOG.error("[DataPartitionIntegrity] Failed to write DataPartitionTable to raft log"); setFailure( "DataPartitionTableIntegrityCheckProcedure", new ProcedureException("Failed to write DataPartitionTable to raft log")); } } catch (Exception e) { - LOG.error("Error writing DataPartitionTable to raft log", e); + LOG.error("[DataPartitionIntegrity] Error writing DataPartitionTable to raft log", e); setFailure("DataPartitionTableIntegrityCheckProcedure", e); } failedCnt++; @@ -658,7 +658,7 @@ public void serialize(final DataOutputStream stream) throws
IOException { // data written for a single object stream.write(data); } catch (IOException | TException e) { - LOG.error("{} serialize failed for dataNodeId: {}", this.getClass().getSimpleName(), entry.getKey(), e); + LOG.error("[DataPartitionIntegrity] {} serialize failed for dataNodeId: {}", this.getClass().getSimpleName(), entry.getKey(), e); throw new IOException("Failed to serialize dataPartitionTables", e); } } @@ -687,7 +687,7 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(data.length); stream.write(data); } catch (IOException | TException e) { - LOG.error("{} serialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); + LOG.error("[DataPartitionIntegrity] {} serialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); throw new IOException("Failed to serialize finalDataPartitionTables", e); } } @@ -706,7 +706,7 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(data.length); stream.write(data); } catch (TException e) { - LOG.error("Failed to serialize skipDataNode", e); + LOG.error("[DataPartitionIntegrity] Failed to serialize skipDataNode", e); throw new IOException("Failed to serialize skipDataNode", e); } } @@ -722,7 +722,7 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(data.length); stream.write(data); } catch (TException e) { - LOG.error("Failed to serialize failedDataNode", e); + LOG.error("[DataPartitionIntegrity] Failed to serialize failedDataNode", e); throw new IOException("Failed to serialize failedDataNode", e); } } @@ -766,7 +766,7 @@ public void deserialize(final ByteBuffer byteBuffer) { tableList.add(table); } catch (IOException | TException e) { - LOG.error("{} deserialize failed for dataNodeId: {}", this.getClass().getSimpleName(), dataNodeId, e); + LOG.error("[DataPartitionIntegrity] {} deserialize failed for dataNodeId: {}", this.getClass().getSimpleName(), 
dataNodeId, e); throw new RuntimeException("Failed to deserialize dataPartitionTables", e); } } @@ -803,7 +803,7 @@ public void deserialize(final ByteBuffer byteBuffer) { finalDataPartitionTables.put(database, dataPartitionTable); } catch (IOException | TException e) { - LOG.error("{} deserialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); + LOG.error("[DataPartitionIntegrity] {} deserialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); throw new RuntimeException("Failed to deserialize finalDataPartitionTables", e); } } @@ -823,7 +823,7 @@ public void deserialize(final ByteBuffer byteBuffer) { dataNode.read(protocol); skipDataNodes.add(dataNode); } catch (TException | IOException e) { - LOG.error("Failed to deserialize skipDataNode", e); + LOG.error("[DataPartitionIntegrity] Failed to deserialize skipDataNode", e); throw new RuntimeException(e); } } @@ -843,7 +843,7 @@ public void deserialize(final ByteBuffer byteBuffer) { dataNode.read(protocol); failedDataNodes.add(dataNode); } catch (TException | IOException e) { - LOG.error("Failed to deserialize failedDataNode", e); + LOG.error("[DataPartitionIntegrity] Failed to deserialize failedDataNode", e); throw new RuntimeException(e); } } @@ -858,7 +858,7 @@ private List deserializeDatabaseScopedTableLis for (ByteBuffer data : dataList) { if (data == null || data.remaining() == 0) { - LOG.warn("Skipping empty ByteBuffer during deserialization"); + LOG.warn("[DataPartitionIntegrity] Skipping empty ByteBuffer during deserialization"); continue; } @@ -871,7 +871,7 @@ private List deserializeDatabaseScopedTableLis result.add(table); } catch (Exception e) { - LOG.error("Failed to deserialize DatabaseScopedDataPartitionTable", e); + LOG.error("[DataPartitionIntegrity] Failed to deserialize DatabaseScopedDataPartitionTable", e); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java index 102f7e733cd6d..701c182653ed7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java @@ -1222,7 +1222,7 @@ public class IoTDBConfig { /* Need use these parameters when repair data partition table */ private int partitionTableRecoverWorkerNum = 10; // Rate limit set to 10 MB/s - private int partitionTableRecoverMaxReadBytesPerSecond = 10; + private int partitionTableRecoverMaxReadMBsPerSecond = 10; IoTDBConfig() {} @@ -4381,11 +4381,11 @@ public void setPartitionTableRecoverWorkerNum(int partitionTableRecoverWorkerNum this.partitionTableRecoverWorkerNum = partitionTableRecoverWorkerNum; } - public int getPartitionTableRecoverMaxReadBytesPerSecond() { - return partitionTableRecoverMaxReadBytesPerSecond; + public int getPartitionTableRecoverMaxReadMBsPerSecond() { + return partitionTableRecoverMaxReadMBsPerSecond; } - public void setPartitionTableRecoverMaxReadBytesPerSecond(int partitionTableRecoverMaxReadBytesPerSecond) { - this.partitionTableRecoverMaxReadBytesPerSecond = partitionTableRecoverMaxReadBytesPerSecond; + public void setPartitionTableRecoverMaxReadMBsPerSecond(int partitionTableRecoverMaxReadMBsPerSecond) { + this.partitionTableRecoverMaxReadMBsPerSecond = partitionTableRecoverMaxReadMBsPerSecond; } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index a5e89bb250dfb..5b49ce83ee2ff 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -1144,11 +1144,11 @@ public void loadProperties(TrimProperties properties) throws BadNodeUrlException properties.getProperty( "partition_table_recover_worker_num", 
String.valueOf(conf.getPartitionTableRecoverWorkerNum())))); - conf.setPartitionTableRecoverMaxReadBytesPerSecond( + conf.setPartitionTableRecoverMaxReadMBsPerSecond( Integer.parseInt( properties.getProperty( "partition_table_recover_max_read_bytes_per_second", - String.valueOf(conf.getPartitionTableRecoverMaxReadBytesPerSecond())))); + String.valueOf(conf.getPartitionTableRecoverMaxReadMBsPerSecond())))); conf.setIncludeNullValueInWriteThroughputMetric( Boolean.parseBoolean( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 989bdd017a474..4cde31960a148 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -77,7 +77,7 @@ public class DataPartitionTableGenerator { (long) IoTDBDescriptor.getInstance() .getConfig() - .getPartitionTableRecoverMaxReadBytesPerSecond() + .getPartitionTableRecoverMaxReadMBsPerSecond() * 1024 * 1024); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 5a089685bfd41..a7e50ce9063de 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -430,9 +430,6 @@ public class DataNodeInternalRPCServiceImpl implements IDataNodeRPCService.Iface private static final String SYSTEM = "system"; - // Must be lower than the RPC request timeout, in milliseconds - private static final long timeoutMs = 50000; - public DataNodeInternalRPCServiceImpl() { super(); 
partitionFetcher = ClusterPartitionFetcher.getInstance(); @@ -3237,6 +3234,8 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable( @Override public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartbeat() { TGenerateDataPartitionTableHeartbeatResp resp = new TGenerateDataPartitionTableHeartbeatResp(); + // Must be lower than the RPC request timeout, in milliseconds + final long timeoutMs = 50000; // Set default value resp.setDatabaseScopedDataPartitionTables(Collections.emptyList()); try { @@ -3308,6 +3307,9 @@ private void updateResponse(T resp) { setResponseFields(resp, DataPartitionTableGeneratorState.IN_PROGRESS.getCode(), String.format( "DataPartitionTable generation in progress: %.1f%%", currentGenerator.getProgress() * 100), RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + LOGGER.info(String.format( + "DataPartitionTable generation with task ID: %s in progress: %.1f%%", + currentTaskId, currentGenerator.getProgress() * 100)); break; case COMPLETED: setResponseFields(resp, DataPartitionTableGeneratorState.SUCCESS.getCode(), "DataPartitionTable generation completed successfully", RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); @@ -3338,7 +3340,12 @@ private void setResponseFields(T resp, int errorCode, String message, TSStat } } - /** Process data directory to find the earliest timeslots for each database. */ + /** + * Process data directory to find the earliest timeslots for each database. 
+ * Map earliestTimeslots + * key(String): database name + * value(Long): the earliest time slot id of the database + */ private void processDataDirectoryForEarliestTimeslots( File dataDir, Map earliestTimeslots) { Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); diff --git a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java index 066e1bea5bfd7..e4be3e884d45c 100644 --- a/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java +++ b/iotdb-core/datanode/src/test/java/org/apache/iotdb/db/service/DataNodeInternalRPCServiceImplTest.java @@ -83,8 +83,6 @@ public class DataNodeInternalRPCServiceImplTest { - private static final Logger LOG = - LoggerFactory.getLogger(DataNodeInternalRPCServiceImplTest.class); private static final IoTDBConfig conf = IoTDBDescriptor.getInstance().getConfig(); DataNodeInternalRPCServiceImpl dataNodeInternalRPCServiceImpl; private static IConsensus instance; diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index 341b111852061..73d68787eee6c 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -157,14 +157,6 @@ public static boolean satisfyPartitionId(long startTime, long endTime, long part return startPartition <= partitionId && endPartition >= partitionId; } - public static boolean satisfyPartitionId(long startTime, long partitionId) { - long endTime = - startTime >= timePartitionLowerBoundWithoutOverflow - ? 
Long.MAX_VALUE - : (startTime + timePartitionInterval - 1); - return satisfyPartitionId(startTime, endTime, partitionId); - } - public static boolean satisfyPartitionStartTime(Filter timeFilter, long partitionStartTime) { if (timeFilter == null) { return true; From 617b9ce1dfc6e8dbdd855120e0b03f2ad8a5cdd5 Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 16:55:28 +0800 Subject: [PATCH 28/39] spotless code --- ...PartitionTableIntegrityCheckProcedure.java | 183 +++++++++++------- .../iotdb/confignode/service/ConfigNode.java | 1 + .../org/apache/iotdb/db/conf/IoTDBConfig.java | 3 +- .../DataPartitionTableGenerator.java | 22 ++- .../impl/DataNodeInternalRPCServiceImpl.java | 74 ++++--- .../DataNodeInternalRPCServiceImplTest.java | 2 - .../commons/partition/DataPartitionTable.java | 28 +-- .../DatabaseScopedDataPartitionTable.java | 8 +- .../partition/SeriesPartitionTable.java | 11 +- .../commons/utils/TimePartitionUtils.java | 15 +- .../rateLimiter/LeakyBucketRateLimiter.java | 9 +- 11 files changed, 219 insertions(+), 137 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 3b1f574589092..de4d01943e3dc 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -44,6 +44,7 @@ import org.apache.iotdb.mpp.rpc.thrift.TGenerateDataPartitionTableResp; import org.apache.iotdb.mpp.rpc.thrift.TGetEarliestTimeslotsResp; import org.apache.iotdb.rpc.TSStatusCode; + import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import 
org.apache.thrift.transport.TIOStreamTransport; @@ -91,16 +92,19 @@ public class DataPartitionTableIntegrityCheckProcedure private Map earliestTimeslots = new ConcurrentHashMap<>(); /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ - private Map> dataPartitionTables = new ConcurrentHashMap<>(); + private Map> dataPartitionTables = + new ConcurrentHashMap<>(); private Set lostDataPartitionsOfDatabases = new HashSet<>(); /** Final merged DataPartitionTable */ private Map finalDataPartitionTables; - private static Set skipDataNodes = Collections.newSetFromMap(new ConcurrentHashMap<>()); + private static Set skipDataNodes = + Collections.newSetFromMap(new ConcurrentHashMap<>()); private static Set failedDataNodes = Collections.newSetFromMap(new ConcurrentHashMap<>()); + // ============Need serialize END=============/ public DataPartitionTableIntegrityCheckProcedure() { @@ -310,23 +314,25 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { continue; } - TTimePartitionSlot localEarliestSlot = tTimePartitionSlotListMap.keySet() - .stream() + TTimePartitionSlot localEarliestSlot = + tTimePartitionSlotListMap.keySet().stream() .min(Comparator.comparingLong(TTimePartitionSlot::getStartTime)) .orElse(null); - if (localEarliestSlot.getStartTime() > TimePartitionUtils.getTimeByPartitionId(earliestTimeslot)) { + if (localEarliestSlot.getStartTime() + > TimePartitionUtils.getTimeByPartitionId(earliestTimeslot)) { lostDataPartitionsOfDatabases.add(database); LOG.warn( - "[DataPartitionIntegrity] Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", - database, - earliestTimeslot); + "[DataPartitionIntegrity] Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", + database, + earliestTimeslot); } } } if (lostDataPartitionsOfDatabases.isEmpty()) { - LOG.info("[DataPartitionIntegrity] No databases have lost data partitions, 
terminating procedure"); + LOG.info( + "[DataPartitionIntegrity] No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; } @@ -419,7 +425,8 @@ private Flow requestPartitionTables() { return Flow.HAS_MORE_STATE; } - setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); + setNextState( + DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); return Flow.HAS_MORE_STATE; } @@ -435,49 +442,54 @@ private Flow requestPartitionTablesHeartBeat() { if (!dataPartitionTables.containsKey(dataNodeId)) { try { TGenerateDataPartitionTableHeartbeatResp resp = - (TGenerateDataPartitionTableHeartbeatResp) - SyncDataNodeClientPool.getInstance() - .sendSyncRequestToDataNodeWithGivenRetry( - dataNode.getLocation().getInternalEndPoint(), - null, - CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, - MAX_RETRY_COUNT); + (TGenerateDataPartitionTableHeartbeatResp) + SyncDataNodeClientPool.getInstance() + .sendSyncRequestToDataNodeWithGivenRetry( + dataNode.getLocation().getInternalEndPoint(), + null, + CnToDnSyncRequestType.GENERATE_DATA_PARTITION_TABLE_HEART_BEAT, + MAX_RETRY_COUNT); DataPartitionTableGeneratorState state = - DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); + DataPartitionTableGeneratorState.getStateByCode(resp.getErrorCode()); if (resp.getStatus().getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { LOG.error( - "[DataPartitionIntegrity] Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", - dataNode.getLocation().getDataNodeId(), - state, - resp.getStatus()); + "[DataPartitionIntegrity] Failed to request DataPartitionTable generation heart beat from the DataNode[id={}], state is {}, response status is {}", + dataNode.getLocation().getDataNodeId(), + state, + resp.getStatus()); continue; } switch (state) { case SUCCESS: List byteBufferList = 
resp.getDatabaseScopedDataPartitionTables(); - List databaseScopedDataPartitionTableList = deserializeDatabaseScopedTableList(byteBufferList); + List databaseScopedDataPartitionTableList = + deserializeDatabaseScopedTableList(byteBufferList); dataPartitionTables.put(dataNodeId, databaseScopedDataPartitionTableList); LOG.info( - "[DataPartitionIntegrity] DataNode {} completed DataPartitionTable generation, terminating heart beat", - dataNodeId); + "[DataPartitionIntegrity] DataNode {} completed DataPartitionTable generation, terminating heart beat", + dataNodeId); completeCount++; break; case IN_PROGRESS: - LOG.info("[DataPartitionIntegrity] DataNode {} still generating DataPartitionTable", dataNodeId); + LOG.info( + "[DataPartitionIntegrity] DataNode {} still generating DataPartitionTable", + dataNodeId); break; default: LOG.error( - "[DataPartitionIntegrity] DataNode {} returned unknown error code: {}", dataNodeId, resp.getErrorCode()); + "[DataPartitionIntegrity] DataNode {} returned unknown error code: {}", + dataNodeId, + resp.getErrorCode()); break; } } catch (Exception e) { LOG.error( - "[DataPartitionIntegrity] Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", - dataNodeId, - e.getMessage(), - e); + "[DataPartitionIntegrity] Error checking DataPartitionTable status from DataNode {}: {}, terminating heart beat", + dataNodeId, + e.getMessage(), + e); completeCount++; } } else { @@ -494,9 +506,11 @@ private Flow requestPartitionTablesHeartBeat() { Thread.sleep(HEART_BEAT_REQUEST_RATE); } catch (InterruptedException e) { Thread.currentThread().interrupt(); - LOG.error("[DataPartitionIntegrity] Error checking DataPartitionTable status due to thread interruption."); + LOG.error( + "[DataPartitionIntegrity] Error checking DataPartitionTable status due to thread interruption."); } - setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); + setNextState( + 
DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); return Flow.HAS_MORE_STATE; } @@ -507,7 +521,8 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { } if (dataPartitionTables.isEmpty()) { - LOG.error("[DataPartitionIntegrity] No DataPartitionTables to merge, dataPartitionTables is empty"); + LOG.error( + "[DataPartitionIntegrity] No DataPartitionTables to merge, dataPartitionTables is empty"); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -546,12 +561,18 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { k -> new SeriesPartitionTable(seriesPartitionTableMap)); })); - dataPartitionTables.forEach((k, v) -> v.forEach(databaseScopedDataPartitionTable -> { - if (!databaseScopedDataPartitionTable.getDatabase().equals(database)) { - return; - } - finalDataPartitionTables.put(database, new DataPartitionTable(finalDataPartitionMap).merge(databaseScopedDataPartitionTable.getDataPartitionTable())); - })); + dataPartitionTables.forEach( + (k, v) -> + v.forEach( + databaseScopedDataPartitionTable -> { + if (!databaseScopedDataPartitionTable.getDatabase().equals(database)) { + return; + } + finalDataPartitionTables.put( + database, + new DataPartitionTable(finalDataPartitionMap) + .merge(databaseScopedDataPartitionTable.getDataPartitionTable())); + })); } LOG.info("[DataPartitionIntegrity] DataPartitionTables merge completed successfully"); @@ -611,7 +632,12 @@ private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { return getFlow(); } - /** Determine whether there are still DataNode nodes with failed execution of a certain step in this round. 
If such nodes exist, calculate the skipDataNodes and exclude these nodes when requesting the list of DataNode nodes in the cluster for the next round; if no such nodes exist, it means the procedure has been completed */ + /** + * Determine whether there are still DataNode nodes with failed execution of a certain step in + * this round. If such nodes exist, calculate the skipDataNodes and exclude these nodes when + * requesting the list of DataNode nodes in the cluster for the next round; if no such nodes + * exist, it means the procedure has been completed + */ private Flow getFlow() { if (!failedDataNodes.isEmpty()) { allDataNodes.removeAll(failedDataNodes); @@ -637,7 +663,8 @@ public void serialize(final DataOutputStream stream) throws IOException { // Serialize dataPartitionTables count stream.writeInt(dataPartitionTables.size()); - for (Map.Entry> entry : dataPartitionTables.entrySet()) { + for (Map.Entry> entry : + dataPartitionTables.entrySet()) { stream.writeInt(entry.getKey()); List tableList = entry.getValue(); @@ -645,7 +672,7 @@ public void serialize(final DataOutputStream stream) throws IOException { for (DatabaseScopedDataPartitionTable table : tableList) { try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos)) { + DataOutputStream dos = new DataOutputStream(baos)) { TTransport transport = new TIOStreamTransport(dos); TBinaryProtocol protocol = new TBinaryProtocol(transport); @@ -658,7 +685,11 @@ public void serialize(final DataOutputStream stream) throws IOException { // data written for a single object stream.write(data); } catch (IOException | TException e) { - LOG.error("[DataPartitionIntegrity] {} serialize failed for dataNodeId: {}", this.getClass().getSimpleName(), entry.getKey(), e); + LOG.error( + "[DataPartitionIntegrity] {} serialize failed for dataNodeId: {}", + this.getClass().getSimpleName(), + entry.getKey(), + e); throw new IOException("Failed to serialize 
dataPartitionTables", e); } } @@ -669,31 +700,34 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeUTF(database); } - if (finalDataPartitionTables != null && !finalDataPartitionTables.isEmpty()) { - stream.writeInt(finalDataPartitionTables.size()); + if (finalDataPartitionTables != null && !finalDataPartitionTables.isEmpty()) { + stream.writeInt(finalDataPartitionTables.size()); - for (Map.Entry entry : finalDataPartitionTables.entrySet()) { - stream.writeUTF(entry.getKey()); + for (Map.Entry entry : finalDataPartitionTables.entrySet()) { + stream.writeUTF(entry.getKey()); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos)) { + try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(dos); - TBinaryProtocol protocol = new TBinaryProtocol(transport); + TTransport transport = new TIOStreamTransport(dos); + TBinaryProtocol protocol = new TBinaryProtocol(transport); - entry.getValue().serialize(dos, protocol); + entry.getValue().serialize(dos, protocol); - byte[] data = baos.toByteArray(); - stream.writeInt(data.length); - stream.write(data); - } catch (IOException | TException e) { - LOG.error("[DataPartitionIntegrity] {} serialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); - throw new IOException("Failed to serialize finalDataPartitionTables", e); - } + byte[] data = baos.toByteArray(); + stream.writeInt(data.length); + stream.write(data); + } catch (IOException | TException e) { + LOG.error( + "[DataPartitionIntegrity] {} serialize finalDataPartitionTables failed", + this.getClass().getSimpleName(), + e); + throw new IOException("Failed to serialize finalDataPartitionTables", e); } - } else { - stream.writeInt(0); } + } else { + stream.writeInt(0); + } stream.writeInt(skipDataNodes.size()); for (TDataNodeConfiguration 
skipDataNode : skipDataNodes) { @@ -756,17 +790,21 @@ public void deserialize(final ByteBuffer byteBuffer) { byteBuffer.get(bytes); try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - DataInputStream dis = new DataInputStream(bais)) { + DataInputStream dis = new DataInputStream(bais)) { TTransport transport = new TIOStreamTransport(dis); TBinaryProtocol protocol = new TBinaryProtocol(transport); DatabaseScopedDataPartitionTable table = - DatabaseScopedDataPartitionTable.deserialize(dis, protocol); + DatabaseScopedDataPartitionTable.deserialize(dis, protocol); tableList.add(table); } catch (IOException | TException e) { - LOG.error("[DataPartitionIntegrity] {} deserialize failed for dataNodeId: {}", this.getClass().getSimpleName(), dataNodeId, e); + LOG.error( + "[DataPartitionIntegrity] {} deserialize failed for dataNodeId: {}", + this.getClass().getSimpleName(), + dataNodeId, + e); throw new RuntimeException("Failed to deserialize dataPartitionTables", e); } } @@ -792,7 +830,7 @@ public void deserialize(final ByteBuffer byteBuffer) { byteBuffer.get(bytes); try (ByteArrayInputStream bais = new ByteArrayInputStream(bytes); - DataInputStream dis = new DataInputStream(bais)) { + DataInputStream dis = new DataInputStream(bais)) { TTransport transport = new TIOStreamTransport(dis); TBinaryProtocol protocol = new TBinaryProtocol(transport); @@ -803,7 +841,10 @@ public void deserialize(final ByteBuffer byteBuffer) { finalDataPartitionTables.put(database, dataPartitionTable); } catch (IOException | TException e) { - LOG.error("[DataPartitionIntegrity] {} deserialize finalDataPartitionTables failed", this.getClass().getSimpleName(), e); + LOG.error( + "[DataPartitionIntegrity] {} deserialize finalDataPartitionTables failed", + this.getClass().getSimpleName(), + e); throw new RuntimeException("Failed to deserialize finalDataPartitionTables", e); } } @@ -849,7 +890,8 @@ public void deserialize(final ByteBuffer byteBuffer) { } } - private List 
deserializeDatabaseScopedTableList(List dataList) { + private List deserializeDatabaseScopedTableList( + List dataList) { if (dataList == null || dataList.isEmpty()) { return Collections.emptyList(); } @@ -866,12 +908,13 @@ private List deserializeDatabaseScopedTableLis ByteBuffer dataBuffer = data.duplicate(); DatabaseScopedDataPartitionTable table = - DatabaseScopedDataPartitionTable.deserialize(dataBuffer); + DatabaseScopedDataPartitionTable.deserialize(dataBuffer); result.add(table); } catch (Exception e) { - LOG.error("[DataPartitionIntegrity] Failed to deserialize DatabaseScopedDataPartitionTable", e); + LOG.error( + "[DataPartitionIntegrity] Failed to deserialize DatabaseScopedDataPartitionTable", e); } } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 7999afe65ddc8..33e9df4c662fc 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -69,6 +69,7 @@ import org.apache.iotdb.metrics.metricsets.net.NetMetrics; import org.apache.iotdb.metrics.metricsets.system.SystemMetrics; import org.apache.iotdb.rpc.TSStatusCode; + import org.apache.ratis.util.ExitUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java index 701c182653ed7..9b4d7ada28432 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBConfig.java @@ -4385,7 +4385,8 @@ public int getPartitionTableRecoverMaxReadMBsPerSecond() { return partitionTableRecoverMaxReadMBsPerSecond; } - public void setPartitionTableRecoverMaxReadMBsPerSecond(int 
partitionTableRecoverMaxReadMBsPerSecond) { + public void setPartitionTableRecoverMaxReadMBsPerSecond( + int partitionTableRecoverMaxReadMBsPerSecond) { this.partitionTableRecoverMaxReadMBsPerSecond = partitionTableRecoverMaxReadMBsPerSecond; } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 4cde31960a148..c85a377e395a3 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -33,6 +33,7 @@ import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; + import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -161,15 +162,18 @@ private void generateDataPartitionTableByMemory() { return; } - DataPartitionTable dataPartitionTable = new DataPartitionTable(dataPartitionMap); - - databasePartitionTableMap.compute(databaseName, (k,v) -> { - if (v == null) { - return new DataPartitionTable(dataPartitionMap); - } - v.merge(dataPartitionTable); - return v; - }); + DataPartitionTable dataPartitionTable = + new DataPartitionTable(dataPartitionMap); + + databasePartitionTableMap.compute( + databaseName, + (k, v) -> { + if (v == null) { + return new DataPartitionTable(dataPartitionMap); + } + v.merge(dataPartitionTable); + return v; + }); } catch (Exception e) { LOG.error("Error processing data region: {}", dataRegion.getDatabaseName(), e); failedFiles.incrementAndGet(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index a7e50ce9063de..e80a85fccd0b7 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -19,7 +19,6 @@ package org.apache.iotdb.db.protocol.thrift.impl; -import com.google.common.collect.ImmutableList; import org.apache.iotdb.common.rpc.thrift.TConfigNodeLocation; import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId; import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation; @@ -323,6 +322,8 @@ import org.apache.iotdb.service.rpc.thrift.TSInsertRecordReq; import org.apache.iotdb.trigger.api.enums.FailureStrategy; import org.apache.iotdb.trigger.api.enums.TriggerEvent; + +import com.google.common.collect.ImmutableList; import org.apache.thrift.TException; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; @@ -3250,14 +3251,17 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb parseGenerationStatus(resp); if (currentGenerator.getStatus().equals(DataPartitionTableGenerator.TaskStatus.COMPLETED)) { boolean success = false; - List databaseScopedDataPartitionTableList = new ArrayList<>(); - Map dataPartitionTableMap = currentGenerator.getDatabasePartitionTableMap(); + List databaseScopedDataPartitionTableList = + new ArrayList<>(); + Map dataPartitionTableMap = + currentGenerator.getDatabasePartitionTableMap(); if (!dataPartitionTableMap.isEmpty()) { for (Map.Entry entry : dataPartitionTableMap.entrySet()) { String database = entry.getKey(); DataPartitionTable dataPartitionTable = entry.getValue(); if (!StringUtils.isEmpty(database) && dataPartitionTable != null) { - DatabaseScopedDataPartitionTable databaseScopedDataPartitionTable = new 
DatabaseScopedDataPartitionTable(database, dataPartitionTable); + DatabaseScopedDataPartitionTable databaseScopedDataPartitionTable = + new DatabaseScopedDataPartitionTable(database, dataPartitionTable); databaseScopedDataPartitionTableList.add(databaseScopedDataPartitionTable); success = true; } @@ -3265,7 +3269,8 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb } if (success) { - List result = serializeDatabaseScopedTableList(databaseScopedDataPartitionTableList); + List result = + serializeDatabaseScopedTableList(databaseScopedDataPartitionTableList); resp.setDatabaseScopedDataPartitionTables(result); // Clear current generator @@ -3304,23 +3309,40 @@ private void updateResponse(T resp) { switch (currentGenerator.getStatus()) { case IN_PROGRESS: - setResponseFields(resp, DataPartitionTableGeneratorState.IN_PROGRESS.getCode(), String.format( + setResponseFields( + resp, + DataPartitionTableGeneratorState.IN_PROGRESS.getCode(), + String.format( "DataPartitionTable generation in progress: %.1f%%", - currentGenerator.getProgress() * 100), RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); - LOGGER.info(String.format( + currentGenerator.getProgress() * 100), + RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + LOGGER.info( + String.format( "DataPartitionTable generation with task ID: %s in progress: %.1f%%", currentTaskId, currentGenerator.getProgress() * 100)); break; case COMPLETED: - setResponseFields(resp, DataPartitionTableGeneratorState.SUCCESS.getCode(), "DataPartitionTable generation completed successfully", RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); + setResponseFields( + resp, + DataPartitionTableGeneratorState.SUCCESS.getCode(), + "DataPartitionTable generation completed successfully", + RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); LOGGER.info("DataPartitionTable generation completed with task ID: {}", currentTaskId); break; case FAILED: - setResponseFields(resp, 
DataPartitionTableGeneratorState.FAILED.getCode(), "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage(), RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + setResponseFields( + resp, + DataPartitionTableGeneratorState.FAILED.getCode(), + "DataPartitionTable generation failed: " + currentGenerator.getErrorMessage(), + RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); LOGGER.info("DataPartitionTable generation failed with task ID: {}", currentTaskId); break; default: - setResponseFields(resp, DataPartitionTableGeneratorState.UNKNOWN.getCode(), "Unknown task status: " + currentGenerator.getStatus(), RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); + setResponseFields( + resp, + DataPartitionTableGeneratorState.UNKNOWN.getCode(), + "Unknown task status: " + currentGenerator.getStatus(), + RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); LOGGER.info("DataPartitionTable generation failed with task ID: {}", currentTaskId); break; } @@ -3341,10 +3363,9 @@ private void setResponseFields(T resp, int errorCode, String message, TSStat } /** - * Process data directory to find the earliest timeslots for each database. - * Map earliestTimeslots - * key(String): database name - * value(Long): the earliest time slot id of the database + * Process data directory to find the earliest timeslots for each database. 
Map + * earliestTimeslots key(String): database name value(Long): the earliest time slot id of the + * database */ private void processDataDirectoryForEarliestTimeslots( File dataDir, Map earliestTimeslots) { @@ -3366,7 +3387,9 @@ private void processDataDirectoryForEarliestTimeslots( } databaseEarliestRegionMap.computeIfAbsent( databaseName, key -> Long.MAX_VALUE); - long earliestTimeslot = findEarliestTimeslotInDatabase(dbPath.toFile(), databaseEarliestRegionMap); + long earliestTimeslot = + findEarliestTimeslotInDatabase( + dbPath.toFile(), databaseEarliestRegionMap); if (earliestTimeslot != Long.MAX_VALUE) { earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); @@ -3383,7 +3406,8 @@ private void processDataDirectoryForEarliestTimeslots( } /** Find the earliest timeslot in a database directory. */ - private long findEarliestTimeslotInDatabase(File databaseDir, Map databaseEarliestRegionMap) { + private long findEarliestTimeslotInDatabase( + File databaseDir, Map databaseEarliestRegionMap) { String databaseName = databaseDir.getName(); List> futureList = new ArrayList<>(); @@ -3430,9 +3454,8 @@ private long findEarliestTimeslotInDatabase(File databaseDir, Map String timeSlotName = timeSlotPath.getFileName().toString(); long timeslot = Long.parseLong(timeSlotName); databaseEarliestRegionMap.compute( - databaseName, - (k, v) -> - v == null ? timeslot : Math.min(v, timeslot)); + databaseName, + (k, v) -> v == null ? 
timeslot : Math.min(v, timeslot)); } catch (IOException e) { LOGGER.error( "Failed to find any {} files in the {} directory", @@ -3463,7 +3486,8 @@ private long findEarliestTimeslotInDatabase(File databaseDir, Map return databaseEarliestRegionMap.get(databaseName); } - private List serializeDatabaseScopedTableList(List list) { + private List serializeDatabaseScopedTableList( + List list) { if (list == null || list.isEmpty()) { return Collections.emptyList(); } @@ -3472,7 +3496,7 @@ private List serializeDatabaseScopedTableList(List serializeDatabaseScopedTableList(List autoCleanPartitionTable( } /** - * Merge a complete DataPartitionTable from the partition tables received from multiple DataNodes (supports cross-database merging, which is exactly the logic implemented in the current PR) + * Merge a complete DataPartitionTable from the partition tables received from multiple DataNodes + * (supports cross-database merging, which is exactly the logic implemented in the current PR) * * @param sourceMap Map * @return The complete merged partition table @@ -290,30 +292,34 @@ public Set autoCleanPartitionTable( public DataPartitionTable merge(Map sourceMap) { DataPartitionTable merged = new DataPartitionTable(this.dataPartitionMap); for (DataPartitionTable table : sourceMap.values()) { - for (Map.Entry entry : table.dataPartitionMap.entrySet()) { + for (Map.Entry entry : + table.dataPartitionMap.entrySet()) { TSeriesPartitionSlot slot = entry.getKey(); SeriesPartitionTable seriesTable = entry.getValue(); - merged.dataPartitionMap - .computeIfAbsent(slot, k -> new SeriesPartitionTable()) - .merge(seriesTable); + merged + .dataPartitionMap + .computeIfAbsent(slot, k -> new SeriesPartitionTable()) + .merge(seriesTable); } } return merged; } /** - * Support single table merging - * Merge another DataPartitionTable into the current object (used for incremental merging) + * Support single table merging Merge another DataPartitionTable into the current object (used for + * 
incremental merging) */ public DataPartitionTable merge(DataPartitionTable sourcePartitionTable) { DataPartitionTable merged = new DataPartitionTable(this.dataPartitionMap); if (sourcePartitionTable == null) { return merged; } - for (Map.Entry entry : sourcePartitionTable.dataPartitionMap.entrySet()) { - merged.dataPartitionMap - .computeIfAbsent(entry.getKey(), k -> new SeriesPartitionTable()) - .merge(entry.getValue()); + for (Map.Entry entry : + sourcePartitionTable.dataPartitionMap.entrySet()) { + merged + .dataPartitionMap + .computeIfAbsent(entry.getKey(), k -> new SeriesPartitionTable()) + .merge(entry.getValue()); } return merged; } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java index e81057ed96780..154ae4b55a86d 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java @@ -28,7 +28,7 @@ public DataPartitionTable getDataPartitionTable() { } public void serialize(OutputStream outputStream, TProtocol protocol) - throws IOException, TException { + throws IOException, TException { ReadWriteIOUtils.write(database, outputStream); ReadWriteIOUtils.write(dataPartitionTable != null, outputStream); @@ -53,7 +53,7 @@ public static DatabaseScopedDataPartitionTable deserialize(ByteBuffer buffer) { } public static DatabaseScopedDataPartitionTable deserialize( - InputStream inputStream, TProtocol protocol) throws IOException, TException { + InputStream inputStream, TProtocol protocol) throws IOException, TException { String database = ReadWriteIOUtils.readString(inputStream); boolean hasDataPartitionTable = ReadWriteIOUtils.readBool(inputStream); @@ -72,8 +72,8 @@ public boolean equals(Object o) { if (this 
== o) return true; if (o == null || getClass() != o.getClass()) return false; DatabaseScopedDataPartitionTable that = (DatabaseScopedDataPartitionTable) o; - return Objects.equals(database, that.database) && - Objects.equals(dataPartitionTable, that.dataPartitionTable); + return Objects.equals(database, that.database) + && Objects.equals(dataPartitionTable, that.dataPartitionTable); } @Override diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java index 1c1350b0cbbe4..915e3df4e329a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/SeriesPartitionTable.java @@ -74,7 +74,7 @@ public Map> getSeriesPartitionMap() public void putDataPartition(TTimePartitionSlot timePartitionSlot, TConsensusGroupId groupId) { List groupList = - seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()); + seriesPartitionMap.computeIfAbsent(timePartitionSlot, empty -> new Vector<>()); synchronized (groupList) { if (!groupList.contains(groupId)) { groupList.add(groupId); @@ -278,11 +278,10 @@ public List autoCleanPartitionTable( public void merge(SeriesPartitionTable sourceMap) { if (sourceMap == null) return; - sourceMap.seriesPartitionMap.forEach((timeSlot, groups) -> { - this.seriesPartitionMap - .computeIfAbsent(timeSlot, k -> new ArrayList<>()) - .addAll(groups); - }); + sourceMap.seriesPartitionMap.forEach( + (timeSlot, groups) -> { + this.seriesPartitionMap.computeIfAbsent(timeSlot, k -> new ArrayList<>()).addAll(groups); + }); } public void serialize(OutputStream outputStream, TProtocol protocol) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java 
b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java index 73d68787eee6c..f06d019826653 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -130,15 +130,16 @@ public static long getTimePartitionIdWithoutOverflow(long time) { return partitionId.longValue(); } - /** Since bigTimePartitionInterval.multiply(partitionId) is always an exact multiple of - * bigTimePartitionInterval, the previous conditional logic was redundant and the else - * branch was unreachable. We directly compute the time without risk of overflow here. - */ + /** + * Since bigTimePartitionInterval.multiply(partitionId) is always an exact multiple of + * bigTimePartitionInterval, the previous conditional logic was redundant and the else branch was + * unreachable. We directly compute the time without risk of overflow here. + */ public static long getTimeWithoutOverflow(long partitionId) { return bigTimePartitionInterval - .multiply(BigInteger.valueOf(partitionId)) - .add(bigTimePartitionOrigin) - .longValue(); + .multiply(BigInteger.valueOf(partitionId)) + .add(bigTimePartitionOrigin) + .longValue(); } public static long getTimeByPartitionId(long partitionId) { diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java index 54551ec0a0298..287030753b28a 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java @@ -109,9 +109,12 @@ public long getTotalBytes() { } /** - * Calculate the expected time using double (double can easily hold nanoseconds on the order 
of 10^18), then perform clamping and convert to long. - * Advantages: Extremely simple, zero exceptions thrown, and double precision is sufficient (nanosecond-level errors are negligible). - * Disadvantages: In extreme cases (when totalBytes is close to 2^63), double loses precision in the trailing digits. However, in IoTDB's actual scenarios, bytesPerSecond is typically between 10MB/s and 1GB/s, so this situation will not occur. + * Calculate the expected time using double (double can easily hold nanoseconds on the order of + * 10^18), then perform clamping and convert to long. Advantages: Extremely simple, zero + * exceptions thrown, and double precision is sufficient (nanosecond-level errors are negligible). + * Disadvantages: In extreme cases (when totalBytes is close to 2^63), double loses precision in + * the trailing digits. However, in IoTDB's actual scenarios, bytesPerSecond is typically between + * 10MB/s and 1GB/s, so this situation will not occur. */ private long expectedTimeNs(long totalBytes) { if (totalBytes <= 0) { From b90105b8d4c2d14c7c0802d77e6aeed470df4d5e Mon Sep 17 00:00:00 2001 From: libo Date: Tue, 17 Mar 2026 17:04:09 +0800 Subject: [PATCH 29/39] license --- .../DatabaseScopedDataPartitionTable.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java index 154ae4b55a86d..a47f4024eac88 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/partition/DatabaseScopedDataPartitionTable.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + package org.apache.iotdb.commons.partition; import org.apache.thrift.TException; From 1427c3d6f893ae471c3e909f506a875b22b32241 Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 18 Mar 2026 10:45:25 +0800 Subject: [PATCH 30/39] Use our standard binary write function instead of stream.writeUTF(). The overflow problem still exists and there is no way to resolve it: if the user sets timePartitionOrigin to Long.MIN_VALUE, then adding it to partitionId = -1 will indeed cause an overflow. However, the partition table in the system only accepts timestamps of the long type and does not support bigint timestamps. Therefore, if an overflow actually occurs, we have to accept the outcome where the program is interrupted by an exception being thrown.
--- ...PartitionTableIntegrityCheckProcedure.java | 8 +++---- .../DataPartitionTableGenerator.java | 4 ++-- .../commons/utils/TimePartitionUtils.java | 24 ++----------------- 3 files changed, 8 insertions(+), 28 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index de4d01943e3dc..ef7da5bfb3c85 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -320,7 +320,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { .orElse(null); if (localEarliestSlot.getStartTime() - > TimePartitionUtils.getTimeByPartitionId(earliestTimeslot)) { + > TimePartitionUtils.getStartTimeByPartitionId(earliestTimeslot)) { lostDataPartitionsOfDatabases.add(database); LOG.warn( "[DataPartitionIntegrity] Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", @@ -657,7 +657,7 @@ public void serialize(final DataOutputStream stream) throws IOException { // Serialize earliestTimeslots stream.writeInt(earliestTimeslots.size()); for (Map.Entry entry : earliestTimeslots.entrySet()) { - stream.writeUTF(entry.getKey()); + ReadWriteIOUtils.write(entry.getKey(), stream); stream.writeLong(entry.getValue()); } @@ -697,14 +697,14 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(lostDataPartitionsOfDatabases.size()); for (String database : lostDataPartitionsOfDatabases) { - stream.writeUTF(database); + ReadWriteIOUtils.write(database, stream); } if (finalDataPartitionTables != null && 
!finalDataPartitionTables.isEmpty()) { stream.writeInt(finalDataPartitionTables.size()); for (Map.Entry entry : finalDataPartitionTables.entrySet()) { - stream.writeUTF(entry.getKey()); + ReadWriteIOUtils.write(entry.getKey(), stream); try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos)) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index c85a377e395a3..bdb5eff49bef1 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -217,7 +217,7 @@ private void constructDataPartitionMap( TSeriesPartitionSlot seriesSlotId = seriesPartitionExecutor.getSeriesPartitionSlot(deviceId); TTimePartitionSlot timePartitionSlot = - new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + new TTimePartitionSlot(TimePartitionUtils.getStartTimeByPartitionId(timeSlotId)); dataPartitionMap .computeIfAbsent( seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) @@ -235,7 +235,7 @@ private static SeriesPartitionTable newSeriesPartitionTable( TConsensusGroupId consensusGroupId, long timeSlotId) { SeriesPartitionTable seriesPartitionTable = new SeriesPartitionTable(); TTimePartitionSlot timePartitionSlot = - new TTimePartitionSlot(TimePartitionUtils.getTimeByPartitionId(timeSlotId)); + new TTimePartitionSlot(TimePartitionUtils.getStartTimeByPartitionId(timeSlotId)); seriesPartitionTable.putDataPartition(timePartitionSlot, consensusGroupId); return seriesPartitionTable; } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java 
index f06d019826653..250a347d1496b 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/TimePartitionUtils.java @@ -112,14 +112,6 @@ public static long getTimePartitionId(long time) { : time / timePartitionInterval - 1; } - public static long getTime(long partitionId) { - long time = partitionId * timePartitionInterval; - if (time > 0 || time % timePartitionInterval == 0) { - return time + timePartitionOrigin; - } - return ((partitionId + 1) * timePartitionInterval) + timePartitionOrigin; - } - public static long getTimePartitionIdWithoutOverflow(long time) { BigInteger bigTime = BigInteger.valueOf(time).subtract(bigTimePartitionOrigin); BigInteger partitionId = @@ -130,20 +122,8 @@ public static long getTimePartitionIdWithoutOverflow(long time) { return partitionId.longValue(); } - /** - * Since bigTimePartitionInterval.multiply(partitionId) is always an exact multiple of - * bigTimePartitionInterval, the previous conditional logic was redundant and the else branch was - * unreachable. We directly compute the time without risk of overflow here. - */ - public static long getTimeWithoutOverflow(long partitionId) { - return bigTimePartitionInterval - .multiply(BigInteger.valueOf(partitionId)) - .add(bigTimePartitionOrigin) - .longValue(); - } - - public static long getTimeByPartitionId(long partitionId) { - return originMayCauseOverflow ? 
getTimeWithoutOverflow(partitionId) : getTime(partitionId); + public static long getStartTimeByPartitionId(long partitionId) { + return (partitionId * timePartitionInterval) + timePartitionOrigin; } public static boolean satisfyPartitionId(long startTime, long endTime, long partitionId) { From 208135973e6dacce344c25c0585912ec095c8787 Mon Sep 17 00:00:00 2001 From: libo Date: Wed, 18 Mar 2026 23:31:27 +0800 Subject: [PATCH 31/39] Address review comments --- ...PartitionTableIntegrityCheckProcedure.java | 104 +++++++++++------- ...tionTableIntegrityCheckProcedureState.java | 2 +- .../iotdb/confignode/service/ConfigNode.java | 2 +- 3 files changed, 67 insertions(+), 41 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index ef7da5bfb3c85..0a41924d8d782 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -81,8 +81,14 @@ public class DataPartitionTableIntegrityCheckProcedure private static final Logger LOG = LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedure.class); + // how many times to retry after an RPC request fails private static final int MAX_RETRY_COUNT = 3; - private static final long HEART_BEAT_REQUEST_RATE = 10000; + + // interval between heartbeat requests, in ms + private static final long HEART_BEAT_REQUEST_INTERVAL = 10000; + + // interval between checks that all DataNodes are alive, in ms + private static final long CHECK_ALL_DATANODE_IS_ALIVE_INTERVAL = 10000; NodeManager dataNodeManager; private List allDataNodes = new ArrayList<>(); @@ -91,13 +97,20 @@
public class DataPartitionTableIntegrityCheckProcedure /** Collected earliest timeslots from DataNodes: database -> earliest timeslot */ private Map earliestTimeslots = new ConcurrentHashMap<>(); - /** DataPartitionTables collected from DataNodes: dataNodeId -> DataPartitionTable */ + /** DataPartitionTables collected from DataNodes: dataNodeId -> */ private Map> dataPartitionTables = new ConcurrentHashMap<>(); - private Set lostDataPartitionsOfDatabases = new HashSet<>(); + /** + * Collect all database names that those database lost data partition, the string in the Set + * collection is database name + */ + private Set databasesWithLostDataPartition = new HashSet<>(); - /** Final merged DataPartitionTable */ + /** + * Final merged DataPartitionTable for every database Map key(String): + * database name + */ private Map finalDataPartitionTables; private static Set skipDataNodes = @@ -125,7 +138,7 @@ protected Flow executeFromState( failedDataNodes = new HashSet<>(); return collectEarliestTimeslots(); case ANALYZE_MISSING_PARTITIONS: - lostDataPartitionsOfDatabases = new HashSet<>(); + databasesWithLostDataPartition = new HashSet<>(); return analyzeMissingPartitions(env); case REQUEST_PARTITION_TABLES: return requestPartitionTables(); @@ -134,8 +147,8 @@ protected Flow executeFromState( case MERGE_PARTITION_TABLES: finalDataPartitionTables = new HashMap<>(); return mergePartitionTables(env); - case WRITE_PARTITION_TABLE_TO_RAFT: - return writePartitionTableToRaft(env); + case WRITE_PARTITION_TABLE_TO_CONSENSUS: + return writePartitionTableToConsensus(env); default: throw new ProcedureException("Unknown state: " + state); } @@ -156,7 +169,7 @@ protected void rollbackState( earliestTimeslots.clear(); break; case ANALYZE_MISSING_PARTITIONS: - lostDataPartitionsOfDatabases.clear(); + databasesWithLostDataPartition.clear(); break; case REQUEST_PARTITION_TABLES: case REQUEST_PARTITION_TABLES_HEART_BEAT: @@ -203,6 +216,9 @@ private Flow collectEarliestTimeslots() { if 
(allDataNodes.isEmpty()) { LOG.error( "[DataPartitionIntegrity] No DataNodes registered, no way to collect earliest timeslots, waiting for them to go up"); + sleep( + CHECK_ALL_DATANODE_IS_ALIVE_INTERVAL, + "[DataPartitionIntegrity] Error waiting for DataNode startup due to thread interruption."); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -296,7 +312,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { || localDataPartitionTable.isEmpty() || localDataPartitionTable.get(database) == null || localDataPartitionTable.get(database).isEmpty()) { - lostDataPartitionsOfDatabases.add(database); + databasesWithLostDataPartition.add(database); LOG.warn( "[DataPartitionIntegrity] No data partition table related to database {} was found from the ConfigNode, and this issue needs to be repaired", database); @@ -321,7 +337,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { if (localEarliestSlot.getStartTime() > TimePartitionUtils.getStartTimeByPartitionId(earliestTimeslot)) { - lostDataPartitionsOfDatabases.add(database); + databasesWithLostDataPartition.add(database); LOG.warn( "[DataPartitionIntegrity] Database {} has lost timeslot {} in its data table partition, and this issue needs to be repaired", database, @@ -330,7 +346,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { } } - if (lostDataPartitionsOfDatabases.isEmpty()) { + if (databasesWithLostDataPartition.isEmpty()) { LOG.info( "[DataPartitionIntegrity] No databases have lost data partitions, terminating procedure"); return Flow.NO_MORE_STATE; @@ -338,7 +354,7 @@ private Flow analyzeMissingPartitions(final ConfigNodeProcedureEnv env) { LOG.info( "[DataPartitionIntegrity] Identified {} databases have lost data partitions, will request DataPartitionTable generation from {} DataNodes", - lostDataPartitionsOfDatabases.size(), + databasesWithLostDataPartition.size(), 
allDataNodes.size() - failedDataNodes.size()); setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); return Flow.HAS_MORE_STATE; @@ -381,6 +397,9 @@ private Flow requestPartitionTables() { if (allDataNodes.isEmpty()) { LOG.error( "[DataPartitionIntegrity] No DataNodes registered, no way to requested DataPartitionTable generation, terminating procedure"); + sleep( + CHECK_ALL_DATANODE_IS_ALIVE_INTERVAL, + "[DataPartitionIntegrity] Error waiting for DataNode startup due to thread interruption."); setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -392,7 +411,7 @@ private Flow requestPartitionTables() { if (!dataPartitionTables.containsKey(dataNodeId)) { try { TGenerateDataPartitionTableReq req = new TGenerateDataPartitionTableReq(); - req.setDatabases(lostDataPartitionsOfDatabases); + req.setDatabases(databasesWithLostDataPartition); TGenerateDataPartitionTableResp resp = (TGenerateDataPartitionTableResp) SyncDataNodeClientPool.getInstance() @@ -502,16 +521,21 @@ private Flow requestPartitionTablesHeartBeat() { return Flow.HAS_MORE_STATE; } + sleep( + HEART_BEAT_REQUEST_INTERVAL, + "[DataPartitionIntegrity] Error checking DataPartitionTable status due to thread interruption."); + setNextState( + DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); + return Flow.HAS_MORE_STATE; + } + + private static void sleep(long intervalTime, String logMessage) { try { - Thread.sleep(HEART_BEAT_REQUEST_RATE); + Thread.sleep(intervalTime); } catch (InterruptedException e) { Thread.currentThread().interrupt(); - LOG.error( - "[DataPartitionIntegrity] Error checking DataPartitionTable status due to thread interruption."); + LOG.error(logMessage); } - setNextState( - DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES_HEART_BEAT); - return Flow.HAS_MORE_STATE; } /** Merge DataPartitionTables from all DataNodes into a final table. 
*/ @@ -529,7 +553,7 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { Map finalDataPartitionMap = new HashMap<>(); - for (String database : lostDataPartitionsOfDatabases) { + for (String database : databasesWithLostDataPartition) { // Get current DataPartitionTable from ConfigManager Map>>> localDataPartitionTableMap = getLocalDataPartitionTable(env, database); @@ -576,54 +600,56 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { } LOG.info("[DataPartitionIntegrity] DataPartitionTables merge completed successfully"); - setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_RAFT); + setNextState(DataPartitionTableIntegrityCheckProcedureState.WRITE_PARTITION_TABLE_TO_CONSENSUS); return Flow.HAS_MORE_STATE; } - /** Write the final DataPartitionTable to raft log. */ - private Flow writePartitionTableToRaft(final ConfigNodeProcedureEnv env) { + /** Write the final DataPartitionTable to consensus log. */ + private Flow writePartitionTableToConsensus(final ConfigNodeProcedureEnv env) { if (LOG.isDebugEnabled()) { - LOG.debug("Writing DataPartitionTable to raft log..."); + LOG.debug("Writing DataPartitionTable to consensus log..."); } - if (lostDataPartitionsOfDatabases.isEmpty()) { + if (databasesWithLostDataPartition.isEmpty()) { LOG.error("[DataPartitionIntegrity] No database lost data partition table"); setFailure( "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("No database lost data partition table for raft write")); + new ProcedureException("No database lost data partition table for consensus write")); return getFlow(); } if (finalDataPartitionTables.isEmpty()) { - LOG.error("[DataPartitionIntegrity] DataPartitionTable to write to raft"); + LOG.error("[DataPartitionIntegrity] DataPartitionTable to write to consensus"); setFailure( "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("No DataPartitionTable available for raft write")); + new 
ProcedureException("No DataPartitionTable available for consensus write")); return getFlow(); } int failedCnt = 0; - while (failedCnt < MAX_RETRY_COUNT) { + final int MAX_RETRY_COUNT_FOR_CONSENSUS = 3; + while (failedCnt < MAX_RETRY_COUNT_FOR_CONSENSUS) { try { CreateDataPartitionPlan createPlan = new CreateDataPartitionPlan(); Map assignedDataPartition = new HashMap<>(); - for (String database : lostDataPartitionsOfDatabases) { + for (String database : databasesWithLostDataPartition) { assignedDataPartition.put(database, finalDataPartitionTables.get(database)); } createPlan.setAssignedDataPartition(assignedDataPartition); TSStatus tsStatus = env.getConfigManager().getConsensusManager().write(createPlan); if (tsStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOG.info("[DataPartitionIntegrity] DataPartitionTable successfully written to raft log"); + LOG.info( + "[DataPartitionIntegrity] DataPartitionTable successfully written to consensus log"); break; } else { - LOG.error("[DataPartitionIntegrity] Failed to write DataPartitionTable to raft log"); + LOG.error("[DataPartitionIntegrity] Failed to write DataPartitionTable to consensus log"); setFailure( "DataPartitionTableIntegrityCheckProcedure", - new ProcedureException("Failed to write DataPartitionTable to raft log")); + new ProcedureException("Failed to write DataPartitionTable to consensus log")); } } catch (Exception e) { - LOG.error("[DataPartitionIntegrity] Error writing DataPartitionTable to raft log", e); + LOG.error("[DataPartitionIntegrity] Error writing DataPartitionTable to consensus log", e); setFailure("DataPartitionTableIntegrityCheckProcedure", e); } failedCnt++; @@ -695,8 +721,8 @@ public void serialize(final DataOutputStream stream) throws IOException { } } - stream.writeInt(lostDataPartitionsOfDatabases.size()); - for (String database : lostDataPartitionsOfDatabases) { + stream.writeInt(databasesWithLostDataPartition.size()); + for (String database : 
databasesWithLostDataPartition) { ReadWriteIOUtils.write(database, stream); } @@ -812,10 +838,10 @@ public void deserialize(final ByteBuffer byteBuffer) { dataPartitionTables.put(dataNodeId, tableList); } - int lostDataPartitionsOfDatabasesSize = byteBuffer.getInt(); - for (int i = 0; i < lostDataPartitionsOfDatabasesSize; i++) { + int databasesWithLostDataPartitionSize = byteBuffer.getInt(); + for (int i = 0; i < databasesWithLostDataPartitionSize; i++) { String database = ReadWriteIOUtils.readString(byteBuffer); - lostDataPartitionsOfDatabases.add(database); + databasesWithLostDataPartition.add(database); } // Deserialize finalDataPartitionTable size diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java index 899ed502b2a88..bf302db755bac 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/state/DataPartitionTableIntegrityCheckProcedureState.java @@ -31,5 +31,5 @@ public enum DataPartitionTableIntegrityCheckProcedureState { /** Merge DataPartitionTables from all DataNodes */ MERGE_PARTITION_TABLES, /** Write final DataPartitionTable to raft log */ - WRITE_PARTITION_TABLE_TO_RAFT + WRITE_PARTITION_TABLE_TO_CONSENSUS } diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 33e9df4c662fc..d04e2814be49c 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -227,7 +227,7 @@ public void active() { 
dataPartitionTableCheckExecutor.submit( () -> { LOGGER.info( - "[DataPartitionIntegrity] Prepare to start dataPartitionTableIntegrityCheck after all datanodes are started up"); + "[DataPartitionIntegrity] Prepare to start dataPartitionTableIntegrityCheck after all datanodes started up"); Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeoutInMs()); while (true) { From e9231760a9a0460da8d035ca86051b4c8ff13cc4 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 19 Mar 2026 10:38:07 +0800 Subject: [PATCH 32/39] Fix another lots of opinions --- .../confignode/conf/ConfigNodeDescriptor.java | 2 +- ...PartitionTableIntegrityCheckProcedure.java | 15 ++--- .../iotdb/confignode/service/ConfigNode.java | 55 ++++++++++--------- .../apache/iotdb/db/conf/IoTDBDescriptor.java | 2 +- .../DataPartitionTableGenerator.java | 6 +- .../impl/DataNodeInternalRPCServiceImpl.java | 9 ++- .../conf/iotdb-system.properties.template | 4 +- 7 files changed, 51 insertions(+), 42 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java index 6843aced0e511..9f8206f5dd78b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/conf/ConfigNodeDescriptor.java @@ -325,7 +325,7 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio conf.setPartitionTableRecoverWaitAllDnUpTimeoutInMs( Long.parseLong( properties.getProperty( - "partition_table_recover_wait_all_dn_up_timeout", + "partition_table_recover_wait_all_dn_up_timeout_ms", String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeoutInMs())))); String leaderDistributionPolicy = diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 0a41924d8d782..690306779d23e 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -551,9 +551,9 @@ private Flow mergePartitionTables(final ConfigNodeProcedureEnv env) { return Flow.HAS_MORE_STATE; } - Map finalDataPartitionMap = new HashMap<>(); - for (String database : databasesWithLostDataPartition) { + Map finalDataPartitionMap = new HashMap<>(); + // Get current DataPartitionTable from ConfigManager Map>>> localDataPartitionTableMap = getLocalDataPartitionTable(env, database); @@ -627,8 +627,8 @@ private Flow writePartitionTableToConsensus(final ConfigNodeProcedureEnv env) { } int failedCnt = 0; - final int MAX_RETRY_COUNT_FOR_CONSENSUS = 3; - while (failedCnt < MAX_RETRY_COUNT_FOR_CONSENSUS) { + final int maxRetryCountForConsensus = 3; + while (failedCnt < maxRetryCountForConsensus) { try { CreateDataPartitionPlan createPlan = new CreateDataPartitionPlan(); Map assignedDataPartition = new HashMap<>(); @@ -931,13 +931,8 @@ private List deserializeDatabaseScopedTableLis } try { - ByteBuffer dataBuffer = data.duplicate(); - - DatabaseScopedDataPartitionTable table = - DatabaseScopedDataPartitionTable.deserialize(dataBuffer); - + DatabaseScopedDataPartitionTable table = DatabaseScopedDataPartitionTable.deserialize(data); result.add(table); - } catch (Exception e) { LOG.error( "[DataPartitionIntegrity] Failed to deserialize DatabaseScopedDataPartitionTable", e); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index d04e2814be49c..9361c87894517 
100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -223,33 +223,38 @@ public void active() { loadSecretKey(); loadHardwareCode(); - dataPartitionTableCheckFuture = - dataPartitionTableCheckExecutor.submit( - () -> { - LOGGER.info( - "[DataPartitionIntegrity] Prepare to start dataPartitionTableIntegrityCheck after all datanodes started up"); - Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeoutInMs()); - - while (true) { - List dnList = - configManager - .getLoadManager() - .filterDataNodeThroughStatus(NodeStatus.Running); - if (dnList != null && !dnList.isEmpty()) { - LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); - TSStatus status = - configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); - if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { - LOGGER.error("Data partition table integrity check failed!"); + if (configManager.getConsensusManager().isLeader()) { + dataPartitionTableCheckFuture = + dataPartitionTableCheckExecutor.submit( + () -> { + LOGGER.info( + "[DataPartitionIntegrity] Prepare to start dataPartitionTableIntegrityCheck after all datanodes started up"); + Thread.sleep(CONF.getPartitionTableRecoverWaitAllDnUpTimeoutInMs()); + + while (true) { + List dnList = + configManager + .getLoadManager() + .filterDataNodeThroughStatus(NodeStatus.Running); + if (dnList != null && !dnList.isEmpty()) { + LOGGER.info("Starting dataPartitionTableIntegrityCheck..."); + TSStatus status = + configManager.getProcedureManager().dataPartitionTableIntegrityCheck(); + if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) { + LOGGER.error( + "Data partition table integrity check failed! 
Current status code is {}, status message is {}", + status.getCode(), + status.getMessage()); + } + break; + } else { + LOGGER.info("No running datanodes found, waiting..."); + Thread.sleep(5000); } - break; - } else { - LOGGER.info("No running datanodes found, waiting..."); - Thread.sleep(5000); } - } - return null; - }); + return null; + }); + } return; } else { saveSecretKey(); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java index 5b49ce83ee2ff..6f974a095dee8 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/conf/IoTDBDescriptor.java @@ -1147,7 +1147,7 @@ public void loadProperties(TrimProperties properties) throws BadNodeUrlException conf.setPartitionTableRecoverMaxReadMBsPerSecond( Integer.parseInt( properties.getProperty( - "partition_table_recover_max_read_bytes_per_second", + "partition_table_recover_max_read_megabytes_per_second", String.valueOf(conf.getPartitionTableRecoverMaxReadMBsPerSecond())))); conf.setIncludeNullValueInWriteThroughputMetric( diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index bdb5eff49bef1..b15dc715aebac 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -125,7 +125,6 @@ public CompletableFuture startGeneration() { } private void generateDataPartitionTableByMemory() { - Map dataPartitionMap = new ConcurrentHashMap<>(); List> futures = new ArrayList<>(); SeriesPartitionExecutor seriesPartitionExecutor = @@ -145,6 +144,9 @@ private void generateDataPartitionTableByMemory() { return; } + 
Map dataPartitionMap = + new ConcurrentHashMap<>(); + tsFileManager.readLock(); List seqTsFileList = tsFileManager.getTsFileList(true); List unseqTsFileList = tsFileManager.getTsFileList(false); @@ -224,8 +226,10 @@ private void constructDataPartitionMap( .putDataPartition(timePartitionSlot, consensusGroupId); } processedFiles.incrementAndGet(); + totalFiles.incrementAndGet(); } catch (Exception e) { failedFiles.incrementAndGet(); + totalFiles.incrementAndGet(); LOG.error("Failed to process tsfile {}, {}", tsFileResource.getTsFileID(), e.getMessage()); } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index e80a85fccd0b7..f2a909941a598 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3187,7 +3187,10 @@ public TGenerateDataPartitionTableResp generateDataPartitionTable( if (currentGenerator != null && currentGenerator.getStatus() == DataPartitionTableGenerator.TaskStatus.IN_PROGRESS) { resp.setErrorCode(DataPartitionTableGeneratorState.IN_PROGRESS.getCode()); - resp.setMessage("DataPartitionTable generation is already in the progress"); + resp.setMessage( + String.format( + "DataPartitionTable generation is already in the progress: %.1f%%", + currentGenerator.getProgress() * 100)); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); return resp; } @@ -3305,7 +3308,9 @@ private void handleResponse(TGenerateDataPartitionTableHeartbeatResp resp) { } private void updateResponse(T resp) { - if (currentGenerator == null) return; + if (currentGenerator == null) { + return; + } switch (currentGenerator.getStatus()) { case IN_PROGRESS: diff --git 
a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index c36f35cd5778c..5f8f45e844fcd 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -750,12 +750,12 @@ partition_table_recover_worker_num=10 # Limit the number of bytes read per second from a file, the unit is MB # effectiveMode: restart # Datatype: Integer -partition_table_recover_max_read_bytes_per_second=10 +partition_table_recover_max_read_megabytes_per_second=10 # Set a timeout to wait for all datanodes complete startup, the unit is ms # effectiveMode: restart # Datatype: Integer -partition_table_recover_wait_all_dn_up_timeout=60000 +partition_table_recover_wait_all_dn_up_timeout_ms=60000 #################### ### Memory Control Configuration From 0728ca3fe176ad269d419f87ddeafb6d342c8161 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 19 Mar 2026 16:04:03 +0800 Subject: [PATCH 33/39] Optimize codes --- ...PartitionTableIntegrityCheckProcedure.java | 11 +- .../iotdb/confignode/service/ConfigNode.java | 8 +- .../impl/DataNodeInternalRPCServiceImpl.java | 234 ++++++------------ 3 files changed, 90 insertions(+), 163 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 690306779d23e..5730b3821e5d2 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -438,8 +438,7 @@ 
private Flow requestPartitionTables() { } } - if (failedDataNodes.size() == allDataNodes.size() - && new HashSet<>(allDataNodes).containsAll(failedDataNodes)) { + if (failedDataNodes.size() == allDataNodes.size()) { setNextState(DataPartitionTableIntegrityCheckProcedureState.COLLECT_EARLIEST_TIMESLOTS); return Flow.HAS_MORE_STATE; } @@ -497,6 +496,7 @@ private Flow requestPartitionTablesHeartBeat() { dataNodeId); break; default: + failedDataNodes.add(dataNode); LOG.error( "[DataPartitionIntegrity] DataNode {} returned unknown error code: {}", dataNodeId, @@ -521,6 +521,13 @@ private Flow requestPartitionTablesHeartBeat() { return Flow.HAS_MORE_STATE; } + // Don't find any one data partition table generation task on all registered DataNodes, go back + // to the REQUEST_PARTITION_TABLES step and re-execute + if (failedDataNodes.size() == allDataNodes.size()) { + setNextState(DataPartitionTableIntegrityCheckProcedureState.REQUEST_PARTITION_TABLES); + return Flow.HAS_MORE_STATE; + } + sleep( HEART_BEAT_REQUEST_INTERVAL, "[DataPartitionIntegrity] Error checking DataPartitionTable status due to thread interruption."); diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java index 9361c87894517..3a6c93b12ec2f 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/service/ConfigNode.java @@ -194,7 +194,7 @@ public void active() { int configNodeId = CONF.getConfigNodeId(); configManager.initConsensusManager(); upgrade(); - waitForLeaderElected(); + TConfigNodeLocation leaderNodeLocation = waitForLeaderElected(); setUpMetricService(); // Notice: We always set up Seed-ConfigNode's RPC service lastly to ensure // that the external service is not provided until ConfigNode is fully available @@ -223,7 +223,8 @@ public void active() { 
loadSecretKey(); loadHardwareCode(); - if (configManager.getConsensusManager().isLeader()) { + /* After the ConfigNode leader election, a leader switch may occur, which could cause the procedure not to be created. This can happen if the original leader has not yet executed the procedure creation, while the other followers have already finished starting up. Therefore, having the original leader (before the leader switch) initiate the process ensures that only one procedure will be created. */ + if (leaderNodeLocation.getConfigNodeId() == configNodeId) { dataPartitionTableCheckFuture = dataPartitionTableCheckExecutor.submit( () -> { @@ -521,7 +522,7 @@ protected ConfigNodeRPCServiceProcessor getConfigNodeRPCServiceProcessor() { return new ConfigNodeRPCServiceProcessor(configManager); } - private void waitForLeaderElected() { + private TConfigNodeLocation waitForLeaderElected() { while (!configManager.getConsensusManager().isLeaderExist()) { LOGGER.info("Leader has not been elected yet, wait for 1 second"); try { @@ -531,6 +532,7 @@ private void waitForLeaderElected() { LOGGER.warn("Unexpected interruption during waiting for leader election."); } } + return configManager.getConsensusManager().getLeaderLocation(); } /** diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index f2a909941a598..1796fa0f86d04 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -197,6 +197,7 @@ import org.apache.iotdb.db.service.externalservice.ExternalServiceManagementService; import org.apache.iotdb.db.service.metrics.FileMetrics; import org.apache.iotdb.db.storageengine.StorageEngine; +import 
org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.compaction.repair.RepairTaskStatus; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionScheduleTaskManager; import org.apache.iotdb.db.storageengine.dataregion.compaction.schedule.CompactionTaskManager; @@ -205,6 +206,8 @@ import org.apache.iotdb.db.storageengine.dataregion.modification.DeletionPredicate; import org.apache.iotdb.db.storageengine.dataregion.modification.IDPredicate; import org.apache.iotdb.db.storageengine.dataregion.modification.TableDeletionEntry; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; +import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; import org.apache.iotdb.db.storageengine.rescon.quotas.DataNodeSpaceQuotaManager; import org.apache.iotdb.db.storageengine.rescon.quotas.DataNodeThrottleQuotaManager; import org.apache.iotdb.db.subscription.agent.SubscriptionAgent; @@ -344,13 +347,9 @@ import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; -import java.io.File; import java.io.IOException; -import java.lang.reflect.Method; import java.net.URL; import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; import java.time.ZoneId; import java.util.ArrayList; import java.util.Arrays; @@ -3148,23 +3147,13 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { TGetEarliestTimeslotsResp resp = new TGetEarliestTimeslotsResp(); try { - Map earliestTimeslots = new HashMap<>(); - - // Get data directories from configuration - String[] dataDirs = IoTDBDescriptor.getInstance().getConfig().getDataDirs(); - - for (String dataDir : dataDirs) { - File dir = new File(dataDir); - if (dir.exists() && dir.isDirectory()) { - processDataDirectoryForEarliestTimeslots(dir, earliestTimeslots); - } - } + Map earliestTimeslots = new ConcurrentHashMap<>(); + processDataDirectoryForEarliestTimeslots(earliestTimeslots); 
resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); resp.setDatabaseToEarliestTimeslot(earliestTimeslots); LOGGER.info("Retrieved earliest timeslots for {} databases", earliestTimeslots.size()); - } catch (Exception e) { LOGGER.error("Failed to get earliest timeslots", e); resp.setStatus( @@ -3243,14 +3232,17 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb // Set default value resp.setDatabaseScopedDataPartitionTables(Collections.emptyList()); try { - currentGeneratorFuture.get(timeoutMs, TimeUnit.MILLISECONDS); - if (currentGenerator == null) { + // To resolve this situation that the DataNode is registered and didn't request + // generateDataPartitionTable interface yet. + if (currentGeneratorFuture == null || currentGenerator == null) { resp.setErrorCode(DataPartitionTableGeneratorState.UNKNOWN.getCode()); resp.setMessage("No DataPartitionTable generation task found"); resp.setStatus(RpcUtils.getStatus(TSStatusCode.INTERNAL_SERVER_ERROR)); return resp; } + currentGeneratorFuture.get(timeoutMs, TimeUnit.MILLISECONDS); + parseGenerationStatus(resp); if (currentGenerator.getStatus().equals(DataPartitionTableGenerator.TaskStatus.COMPLETED)) { boolean success = false; @@ -3291,23 +3283,7 @@ public TGenerateDataPartitionTableHeartbeatResp generateDataPartitionTableHeartb return resp; } - private void parseGenerationStatus(T resp) { - if (resp instanceof TGenerateDataPartitionTableResp) { - handleResponse((TGenerateDataPartitionTableResp) resp); - } else { - handleResponse((TGenerateDataPartitionTableHeartbeatResp) resp); - } - } - - private void handleResponse(TGenerateDataPartitionTableResp resp) { - updateResponse(resp); - } - - private void handleResponse(TGenerateDataPartitionTableHeartbeatResp resp) { - updateResponse(resp); - } - - private void updateResponse(T resp) { + private void parseGenerationStatus(Object resp) { if (currentGenerator == null) { return; } @@ -3353,69 +3329,31 @@ private void updateResponse(T 
resp) { } } - private void setResponseFields(T resp, int errorCode, String message, TSStatus status) { - try { - Method setErrorCode = resp.getClass().getMethod("setErrorCode", int.class); - Method setMessage = resp.getClass().getMethod("setMessage", String.class); - Method setStatus = resp.getClass().getMethod("setStatus", TSStatus.class); - - setErrorCode.invoke(resp, errorCode); - setMessage.invoke(resp, message); - setStatus.invoke(resp, status); - } catch (Exception e) { - LOGGER.error("Failed to set response fields", e); + private void setResponseFields(Object resp, int errorCode, String message, TSStatus status) { + if (resp instanceof TGenerateDataPartitionTableResp) { + ((TGenerateDataPartitionTableResp) resp).setErrorCode(errorCode); + ((TGenerateDataPartitionTableResp) resp).setMessage(message); + ((TGenerateDataPartitionTableResp) resp).setStatus(status); + } else if (resp instanceof TGenerateDataPartitionTableHeartbeatResp) { + ((TGenerateDataPartitionTableHeartbeatResp) resp).setErrorCode(errorCode); + ((TGenerateDataPartitionTableHeartbeatResp) resp).setMessage(message); + ((TGenerateDataPartitionTableHeartbeatResp) resp).setStatus(status); } } /** - * Process data directory to find the earliest timeslots for each database. 
Map - * earliestTimeslots key(String): database name value(Long): the earliest time slot id of the - * database + * Scan the seq and unseq directory on every data region, then compute the earliest time slot id + * of database */ - private void processDataDirectoryForEarliestTimeslots( - File dataDir, Map earliestTimeslots) { - Map databaseEarliestRegionMap = new ConcurrentHashMap<>(); - try (Stream sequenceTypePaths = Files.list(dataDir.toPath())) { - sequenceTypePaths - .filter(Files::isDirectory) - .forEach( - sequenceTypePath -> { - try (Stream dbPaths = Files.list(sequenceTypePath)) { - dbPaths - .filter(Files::isDirectory) - .forEach( - dbPath -> { - String databaseName = dbPath.getFileName().toString(); - if (DataPartitionTableGenerator.IGNORE_DATABASE.contains( - databaseName)) { - return; - } - databaseEarliestRegionMap.computeIfAbsent( - databaseName, key -> Long.MAX_VALUE); - long earliestTimeslot = - findEarliestTimeslotInDatabase( - dbPath.toFile(), databaseEarliestRegionMap); - - if (earliestTimeslot != Long.MAX_VALUE) { - earliestTimeslots.merge(databaseName, earliestTimeslot, Math::min); - } - }); - } catch (IOException e) { - LOGGER.error( - "Failed to process data directory: {}", sequenceTypePath.toFile(), e); - } - }); - } catch (IOException e) { - LOGGER.error("Failed to process data directory: {}", dataDir, e); - } - } - - /** Find the earliest timeslot in a database directory. 
*/ - private long findEarliestTimeslotInDatabase( - File databaseDir, Map databaseEarliestRegionMap) { - String databaseName = databaseDir.getName(); - List> futureList = new ArrayList<>(); - + private void processDataDirectoryForEarliestTimeslots(Map earliestTimeslots) { + final Set ignoreDatabase = + new HashSet() { + { + add("root.__audit"); + add("root.__system"); + } + }; + List> futures = new ArrayList<>(); final ExecutorService findEarliestTimeSlotExecutor = new WrappedThreadPoolExecutor( 0, @@ -3428,67 +3366,51 @@ private long findEarliestTimeslotInDatabase( ThreadName.FIND_EARLIEST_TIME_SLOT_PARALLEL_POOL.getName(), new ThreadPoolExecutor.CallerRunsPolicy()); - try (Stream databasePaths = Files.list(databaseDir.toPath())) { - databasePaths - .filter(Files::isDirectory) - .forEach( - regionPath -> { - Future future = - findEarliestTimeSlotExecutor.submit( - () -> { - try (Stream regionPaths = Files.list(regionPath)) { - regionPaths - .filter(Files::isDirectory) - .forEach( - timeSlotPath -> { - try { - Optional matchedFile = - Files.find( - timeSlotPath, - 1, - (path, attrs) -> - attrs.isRegularFile() - && path.toString() - .endsWith( - DataPartitionTableGenerator - .SCAN_FILE_SUFFIX_NAME)) - .findFirst(); - if (!matchedFile.isPresent()) { - return; - } - String timeSlotName = timeSlotPath.getFileName().toString(); - long timeslot = Long.parseLong(timeSlotName); - databaseEarliestRegionMap.compute( - databaseName, - (k, v) -> v == null ? 
timeslot : Math.min(v, timeslot)); - } catch (IOException e) { - LOGGER.error( - "Failed to find any {} files in the {} directory", - DataPartitionTableGenerator.SCAN_FILE_SUFFIX_NAME, - timeSlotPath, - e); - } - }); - } catch (IOException e) { - LOGGER.error("Failed to scan {}", regionPath, e); - } - }); - futureList.add(future); - }); - } catch (IOException e) { - LOGGER.error("Failed to walk database directory: {}", databaseDir, e); - } - - for (Future future : futureList) { - try { - future.get(); - } catch (InterruptedException | ExecutionException e) { - LOGGER.error("Failed to wait for task completion", e); - Thread.currentThread().interrupt(); - } + for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { + CompletableFuture regionFuture = + CompletableFuture.runAsync( + () -> { + TsFileManager tsFileManager = dataRegion.getTsFileManager(); + String databaseName = dataRegion.getDatabaseName(); + if (ignoreDatabase.contains(databaseName)) { + return; + } + + tsFileManager.readLock(); + List seqTsFileList = tsFileManager.getTsFileList(true); + List unseqTsFileList = tsFileManager.getTsFileList(false); + tsFileManager.readUnlock(); + + long earliestTimeSlotId = Long.MIN_VALUE; + + earliestTimeSlotId = + findEarliestTimeslotInDatabase(seqTsFileList, earliestTimeSlotId); + earliestTimeSlotId = + findEarliestTimeslotInDatabase(unseqTsFileList, earliestTimeSlotId); + + long finalEarliestTimeSlotId = earliestTimeSlotId; + earliestTimeslots.compute( + databaseName, + (k, v) -> + v == null ? 
finalEarliestTimeSlotId : Math.min(finalEarliestTimeSlotId, v)); + }, + findEarliestTimeSlotExecutor); + futures.add(regionFuture); } - findEarliestTimeSlotExecutor.shutdownNow(); - return databaseEarliestRegionMap.get(databaseName); + + // Wait for all tasks to complete + CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join(); + LOGGER.info("Process data directory for earliestTimeslots completed successfully"); + } + + private long findEarliestTimeslotInDatabase( + List seqTsFileList, long earliestTimeSlotId) { + for (TsFileResource tsFileResource : seqTsFileList) { + long timeSlotId = tsFileResource.getTsFileID().timePartitionId; + earliestTimeSlotId = Math.min(earliestTimeSlotId, timeSlotId); + } + + return earliestTimeSlotId; } private List serializeDatabaseScopedTableList( @@ -3502,14 +3424,10 @@ private List serializeDatabaseScopedTableList( for (DatabaseScopedDataPartitionTable table : list) { try (PublicBAOS baos = new PublicBAOS(); DataOutputStream oos = new DataOutputStream(baos)) { - TTransport transport = new TIOStreamTransport(oos); TBinaryProtocol protocol = new TBinaryProtocol(transport); - table.serialize(oos, protocol); - - result.add(ByteBuffer.wrap(baos.toByteArray())); - + result.add(ByteBuffer.wrap(baos.getBuf(), 0, baos.size())); } catch (IOException | TException e) { LOGGER.error( "Failed to serialize DatabaseScopedDataPartitionTable for database: {}", From 35546a68f48baa96d505ff9995b0fd8c80ed7406 Mon Sep 17 00:00:00 2001 From: libo Date: Thu, 19 Mar 2026 16:44:07 +0800 Subject: [PATCH 34/39] Skip current loop when no time slot info is found in the seq and unseq directory --- .../thrift/impl/DataNodeInternalRPCServiceImpl.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java 
index 1796fa0f86d04..8bf53a07958e0 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3383,10 +3383,14 @@ private void processDataDirectoryForEarliestTimeslots(Map earliest long earliestTimeSlotId = Long.MIN_VALUE; + earliestTimeSlotId = findEarliestTimeslotInFiles(seqTsFileList, earliestTimeSlotId); earliestTimeSlotId = - findEarliestTimeslotInDatabase(seqTsFileList, earliestTimeSlotId); - earliestTimeSlotId = - findEarliestTimeslotInDatabase(unseqTsFileList, earliestTimeSlotId); + findEarliestTimeslotInFiles(unseqTsFileList, earliestTimeSlotId); + + if (earliestTimeSlotId == Long.MIN_VALUE) { + LOGGER.info("No time slot info is found in the seq and unseq directory"); + return; + } long finalEarliestTimeSlotId = earliestTimeSlotId; earliestTimeslots.compute( @@ -3403,7 +3407,7 @@ private void processDataDirectoryForEarliestTimeslots(Map earliest LOGGER.info("Process data directory for earliestTimeslots completed successfully"); } - private long findEarliestTimeslotInDatabase( + private long findEarliestTimeslotInFiles( List seqTsFileList, long earliestTimeSlotId) { for (TsFileResource tsFileResource : seqTsFileList) { long timeSlotId = tsFileResource.getTsFileID().timePartitionId; From 61a1b7b6e609cb67355c171bf5c034c0f9af66bc Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 20 Mar 2026 14:37:33 +0800 Subject: [PATCH 35/39] Fix some opinions --- ...PartitionTableIntegrityCheckProcedure.java | 24 ++++++++--------- .../impl/DataNodeInternalRPCServiceImpl.java | 26 ++++--------------- .../conf/iotdb-system.properties.template | 3 +++ 3 files changed, 20 insertions(+), 33 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 5730b3821e5d2..88b8deef496d4 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -49,6 +49,7 @@ import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.transport.TIOStreamTransport; import org.apache.thrift.transport.TTransport; +import org.apache.tsfile.utils.PublicBAOS; import org.apache.tsfile.utils.ReadWriteIOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -739,15 +740,14 @@ public void serialize(final DataOutputStream stream) throws IOException { for (Map.Entry entry : finalDataPartitionTables.entrySet()) { ReadWriteIOUtils.write(entry.getKey(), stream); - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos)) { - - TTransport transport = new TIOStreamTransport(dos); + try (final PublicBAOS publicBAOS = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { + TTransport transport = new TIOStreamTransport(outputStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); - entry.getValue().serialize(dos, protocol); + entry.getValue().serialize(outputStream, protocol); - byte[] data = baos.toByteArray(); + byte[] data = publicBAOS.getBuf(); stream.writeInt(data.length); stream.write(data); } catch (IOException | TException e) { @@ -764,12 +764,12 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(skipDataNodes.size()); for (TDataNodeConfiguration skipDataNode : skipDataNodes) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - TTransport transport = new TIOStreamTransport(baos); + 
try (final PublicBAOS publicBAOS = new PublicBAOS()) { + TTransport transport = new TIOStreamTransport(publicBAOS); TBinaryProtocol protocol = new TBinaryProtocol(transport); skipDataNode.write(protocol); - byte[] data = baos.toByteArray(); + byte[] data = publicBAOS.getBuf(); stream.writeInt(data.length); stream.write(data); } catch (TException e) { @@ -780,12 +780,12 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(failedDataNodes.size()); for (TDataNodeConfiguration failedDataNode : failedDataNodes) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) { - TTransport transport = new TIOStreamTransport(baos); + try (final PublicBAOS publicBAOS = new PublicBAOS()) { + TTransport transport = new TIOStreamTransport(publicBAOS); TBinaryProtocol protocol = new TBinaryProtocol(transport); failedDataNode.write(protocol); - byte[] data = baos.toByteArray(); + byte[] data = publicBAOS.getBuf(); stream.writeInt(data.length); stream.write(data); } catch (TException e) { diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java index 6215b1cdfe551..e2bfe4f6ad9cd 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/thrift/impl/DataNodeInternalRPCServiceImpl.java @@ -3148,7 +3148,7 @@ public TGetEarliestTimeslotsResp getEarliestTimeslots() { try { Map earliestTimeslots = new ConcurrentHashMap<>(); - processDataDirectoryForEarliestTimeslots(earliestTimeslots); + processDataRegionForEarliestTimeslots(earliestTimeslots); resp.setStatus(RpcUtils.getStatus(TSStatusCode.SUCCESS_STATUS)); resp.setDatabaseToEarliestTimeslot(earliestTimeslots); @@ -3345,7 +3345,7 @@ private void setResponseFields(Object resp, int 
errorCode, String message, TSSta * Scan the seq and unseq directory on every data region, then compute the earliest time slot id * of database */ - private void processDataDirectoryForEarliestTimeslots(Map earliestTimeslots) { + private void processDataRegionForEarliestTimeslots(Map earliestTimeslots) { final Set ignoreDatabase = new HashSet() { { @@ -3376,27 +3376,11 @@ private void processDataDirectoryForEarliestTimeslots(Map earliest return; } - tsFileManager.readLock(); - List seqTsFileList = tsFileManager.getTsFileList(true); - List unseqTsFileList = tsFileManager.getTsFileList(false); - tsFileManager.readUnlock(); - - long earliestTimeSlotId = Long.MIN_VALUE; - - earliestTimeSlotId = findEarliestTimeslotInFiles(seqTsFileList, earliestTimeSlotId); - earliestTimeSlotId = - findEarliestTimeslotInFiles(unseqTsFileList, earliestTimeSlotId); - - if (earliestTimeSlotId == Long.MIN_VALUE) { - LOGGER.info("No time slot info is found in the seq and unseq directory"); - return; - } - - long finalEarliestTimeSlotId = earliestTimeSlotId; + Set timePartitionIds = tsFileManager.getTimePartitions(); + final long earliestTimeSlotId = Collections.min(timePartitionIds); earliestTimeslots.compute( databaseName, - (k, v) -> - v == null ? finalEarliestTimeSlotId : Math.min(finalEarliestTimeSlotId, v)); + (k, v) -> v == null ? 
earliestTimeSlotId : Math.min(earliestTimeSlotId, v)); }, findEarliestTimeSlotExecutor); futures.add(regionFuture); diff --git a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template index 2e071c9761c4e..341f72cfb0e54 100644 --- a/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template +++ b/iotdb-core/node-commons/src/assembly/resources/conf/iotdb-system.properties.template @@ -742,16 +742,19 @@ failure_detector_phi_acceptable_pause_in_ms=10000 # Datatype: double(percentage) disk_space_warning_threshold=0.05 +# Purpose: for data partition repair # The number of threads used for parallel scanning in the partition table recovery # effectiveMode: restart # Datatype: Integer partition_table_recover_worker_num=10 +# Purpose: for data partition repair # Limit the number of bytes read per second from a file, the unit is MB # effectiveMode: restart # Datatype: Integer partition_table_recover_max_read_megabytes_per_second=10 +# Purpose: for data partition repair # Set a timeout to wait for all datanodes complete startup, the unit is ms # effectiveMode: restart # Datatype: Integer From 230068e8ac72fd6e04b6d40b8557f44264d7181f Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 20 Mar 2026 16:56:53 +0800 Subject: [PATCH 36/39] Use the Google guava RateLimiter instead of LeakyBucketRateLimiter; Adjust new method to compute progress of data partition table generation --- .../DataPartitionTableGenerator.java | 61 ++++---- .../dataregion/tsfile/TsFileResource.java | 4 +- .../timeindex/ArrayDeviceTimeIndex.java | 4 +- .../tsfile/timeindex/FileTimeIndex.java | 15 +- .../tsfile/timeindex/ITimeIndex.java | 5 +- .../apache/iotdb/commons/utils/IOUtils.java | 35 +++++ .../rateLimiter/LeakyBucketRateLimiter.java | 135 ------------------ 7 files changed, 86 insertions(+), 173 deletions(-) delete mode 100644 
iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index b15dc715aebac..8fb38a6aa174b 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -27,18 +27,19 @@ import org.apache.iotdb.commons.partition.SeriesPartitionTable; import org.apache.iotdb.commons.partition.executor.SeriesPartitionExecutor; import org.apache.iotdb.commons.utils.TimePartitionUtils; -import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.storageengine.StorageEngine; import org.apache.iotdb.db.storageengine.dataregion.DataRegion; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileManager; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import com.google.common.util.concurrent.RateLimiter; import org.apache.tsfile.file.metadata.IDeviceID; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -47,7 +48,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; /** * Generator for DataPartitionTable by scanning tsfile resources. 
This class scans the data @@ -63,9 +63,9 @@ public class DataPartitionTableGenerator { private Map databasePartitionTableMap = new ConcurrentHashMap<>(); // Progress tracking - private final AtomicInteger processedFiles = new AtomicInteger(0); - private final AtomicInteger failedFiles = new AtomicInteger(0); - private final AtomicLong totalFiles = new AtomicLong(0); + private final AtomicInteger processedTimePartitions = new AtomicInteger(0); + private final AtomicInteger failedTimePartitions = new AtomicInteger(0); + private long totalTimePartitions = 0; // Configuration private final ExecutorService executor; @@ -73,14 +73,12 @@ public class DataPartitionTableGenerator { private final int seriesSlotNum; private final String seriesPartitionExecutorClass; - private final LeakyBucketRateLimiter limiter = - new LeakyBucketRateLimiter( + private final RateLimiter limiter = + RateLimiter.create( (long) - IoTDBDescriptor.getInstance() - .getConfig() - .getPartitionTableRecoverMaxReadMBsPerSecond() - * 1024 - * 1024); + IoTDBDescriptor.getInstance() + .getConfig() + .getPartitionTableRecoverMaxReadMBsPerSecond()); public static final Set IGNORE_DATABASE = new HashSet() { @@ -90,8 +88,6 @@ public class DataPartitionTableGenerator { } }; - public static final String SCAN_FILE_SUFFIX_NAME = ".tsfile"; - public DataPartitionTableGenerator( ExecutorService executor, Set databases, @@ -132,6 +128,14 @@ private void generateDataPartitionTableByMemory() { seriesPartitionExecutorClass, seriesSlotNum); try { + totalTimePartitions = + StorageEngine.getInstance().getAllDataRegions().stream() + .mapToLong( + dataRegion -> + (dataRegion == null) + ? 
0 + : dataRegion.getTsFileManager().getTimePartitions().size()) + .sum(); for (DataRegion dataRegion : StorageEngine.getInstance().getAllDataRegions()) { CompletableFuture regionFuture = CompletableFuture.runAsync( @@ -178,7 +182,7 @@ private void generateDataPartitionTableByMemory() { }); } catch (Exception e) { LOG.error("Error processing data region: {}", dataRegion.getDatabaseName(), e); - failedFiles.incrementAndGet(); + failedTimePartitions.incrementAndGet(); errorMessage = "Failed to process data region: " + e.getMessage(); } }, @@ -192,8 +196,8 @@ private void generateDataPartitionTableByMemory() { status = TaskStatus.COMPLETED; LOG.info( "DataPartitionTable generation completed successfully. Processed: {}, Failed: {}", - processedFiles.get(), - failedFiles.get()); + processedTimePartitions.get(), + failedTimePartitions.get()); } catch (Exception e) { LOG.error("Failed to generate DataPartitionTable", e); status = TaskStatus.FAILED; @@ -205,10 +209,12 @@ private void constructDataPartitionMap( List seqTsFileList, SeriesPartitionExecutor seriesPartitionExecutor, Map dataPartitionMap) { + Set timeSlotIds = Collections.newSetFromMap(new ConcurrentHashMap<>()); + for (TsFileResource tsFileResource : seqTsFileList) { + long timeSlotId = tsFileResource.getTsFileID().timePartitionId; try { Set devices = tsFileResource.getDevices(limiter); - long timeSlotId = tsFileResource.getTsFileID().timePartitionId; int regionId = tsFileResource.getTsFileID().regionId; TConsensusGroupId consensusGroupId = new TConsensusGroupId(); @@ -225,14 +231,20 @@ private void constructDataPartitionMap( seriesSlotId, empty -> newSeriesPartitionTable(consensusGroupId, timeSlotId)) .putDataPartition(timePartitionSlot, consensusGroupId); } - processedFiles.incrementAndGet(); - totalFiles.incrementAndGet(); + if (!timeSlotIds.contains(timeSlotId)) { + timeSlotIds.add(timeSlotId); + processedTimePartitions.incrementAndGet(); + } } catch (Exception e) { - failedFiles.incrementAndGet(); - 
totalFiles.incrementAndGet(); + if (!timeSlotIds.contains(timeSlotId)) { + timeSlotIds.add(timeSlotId); + failedTimePartitions.incrementAndGet(); + } LOG.error("Failed to process tsfile {}, {}", tsFileResource.getTsFileID(), e.getMessage()); } } + + timeSlotIds.clear(); } private static SeriesPartitionTable newSeriesPartitionTable( @@ -254,9 +266,10 @@ public String getErrorMessage() { } public double getProgress() { - if (totalFiles.get() == 0) { + if (totalTimePartitions == 0) { return 0.0; } - return (double) (processedFiles.get() + failedFiles.get()) / totalFiles.get(); + return (double) (processedTimePartitions.get() + failedTimePartitions.get()) + / totalTimePartitions; } } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java index 58e94e603e3ac..bc81a6b28a851 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/TsFileResource.java @@ -27,7 +27,6 @@ import org.apache.iotdb.commons.pipe.datastructure.resource.PersistentResource; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TestOnly; -import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.conf.IoTDBConfig; import org.apache.iotdb.db.conf.IoTDBDescriptor; import org.apache.iotdb.db.exception.load.PartitionViolationException; @@ -51,6 +50,7 @@ import org.apache.iotdb.db.storageengine.dataregion.tsfile.timeindex.TimeIndexLevel; import org.apache.iotdb.db.storageengine.rescon.disk.TierManager; +import com.google.common.util.concurrent.RateLimiter; import org.apache.tsfile.file.metadata.IChunkMetadata; import org.apache.tsfile.file.metadata.IDeviceID; import 
org.apache.tsfile.file.metadata.ITimeSeriesMetadata; @@ -678,7 +678,7 @@ public Set getDevices() { return timeIndex.getDevices(file.getPath(), this); } - public Set getDevices(LeakyBucketRateLimiter limiter) { + public Set getDevices(RateLimiter limiter) { return timeIndex.getDevices(file.getPath(), this, limiter); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java index 9769ee2c4405e..caca8e9fdba44 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ArrayDeviceTimeIndex.java @@ -23,10 +23,10 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; import org.apache.iotdb.commons.utils.TimePartitionUtils; -import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import com.google.common.util.concurrent.RateLimiter; import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.file.metadata.IDeviceID.Deserializer; import org.apache.tsfile.utils.FilePathUtils; @@ -174,7 +174,7 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc @Override public Set getDevices( - String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + String tsFilePath, TsFileResource tsFileResource, RateLimiter limiter) { return deviceToIndex.keySet(); } diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java 
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java index b79ffc578e83f..2cf89a02626aa 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/FileTimeIndex.java @@ -21,11 +21,12 @@ import org.apache.iotdb.commons.path.PartialPath; import org.apache.iotdb.commons.utils.CommonDateTimeUtils; +import org.apache.iotdb.commons.utils.IOUtils; import org.apache.iotdb.commons.utils.TimePartitionUtils; -import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import com.google.common.util.concurrent.RateLimiter; import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.fileSystem.FSFactoryProducer; import org.apache.tsfile.utils.FilePathUtils; @@ -123,14 +124,14 @@ public Set getDevices(String tsFilePath, TsFileResource tsFileResourc @Override public Set getDevices( - String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter) { + String tsFilePath, TsFileResource tsFileResource, RateLimiter limiter) { tsFileResource.readLock(); try { - limiter.acquire(tsFileResource.getTsFileSize()); - - try (InputStream inputStream = - FSFactoryProducer.getFSFactory() - .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX)) { + try (IOUtils.RatelimitedInputStream inputStream = + new IOUtils.RatelimitedInputStream( + FSFactoryProducer.getFSFactory() + .getBufferedInputStream(tsFilePath + TsFileResource.RESOURCE_SUFFIX), + limiter)) { // The first byte is VERSION_NUMBER, second byte is timeIndexType. 
byte[] bytes = ReadWriteIOUtils.readBytes(inputStream, 2); diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java index 400c478df5054..114a207d75794 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/storageengine/dataregion/tsfile/timeindex/ITimeIndex.java @@ -20,10 +20,10 @@ package org.apache.iotdb.db.storageengine.dataregion.tsfile.timeindex; import org.apache.iotdb.commons.path.PartialPath; -import org.apache.iotdb.commons.utils.rateLimiter.LeakyBucketRateLimiter; import org.apache.iotdb.db.exception.load.PartitionViolationException; import org.apache.iotdb.db.storageengine.dataregion.tsfile.TsFileResource; +import com.google.common.util.concurrent.RateLimiter; import org.apache.tsfile.file.metadata.IDeviceID; import org.apache.tsfile.utils.Pair; import org.apache.tsfile.utils.ReadWriteIOUtils; @@ -80,8 +80,7 @@ ITimeIndex deserialize(InputStream inputStream, IDeviceID.Deserializer deseriali * * @return device names */ - Set getDevices( - String tsFilePath, TsFileResource tsFileResource, LeakyBucketRateLimiter limiter); + Set getDevices(String tsFilePath, TsFileResource tsFileResource, RateLimiter limiter); /** * @return whether end time is empty (Long.MIN_VALUE) diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/IOUtils.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/IOUtils.java index 8b63d29bd786e..d234c6be30518 100644 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/IOUtils.java +++ b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/IOUtils.java @@ -25,10 +25,12 @@ import org.apache.iotdb.commons.path.PartialPath; import 
com.google.common.base.Supplier; +import com.google.common.util.concurrent.RateLimiter; import java.io.DataInputStream; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.util.Map; @@ -289,4 +291,37 @@ public static Optional retryNoException( } return Optional.empty(); } + + public static class RatelimitedInputStream extends InputStream { + private RateLimiter rateLimiter; + private InputStream inner; + + public RatelimitedInputStream(InputStream inner, RateLimiter limiter) { + this.inner = inner; + this.rateLimiter = limiter; + } + + @Override + public int read() throws IOException { + rateLimiter.acquire(1); + return inner.read(); + } + + @Override + public int read(byte[] b) throws IOException { + rateLimiter.acquire(b.length); + return inner.read(b); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + rateLimiter.acquire(len); + return inner.read(b, off, len); + } + + @Override + public void close() throws IOException { + inner.close(); + } + } } diff --git a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java b/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java deleted file mode 100644 index 287030753b28a..0000000000000 --- a/iotdb-core/node-commons/src/main/java/org/apache/iotdb/commons/utils/rateLimiter/LeakyBucketRateLimiter.java +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.iotdb.commons.utils.rateLimiter; - -import java.util.concurrent.atomic.AtomicLong; -import java.util.concurrent.locks.LockSupport; - -/** - * A global leaky-bucket rate limiter for bytes throughput. Features: - Strict throughput limiting - * (no burst) - Smooth bandwidth shaping - Thread-safe - Fair for multi-thread - Low contention - */ -public class LeakyBucketRateLimiter { - /** bytes per second */ - private volatile long bytesPerSecond; - - /** start time */ - private final long startTimeNs; - - /** total consumed bytes */ - private final AtomicLong totalBytes = new AtomicLong(0); - - public LeakyBucketRateLimiter(long bytesPerSecond) { - if (bytesPerSecond <= 0) { - throw new IllegalArgumentException("bytesPerSecond must be > 0"); - } - this.bytesPerSecond = bytesPerSecond; - this.startTimeNs = System.nanoTime(); - } - - /** - * Acquire permission for reading bytes. - * - *

This method will block if reading too fast. - */ - public void acquire(long bytes) { - if (bytes <= 0) { - return; - } - - long currentTotal = totalBytes.addAndGet(bytes); - - long expectedTimeNs = expectedTimeNs(currentTotal); - long now = System.nanoTime(); - - long sleepNs = expectedTimeNs - now; - - if (sleepNs > 0) { - LockSupport.parkNanos(sleepNs); - } - } - - /** - * Try acquire without blocking. - * - * @return true if allowed immediately - */ - public boolean tryAcquire(long bytes) { - if (bytes <= 0) { - return true; - } - - long currentTotal = totalBytes.addAndGet(bytes); - - long expectedTimeNs = expectedTimeNs(currentTotal); - long now = System.nanoTime(); - - if (expectedTimeNs <= now) { - return true; - } - - // rollback - totalBytes.addAndGet(-bytes); - return false; - } - - /** Update rate dynamically. */ - public void setRate(long newBytesPerSecond) { - if (newBytesPerSecond <= 0) { - throw new IllegalArgumentException("bytesPerSecond must be > 0"); - } - this.bytesPerSecond = newBytesPerSecond; - } - - /** Current rate. */ - public long getRate() { - return bytesPerSecond; - } - - /** Total bytes processed. */ - public long getTotalBytes() { - return totalBytes.get(); - } - - /** - * Calculate the expected time using double (double can easily hold nanoseconds on the order of - * 10^18), then perform clamping and convert to long. Advantages: Extremely simple, zero - * exceptions thrown, and double precision is sufficient (nanosecond-level errors are negligible). - * Disadvantages: In extreme cases (when totalBytes is close to 2^63), double loses precision in - * the trailing digits. However, in IoTDB's actual scenarios, bytesPerSecond is typically between - * 10MB/s and 1GB/s, so this situation will not occur. 
- */ - private long expectedTimeNs(long totalBytes) { - if (totalBytes <= 0) { - return startTimeNs; - } - - // Use double for calculations to avoid overflow in long multiplication - double seconds = (double) totalBytes / bytesPerSecond; - double elapsedNsDouble = seconds * 1_000_000_000.0; - - if (elapsedNsDouble > Long.MAX_VALUE - startTimeNs) { - // clamp - return Long.MAX_VALUE; - } - - return startTimeNs + (long) elapsedNsDouble; - } -} From 899d1347838d5553a138de1a9760f45ed5f898ee Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 20 Mar 2026 17:15:29 +0800 Subject: [PATCH 37/39] Fix --- .../iotdb/db/partition/DataPartitionTableGenerator.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java index 8fb38a6aa174b..1b43cb2bc5446 100644 --- a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java +++ b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/partition/DataPartitionTableGenerator.java @@ -76,9 +76,11 @@ public class DataPartitionTableGenerator { private final RateLimiter limiter = RateLimiter.create( (long) - IoTDBDescriptor.getInstance() - .getConfig() - .getPartitionTableRecoverMaxReadMBsPerSecond()); + IoTDBDescriptor.getInstance() + .getConfig() + .getPartitionTableRecoverMaxReadMBsPerSecond() + * 1024 + * 1024); public static final Set IGNORE_DATABASE = new HashSet() { From 14a71abf85f4a97d34eef7a5d0a078b4cdd12cba Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 20 Mar 2026 17:34:41 +0800 Subject: [PATCH 38/39] Fix bug --- ...PartitionTableIntegrityCheckProcedure.java | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 88b8deef496d4..05cc3ec13024b 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -55,7 +55,6 @@ import org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; @@ -705,19 +704,18 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(tableList.size()); for (DatabaseScopedDataPartitionTable table : tableList) { - try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream dos = new DataOutputStream(baos)) { + try (final PublicBAOS publicBAOS = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { - TTransport transport = new TIOStreamTransport(dos); + TTransport transport = new TIOStreamTransport(outputStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); - table.serialize(dos, protocol); + table.serialize(outputStream, protocol); - byte[] data = baos.toByteArray(); - // Length of data written for a single object - stream.writeInt(data.length); - // data written for a single object - stream.write(data); + byte[] buf = publicBAOS.getBuf(); + int size = publicBAOS.size(); + ReadWriteIOUtils.write(size, outputStream); + stream.write(buf, 0, size); } catch (IOException | TException e) { LOG.error( "[DataPartitionIntegrity] {} serialize failed for dataNodeId: {}", @@ -747,9 +745,10 @@ public void serialize(final DataOutputStream stream) throws IOException { entry.getValue().serialize(outputStream, protocol); - byte[] data = publicBAOS.getBuf(); - 
stream.writeInt(data.length); - stream.write(data); + byte[] buf = publicBAOS.getBuf(); + int size = publicBAOS.size(); + ReadWriteIOUtils.write(size, outputStream); + stream.write(buf, 0, size); } catch (IOException | TException e) { LOG.error( "[DataPartitionIntegrity] {} serialize finalDataPartitionTables failed", @@ -764,14 +763,16 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(skipDataNodes.size()); for (TDataNodeConfiguration skipDataNode : skipDataNodes) { - try (final PublicBAOS publicBAOS = new PublicBAOS()) { - TTransport transport = new TIOStreamTransport(publicBAOS); + try (final PublicBAOS publicBAOS = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { + TTransport transport = new TIOStreamTransport(outputStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); skipDataNode.write(protocol); - byte[] data = publicBAOS.getBuf(); - stream.writeInt(data.length); - stream.write(data); + byte[] buf = publicBAOS.getBuf(); + int size = publicBAOS.size(); + ReadWriteIOUtils.write(size, outputStream); + stream.write(buf, 0, size); } catch (TException e) { LOG.error("[DataPartitionIntegrity] Failed to serialize skipDataNode", e); throw new IOException("Failed to serialize skipDataNode", e); @@ -780,14 +781,16 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(failedDataNodes.size()); for (TDataNodeConfiguration failedDataNode : failedDataNodes) { - try (final PublicBAOS publicBAOS = new PublicBAOS()) { - TTransport transport = new TIOStreamTransport(publicBAOS); + try (final PublicBAOS publicBAOS = new PublicBAOS(); + final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { + TTransport transport = new TIOStreamTransport(outputStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); failedDataNode.write(protocol); - byte[] data = publicBAOS.getBuf(); - stream.writeInt(data.length); - 
stream.write(data); + byte[] buf = publicBAOS.getBuf(); + int size = publicBAOS.size(); + ReadWriteIOUtils.write(size, outputStream); + stream.write(buf, 0, size); } catch (TException e) { LOG.error("[DataPartitionIntegrity] Failed to serialize failedDataNode", e); throw new IOException("Failed to serialize failedDataNode", e); From fd43f1e2a8533a6a99917f1b15641be999fa4ca4 Mon Sep 17 00:00:00 2001 From: libo Date: Fri, 20 Mar 2026 18:43:46 +0800 Subject: [PATCH 39/39] Fix serialize bugs --- ...PartitionTableIntegrityCheckProcedure.java | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java index 05cc3ec13024b..0ff1ec91acdb0 100644 --- a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java +++ b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/partition/DataPartitionTableIntegrityCheckProcedure.java @@ -705,16 +705,16 @@ public void serialize(final DataOutputStream stream) throws IOException { for (DatabaseScopedDataPartitionTable table : tableList) { try (final PublicBAOS publicBAOS = new PublicBAOS(); - final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { + final DataOutputStream tmpStream = new DataOutputStream(publicBAOS)) { - TTransport transport = new TIOStreamTransport(outputStream); + TTransport transport = new TIOStreamTransport(tmpStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); - table.serialize(outputStream, protocol); + table.serialize(tmpStream, protocol); byte[] buf = publicBAOS.getBuf(); int size = publicBAOS.size(); - ReadWriteIOUtils.write(size, outputStream); + ReadWriteIOUtils.write(size, 
stream); stream.write(buf, 0, size); } catch (IOException | TException e) { LOG.error( @@ -739,15 +739,15 @@ public void serialize(final DataOutputStream stream) throws IOException { ReadWriteIOUtils.write(entry.getKey(), stream); try (final PublicBAOS publicBAOS = new PublicBAOS(); - final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { - TTransport transport = new TIOStreamTransport(outputStream); + final DataOutputStream tmpStream = new DataOutputStream(publicBAOS)) { + TTransport transport = new TIOStreamTransport(tmpStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); - entry.getValue().serialize(outputStream, protocol); + entry.getValue().serialize(tmpStream, protocol); byte[] buf = publicBAOS.getBuf(); int size = publicBAOS.size(); - ReadWriteIOUtils.write(size, outputStream); + ReadWriteIOUtils.write(size, stream); stream.write(buf, 0, size); } catch (IOException | TException e) { LOG.error( @@ -764,14 +764,14 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(skipDataNodes.size()); for (TDataNodeConfiguration skipDataNode : skipDataNodes) { try (final PublicBAOS publicBAOS = new PublicBAOS(); - final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { - TTransport transport = new TIOStreamTransport(outputStream); + final DataOutputStream tmpStream = new DataOutputStream(publicBAOS)) { + TTransport transport = new TIOStreamTransport(tmpStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); skipDataNode.write(protocol); byte[] buf = publicBAOS.getBuf(); int size = publicBAOS.size(); - ReadWriteIOUtils.write(size, outputStream); + ReadWriteIOUtils.write(size, stream); stream.write(buf, 0, size); } catch (TException e) { LOG.error("[DataPartitionIntegrity] Failed to serialize skipDataNode", e); @@ -782,14 +782,14 @@ public void serialize(final DataOutputStream stream) throws IOException { stream.writeInt(failedDataNodes.size()); for (TDataNodeConfiguration 
failedDataNode : failedDataNodes) { try (final PublicBAOS publicBAOS = new PublicBAOS(); - final DataOutputStream outputStream = new DataOutputStream(publicBAOS)) { - TTransport transport = new TIOStreamTransport(outputStream); + final DataOutputStream tmpStream = new DataOutputStream(publicBAOS)) { + TTransport transport = new TIOStreamTransport(tmpStream); TBinaryProtocol protocol = new TBinaryProtocol(transport); failedDataNode.write(protocol); byte[] buf = publicBAOS.getBuf(); int size = publicBAOS.size(); - ReadWriteIOUtils.write(size, outputStream); + ReadWriteIOUtils.write(size, stream); stream.write(buf, 0, size); } catch (TException e) { LOG.error("[DataPartitionIntegrity] Failed to serialize failedDataNode", e);