From 3b478d5307edef89ceaba013984ea951e3fb866f Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 20 Mar 2026 09:49:40 +0800 Subject: [PATCH 1/5] Update pyproject.toml --- iotdb-core/ainode/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iotdb-core/ainode/pyproject.toml b/iotdb-core/ainode/pyproject.toml index d92a466daf73..9a142fe72596 100644 --- a/iotdb-core/ainode/pyproject.toml +++ b/iotdb-core/ainode/pyproject.toml @@ -79,7 +79,7 @@ exclude = [ python = ">=3.11.0,<3.12.0" # ---- DL / HF stack ---- -torch = "^2.8.0,<2.9.0" +torch = "^2.9.0,<2.10.0" torchmetrics = "^1.8.0" transformers = "==4.56.2" tokenizers = ">=0.22.0,<=0.23.0" From 8dfa8fce7db0d0fe64fd93bf1193efebd2fad2df Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 20 Mar 2026 11:07:07 +0800 Subject: [PATCH 2/5] gracefully stopping AINode in CI env --- .../it/env/cluster/node/AINodeWrapper.java | 45 +++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java index 15c2e4761dda..3a30db6dcc30 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java @@ -38,6 +38,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Properties; +import java.util.concurrent.TimeUnit; import java.util.stream.Stream; import static org.apache.iotdb.it.env.cluster.ClusterConstant.AI_NODE_NAME; @@ -52,7 +53,8 @@ public class AINodeWrapper extends AbstractNodeWrapper { private final String seedConfigNode; private final int clusterIngressPort; - private static final String SCRIPT_FILE = "start-ainode.sh"; + private static final String START_SCRIPT_FILE = "start-ainode.sh"; + private static final String
STOP_SCRIPT_FILE = "stop-ainode.sh"; private static final String SHELL_COMMAND = "bash"; @@ -165,8 +167,8 @@ public void start() { // start AINode List startCommand = new ArrayList<>(); startCommand.add(SHELL_COMMAND); - startCommand.add(filePrefix + File.separator + SCRIPT_PATH + File.separator + SCRIPT_FILE); - startCommand.add("-r"); + startCommand.add( + filePrefix + File.separator + SCRIPT_PATH + File.separator + START_SCRIPT_FILE); ProcessBuilder processBuilder = new ProcessBuilder(startCommand) @@ -179,6 +181,43 @@ public void start() { } } + @Override + public void stop() { + if (this.instance == null) { + return; + } + try { + // stop AINode + File stdoutFile = new File(getLogPath()); + String filePrefix = getNodePath(); + List stopCommand = new ArrayList<>(); + stopCommand.add(SHELL_COMMAND); + stopCommand.add( + filePrefix + File.separator + SCRIPT_PATH + File.separator + STOP_SCRIPT_FILE); + ProcessBuilder processBuilder = + new ProcessBuilder(stopCommand) + .redirectOutput(ProcessBuilder.Redirect.appendTo(stdoutFile)) + .redirectError(ProcessBuilder.Redirect.appendTo(stdoutFile)); + Process stopProcess = processBuilder.inheritIO().start(); + if (!stopProcess.waitFor(20, TimeUnit.SECONDS)) { + logger.warn("Node {} does not exit within 20s, killing it", getId()); + if (!this.instance.destroyForcibly().waitFor(10, TimeUnit.SECONDS)) { + logger.error("Cannot forcibly stop node {}", getId()); + } + } + int exitCode = stopProcess.exitValue(); + if (exitCode != 0) { + logger.warn("Node {}'s stop script exited with code {}", getId(), exitCode); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.error("Waiting node to shutdown error.", e); + } catch (IOException e) { + logger.error("Waiting node to shutdown error.", e); + } + logger.info("In test {} {} stopped.", getTestLogDirName(), getId()); + } + @Override public int getMetricPort() { // no metric currently From 6f463be03094945f6599f447d6df21cd6fd275a7 Mon Sep 17 00:00:00 
2001 From: Yongzao <532741407@qq.com> Date: Fri, 20 Mar 2026 12:11:14 +0800 Subject: [PATCH 3/5] real stop func --- .../org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java index 3a30db6dcc30..d452e19f381a 100644 --- a/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java +++ b/integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java @@ -218,6 +218,11 @@ public void stop() { logger.info("In test {} {} stopped.", getTestLogDirName(), getId()); } + @Override + public void stopForcibly() { + this.stop(); + } + @Override public int getMetricPort() { // no metric currently From db57cb8ef17814f6587aded78b22f44f02c9eba3 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 20 Mar 2026 14:03:16 +0800 Subject: [PATCH 4/5] Update script.py --- iotdb-core/ainode/iotdb/ainode/core/script.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/iotdb-core/ainode/iotdb/ainode/core/script.py b/iotdb-core/ainode/iotdb/ainode/core/script.py index 38653d7ceab9..86373a3e0656 100644 --- a/iotdb-core/ainode/iotdb/ainode/core/script.py +++ b/iotdb-core/ainode/iotdb/ainode/core/script.py @@ -15,11 +15,26 @@ # specific language governing permissions and limitations # under the License. 
# + import multiprocessing import sys +# PyInstaller multiprocessing support +# freeze_support() is essential for PyInstaller frozen executables on all platforms +# It detects if the current process is a multiprocessing child process +# If it is, it executes the child process target function and exits +# If it's not, it returns immediately and continues with main() execution +# This prevents child processes from executing the main application logic +if getattr(sys, "frozen", False): + # Call freeze_support() for both standard multiprocessing and torch.multiprocessing + multiprocessing.freeze_support() + multiprocessing.set_start_method("spawn", force=True) + import torch.multiprocessing as mp +mp.freeze_support() +mp.set_start_method("spawn", force=True) + from iotdb.ainode.core.ai_node import AINode from iotdb.ainode.core.log import Logger @@ -42,7 +57,6 @@ def main(): command = arguments[1] if command == "start": try: - mp.set_start_method("spawn", force=True) logger.info(f"Current multiprocess start method: {mp.get_start_method()}") logger.info("IoTDB-AINode is starting...") ai_node = AINode() @@ -55,15 +69,4 @@ def main(): if __name__ == "__main__": - # PyInstaller multiprocessing support - # freeze_support() is essential for PyInstaller frozen executables on all platforms - # It detects if the current process is a multiprocessing child process - # If it is, it executes the child process target function and exits - # If it's not, it returns immediately and continues with main() execution - # This prevents child processes from executing the main application logic - if getattr(sys, "frozen", False): - # Call freeze_support() for both standard multiprocessing and torch.multiprocessing - multiprocessing.freeze_support() - mp.freeze_support() - main() From 493a2effa99905f52fdc3db6871e95e5dfaf5dc2 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Fri, 20 Mar 2026 14:38:40 +0800 Subject: [PATCH 5/5] Update cuda_backend.py --- 
.../core/device/backend/cuda_backend.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/iotdb-core/ainode/iotdb/ainode/core/device/backend/cuda_backend.py b/iotdb-core/ainode/iotdb/ainode/core/device/backend/cuda_backend.py index c7533cc4dd7e..553101bb8446 100644 --- a/iotdb-core/ainode/iotdb/ainode/core/device/backend/cuda_backend.py +++ b/iotdb-core/ainode/iotdb/ainode/core/device/backend/cuda_backend.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # +import time import torch @@ -24,6 +25,9 @@ class CUDABackend(BackendAdapter): type = BackendType.CUDA + def __init__(self) -> None: + self._safe_cuda_init() + def is_available(self) -> bool: return torch.cuda.is_available() @@ -37,3 +41,19 @@ def make_device(self, index: int | None) -> torch.device: def set_device(self, index: int) -> None: torch.cuda.set_device(index) + + def _safe_cuda_init(self) -> None: + # Safe CUDA initialization to avoid potential deadlocks + # This is a workaround for certain PyTorch versions where the first CUDA call can cause a long delay + # By calling a simple CUDA operation at startup, we can ensure that the CUDA context is initialized early + # and avoid unexpected delays during actual model loading or inference. + attempt_cnt = 3 + for attempt in range(attempt_cnt): + try: + if self.is_available(): + return + raise RuntimeError("CUDA not available") + except Exception as e: + print(f"CUDA init attempt {attempt + 1} failed: {e}") + if attempt < attempt_cnt: + time.sleep(1.5)