From edfe4cbf936ebed41f82500f8b183bea225976ee Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Tue, 18 Nov 2025 12:51:18 -0600 Subject: [PATCH 1/8] rocm tech support changes --- .../plugins/inband/rocm/rocm_collector.py | 19 +++++++++++++++++++ nodescraper/plugins/inband/rocm/rocmdata.py | 4 ++++ 2 files changed, 23 insertions(+) diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index 37470f68..eb186ea6 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -42,6 +42,9 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): "/opt/rocm/.info/version-rocm", "/opt/rocm/.info/version", ] + CMD_ROCMINFO = "rocminfo" + CMD_ROCM_VERSIONED_PATHS = "ls -v -d /opt/rocm-[3-7]* | tail -1" + CMD_ROCM_ALL_PATHS = "ls -v -d /opt/rocm*" def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: """Collect ROCm version data from the system. @@ -59,6 +62,22 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: res = self._run_sut_cmd(f"grep . {path}") if res.exit_code == 0: rocm_data = RocmDataModel(rocm_version=res.stdout) + + # Collect rocminfo output + rocminfo_res = self._run_sut_cmd(self.CMD_ROCMINFO) + if rocminfo_res.exit_code == 0: + rocm_data.rocminfo = rocminfo_res.stdout + + # Collect latest versioned ROCm path (rocm-[3-7]*) + versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_VERSIONED_PATHS) + if versioned_path_res.exit_code == 0: + rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout + + # Collect all ROCm paths + all_paths_res = self._run_sut_cmd(self.CMD_ROCM_ALL_PATHS) + if all_paths_res.exit_code == 0: + rocm_data.rocm_all_paths = all_paths_res.stdout + self._log_event( category="ROCM_VERSION_READ", description="ROCm version data collected", diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index 2c5388e8..07030075 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,6 +24,7 @@ # ############################################################################### import re +from typing import Optional from pydantic import field_validator @@ -32,6 +33,9 @@ class RocmDataModel(DataModel): rocm_version: str + rocminfo: Optional[str] = None + rocm_latest_versioned_path: Optional[str] = None + rocm_all_paths: Optional[str] = None @field_validator("rocm_version") @classmethod From 445b561471143e602037f4c606c2970e7097e991 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Tue, 18 Nov 2025 15:06:12 -0600 Subject: [PATCH 2/8] Store rocminfo and paths as lists for better JSON formatting, strip ANSI codes --- .../plugins/inband/rocm/rocm_collector.py | 41 +++++++++++++++---- nodescraper/plugins/inband/rocm/rocmdata.py | 6 +-- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index eb186ea6..70017ea3 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -23,6 +23,7 @@ # SOFTWARE. # ############################################################################### +import re from typing import Optional from nodescraper.base import InBandDataCollector @@ -46,6 +47,12 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): CMD_ROCM_VERSIONED_PATHS = "ls -v -d /opt/rocm-[3-7]* | tail -1" CMD_ROCM_ALL_PATHS = "ls -v -d /opt/rocm*" + @staticmethod + def _strip_ansi_codes(text: str) -> str: + """Remove ANSI escape codes from text.""" + ansi_escape = re.compile(r"\x1b\[[0-9;]*m") + return ansi_escape.sub("", text) + def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: """Collect ROCm version data from the system. @@ -63,28 +70,48 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: if res.exit_code == 0: rocm_data = RocmDataModel(rocm_version=res.stdout) - # Collect rocminfo output + # Collect rocminfo output as list of lines with ANSI codes stripped rocminfo_res = self._run_sut_cmd(self.CMD_ROCMINFO) if rocminfo_res.exit_code == 0: - rocm_data.rocminfo = rocminfo_res.stdout + # Split into lines and strip ANSI codes from each line + rocm_data.rocminfo = [ + self._strip_ansi_codes(line) + for line in rocminfo_res.stdout.strip().split("\n") + ] # Collect latest versioned ROCm path (rocm-[3-7]*) versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_VERSIONED_PATHS) if versioned_path_res.exit_code == 0: - rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout + rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip() - # Collect all ROCm paths + # Collect all ROCm paths as list all_paths_res = self._run_sut_cmd(self.CMD_ROCM_ALL_PATHS) if all_paths_res.exit_code == 0: - rocm_data.rocm_all_paths = all_paths_res.stdout + rocm_data.rocm_all_paths = [ + path.strip() + for path in all_paths_res.stdout.strip().split("\n") + if path.strip() + ] + + # Create concise summary for logging + log_summary = { + "rocm_version": rocm_data.rocm_version, + "rocminfo_lines_collected": ( + len(rocm_data.rocminfo) if rocm_data.rocminfo else 0 + ), + "rocm_latest_versioned_path": rocm_data.rocm_latest_versioned_path, + "rocm_paths_count": ( + len(rocm_data.rocm_all_paths) if rocm_data.rocm_all_paths else 0 + ), + } self._log_event( category="ROCM_VERSION_READ", description="ROCm version data collected", - data=rocm_data.model_dump(), + data=log_summary, priority=EventPriority.INFO, ) - self.result.message = f"ROCm: {rocm_data.model_dump()}" + self.result.message = f"ROCm version: {rocm_data.rocm_version}, Latest path: {rocm_data.rocm_latest_versioned_path}" self.result.status = ExecutionStatus.OK break else: diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index 07030075..4c5ae413 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,7 +24,7 @@ # ############################################################################### import re -from typing import Optional +from typing import List, Optional from pydantic import field_validator @@ -33,9 +33,9 @@ class RocmDataModel(DataModel): rocm_version: str - rocminfo: Optional[str] = None + rocminfo: Optional[List[str]] = None rocm_latest_versioned_path: Optional[str] = None - rocm_all_paths: Optional[str] = None + rocm_all_paths: Optional[List[str]] = None @field_validator("rocm_version") @classmethod From 10cdf0d399b645b8b0a4149f488fa77b6a6046f3 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Thu, 20 Nov 2025 13:22:25 -0600 Subject: [PATCH 3/8] initial rocm commands --- .../plugins/inband/rocm/rocm_collector.py | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index 70017ea3..a4bc797c 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -27,6 +27,7 @@ from typing import Optional from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult @@ -79,6 +80,11 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: for line in rocminfo_res.stdout.strip().split("\n") ] + # Add rocminfo output as a text file artifact + self.result.artifacts.append( + TextFileArtifact(filename="rocminfo.log", contents=rocminfo_res.stdout) + ) + # Collect latest versioned ROCm path (rocm-[3-7]*) versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_VERSIONED_PATHS) if versioned_path_res.exit_code == 0: @@ -93,25 +99,13 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: if path.strip() ] - # Create concise summary for logging - log_summary = { - "rocm_version": rocm_data.rocm_version, - "rocminfo_lines_collected": ( - len(rocm_data.rocminfo) if rocm_data.rocminfo else 0 - ), - "rocm_latest_versioned_path": rocm_data.rocm_latest_versioned_path, - "rocm_paths_count": ( - len(rocm_data.rocm_all_paths) if rocm_data.rocm_all_paths else 0 - ), - } - self._log_event( category="ROCM_VERSION_READ", description="ROCm version data collected", - data=log_summary, + data=rocm_data.model_dump(include={"rocm_version"}), priority=EventPriority.INFO, ) - self.result.message = f"ROCm version: {rocm_data.rocm_version}, Latest path: {rocm_data.rocm_latest_versioned_path}" + self.result.message = f"ROCm version: {rocm_data.rocm_version}" self.result.status = ExecutionStatus.OK break else: From e9db5df4d8e67068f26c6d0d92307fee018bed42 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Thu, 20 Nov 2025 14:28:55 -0600 Subject: [PATCH 4/8] initial rocm cmds --- nodescraper/plugins/inband/rocm/rocmdata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index 4c5ae413..faa41881 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,7 +24,7 @@ # ############################################################################### import re -from typing import List, Optional +from typing import List from pydantic import field_validator @@ -33,9 +33,9 @@ class RocmDataModel(DataModel): rocm_version: str - rocminfo: Optional[List[str]] = None - rocm_latest_versioned_path: Optional[str] = None - rocm_all_paths: Optional[List[str]] = None + rocminfo: List[str] = [] + rocm_latest_versioned_path: str = "" + rocm_all_paths: List[str] = [] @field_validator("rocm_version") @classmethod From 80a76df4b8d9f74a858ec9e47d61a8579b63d7c9 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Mon, 24 Nov 2025 12:11:11 -0600 Subject: [PATCH 5/8] adding more rocm commands --- .../plugins/inband/rocm/rocm_collector.py | 144 ++++++++++++++---- nodescraper/plugins/inband/rocm/rocmdata.py | 5 + 2 files changed, 116 insertions(+), 33 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index a4bc797c..0b22b23c 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -44,13 +44,26 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): "/opt/rocm/.info/version-rocm", "/opt/rocm/.info/version", ] - CMD_ROCMINFO = "rocminfo" - CMD_ROCM_VERSIONED_PATHS = "ls -v -d /opt/rocm-[3-7]* | tail -1" - CMD_ROCM_ALL_PATHS = "ls -v -d /opt/rocm*" + CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" + CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" + CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" + CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*" + CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'" + CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" + CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo" + CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/" @staticmethod def _strip_ansi_codes(text: str) -> str: - """Remove ANSI escape codes from text.""" + """ + Remove ANSI escape codes from text. + + Args: + text (str): The text string containing ANSI escape codes. + + Returns: + str: The text with ANSI escape codes removed. + """ ansi_escape = re.compile(r"\x1b\[[0-9;]*m") return ansi_escape.sub("", text) @@ -70,35 +83,6 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: res = self._run_sut_cmd(f"grep . {path}") if res.exit_code == 0: rocm_data = RocmDataModel(rocm_version=res.stdout) - - # Collect rocminfo output as list of lines with ANSI codes stripped - rocminfo_res = self._run_sut_cmd(self.CMD_ROCMINFO) - if rocminfo_res.exit_code == 0: - # Split into lines and strip ANSI codes from each line - rocm_data.rocminfo = [ - self._strip_ansi_codes(line) - for line in rocminfo_res.stdout.strip().split("\n") - ] - - # Add rocminfo output as a text file artifact - self.result.artifacts.append( - TextFileArtifact(filename="rocminfo.log", contents=rocminfo_res.stdout) - ) - - # Collect latest versioned ROCm path (rocm-[3-7]*) - versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_VERSIONED_PATHS) - if versioned_path_res.exit_code == 0: - rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip() - - # Collect all ROCm paths as list - all_paths_res = self._run_sut_cmd(self.CMD_ROCM_ALL_PATHS) - if all_paths_res.exit_code == 0: - rocm_data.rocm_all_paths = [ - path.strip() - for path in all_paths_res.stdout.strip().split("\n") - if path.strip() - ] - self._log_event( category="ROCM_VERSION_READ", description="ROCm version data collected", @@ -116,6 +100,100 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: priority=EventPriority.ERROR, ) + # Collect additional ROCm data if version was found + if rocm_data: + # Collect latest versioned ROCm path (rocm-[3-7]*) + versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST) + if versioned_path_res.exit_code == 0: + rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip() + + # Collect all ROCm paths as list + all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS) + if all_paths_res.exit_code == 0: + rocm_data.rocm_all_paths = [ + path.strip() + for path in all_paths_res.stdout.strip().split("\n") + if path.strip() + ] + + # Determine ROCm path for commands that need it + rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm" + + # Collect rocminfo output as list of lines with ANSI codes stripped + rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path) + rocminfo_res = self._run_sut_cmd(rocminfo_cmd) + rocminfo_artifact_content = "" + if rocminfo_res.exit_code == 0: + # Split into lines and strip ANSI codes from each line + rocm_data.rocminfo = [ + self._strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n") + ] + rocminfo_artifact_content += "=" * 80 + "\n" + rocminfo_artifact_content += "ROCMNFO OUTPUT\n" + rocminfo_artifact_content += "=" * 80 + "\n\n" + rocminfo_artifact_content += rocminfo_res.stdout + + # Collect ld.so.conf ROCm entries + ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF) + if ld_conf_res.exit_code == 0: + rocm_data.ld_conf_rocm = [ + line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip() + ] + + # Collect ROCm libraries from ldconfig + rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS) + if rocm_libs_res.exit_code == 0: + rocm_data.rocm_libs = [ + line.strip() + for line in rocm_libs_res.stdout.strip().split("\n") + if line.strip() + ] + + # Collect ROCm-related environment variables + env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS) + if env_vars_res.exit_code == 0: + rocm_data.env_vars = [ + line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip() + ] + + # Collect clinfo output + clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path) + clinfo_res = self._run_sut_cmd(clinfo_cmd) + + # Always append clinfo section to artifact, even if empty or failed + if rocminfo_artifact_content: + rocminfo_artifact_content += "\n\n" + rocminfo_artifact_content += "=" * 80 + "\n" + rocminfo_artifact_content += "CLINFO OUTPUT\n" + rocminfo_artifact_content += "=" * 80 + "\n\n" + + if clinfo_res.exit_code == 0: + rocm_data.clinfo = [ + self._strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n") + ] + rocminfo_artifact_content += clinfo_res.stdout + else: + # Add error information if clinfo failed + rocminfo_artifact_content += f"Command: {clinfo_res.command}\n" + rocminfo_artifact_content += f"Exit Code: {clinfo_res.exit_code}\n" + if clinfo_res.stderr: + rocminfo_artifact_content += f"Error: {clinfo_res.stderr}\n" + if clinfo_res.stdout: + rocminfo_artifact_content += f"Output: {clinfo_res.stdout}\n" + + # Add combined rocminfo and clinfo output as a text file artifact + if rocminfo_artifact_content: + self.result.artifacts.append( + TextFileArtifact(filename="rocminfo.log", contents=rocminfo_artifact_content) + ) + + # Collect KFD process list + kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC) + if kfd_proc_res.exit_code == 0: + rocm_data.kfd_proc = [ + proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip() + ] + if not rocm_data: self._log_event( category=EventCategory.OS, diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index faa41881..f0fb2618 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -36,6 +36,11 @@ class RocmDataModel(DataModel): rocminfo: List[str] = [] rocm_latest_versioned_path: str = "" rocm_all_paths: List[str] = [] + ld_conf_rocm: List[str] = [] + rocm_libs: List[str] = [] + env_vars: List[str] = [] + clinfo: List[str] = [] + kfd_proc: List[str] = [] @field_validator("rocm_version") @classmethod From 995025f63a68fddc17b7043d4c6e37dfb6986db0 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Wed, 26 Nov 2025 19:56:54 -0600 Subject: [PATCH 6/8] test collector fix --- .../plugins/inband/rocm/analyzer_args.py | 1 + .../plugins/inband/rocm/rocm_analyzer.py | 45 ++- test/unit/plugin/test_rocm_analyzer.py | 21 +- test/unit/plugin/test_rocm_collector.py | 366 ++++++++++++++++-- 4 files changed, 379 insertions(+), 54 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/analyzer_args.py b/nodescraper/plugins/inband/rocm/analyzer_args.py index 40a11ebc..70545b0c 100644 --- a/nodescraper/plugins/inband/rocm/analyzer_args.py +++ b/nodescraper/plugins/inband/rocm/analyzer_args.py @@ -32,6 +32,7 @@ class RocmAnalyzerArgs(BaseModel): exp_rocm: Union[str, list] = Field(default_factory=list) + exp_rocm_latest: str = Field(default="") @field_validator("exp_rocm", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/rocm/rocm_analyzer.py b/nodescraper/plugins/inband/rocm/rocm_analyzer.py index 17762af7..e9a344f6 100644 --- a/nodescraper/plugins/inband/rocm/rocm_analyzer.py +++ b/nodescraper/plugins/inband/rocm/rocm_analyzer.py @@ -61,17 +61,40 @@ def analyze_data( if data.rocm_version == rocm_version: self.result.message = "ROCm version matches expected" self.result.status = ExecutionStatus.OK + break + else: + # No matching version found + self.result.message = ( + f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}" + ) + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.SW_DRIVER, + description=f"{self.result.message}", + data={"expected": args.exp_rocm, "actual": data.rocm_version}, + priority=EventPriority.CRITICAL, + console_log=True, + ) + return self.result + + # validate rocm_latest if provided in args (only if version check passed) + if args.exp_rocm_latest: + if data.rocm_latest_versioned_path != args.exp_rocm_latest: + self.result.message = f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}" + self.result.status = ExecutionStatus.ERROR + self._log_event( + category=EventCategory.SW_DRIVER, + description=f"{self.result.message}", + data={ + "expected": args.exp_rocm_latest, + "actual": data.rocm_latest_versioned_path, + }, + priority=EventPriority.CRITICAL, + console_log=True, + ) return self.result + else: + # Update message to include rocm_latest validation result + self.result.message = f"ROCm version matches expected. ROCm latest path validated: {data.rocm_latest_versioned_path}" - self.result.message = ( - f"ROCm version mismatch! Expected: {rocm_version}, actual: {args.exp_rocm}" - ) - self.result.status = ExecutionStatus.ERROR - self._log_event( - category=EventCategory.SW_DRIVER, - description=f"{self.result.message}", - data={"expected": args.exp_rocm, "actual": data.rocm_version}, - priority=EventPriority.CRITICAL, - console_log=True, - ) return self.result diff --git a/test/unit/plugin/test_rocm_analyzer.py b/test/unit/plugin/test_rocm_analyzer.py index 18afed9d..9ecc7fb4 100644 --- a/test/unit/plugin/test_rocm_analyzer.py +++ b/test/unit/plugin/test_rocm_analyzer.py @@ -42,7 +42,7 @@ def analyzer(system_info): @pytest.fixture def model_obj(): - return RocmDataModel(rocm_version="6.2.0-66") + return RocmDataModel(rocm_version="6.2.0-66", rocm_latest_versioned_path="/opt/rocm-7.1.0") @pytest.fixture @@ -50,14 +50,16 @@ def config(): return { "rocm_version": ["6.2.0-66"], "invalid": "invalid", + "rocm_latest": "/opt/rocm-7.1.0", } def test_all_good_data(analyzer, model_obj, config): - args = RocmAnalyzerArgs(exp_rocm=config["rocm_version"]) + args = RocmAnalyzerArgs(exp_rocm=config["rocm_version"], exp_rocm_latest=config["rocm_latest"]) result = analyzer.analyze_data(model_obj, args) assert result.status == ExecutionStatus.OK - assert result.message == "ROCm version matches expected" + assert "ROCm version matches expected" in result.message + assert "ROCm latest path validated" in result.message assert all( event.priority not in {EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL} for event in result.events @@ -94,3 +96,16 @@ def test_unexpected_rocm_version(analyzer, model_obj): def test_invalid_user_config(analyzer, model_obj, config): result = analyzer.analyze_data(model_obj, None) assert result.status == ExecutionStatus.NOT_RAN + + +def test_rocm_latest_path_mismatch(analyzer, model_obj): + """Test that rocm_latest path mismatch is detected and logged""" + args = RocmAnalyzerArgs(exp_rocm=["6.2.0-66"], exp_rocm_latest="/opt/rocm-6.2.0") + result = analyzer.analyze_data(model_obj, args) + assert result.status == ExecutionStatus.ERROR + assert "ROCm latest path mismatch" in result.message + assert "/opt/rocm-6.2.0" in result.message # expected + assert "/opt/rocm-7.1.0" in result.message # actual + for event in result.events: + assert event.priority == EventPriority.CRITICAL + assert event.category == EventCategory.SW_DRIVER.value diff --git a/test/unit/plugin/test_rocm_collector.py b/test/unit/plugin/test_rocm_collector.py index 60e63f28..9be93f1a 100644 --- a/test/unit/plugin/test_rocm_collector.py +++ b/test/unit/plugin/test_rocm_collector.py @@ -23,68 +23,354 @@ # SOFTWARE. # ############################################################################### -import copy +from unittest.mock import MagicMock import pytest from nodescraper.enums.eventcategory import EventCategory -from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.executionstatus import ExecutionStatus -from nodescraper.plugins.inband.rocm.analyzer_args import RocmAnalyzerArgs -from nodescraper.plugins.inband.rocm.rocm_analyzer import RocmAnalyzer -from nodescraper.plugins.inband.rocm.rocmdata import RocmDataModel +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.plugins.inband.rocm.rocm_collector import RocmCollector @pytest.fixture -def model_obj(): - return RocmDataModel(rocm_version="6.2.0-66") +def collector(system_info, conn_mock): + return RocmCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) -@pytest.fixture -def analyzer(system_info): - return RocmAnalyzer(system_info=system_info) +def test_collect_rocm_version_success(collector): + """Test successful collection of ROCm version from version-rocm file""" + collector._run_sut_cmd = MagicMock( + return_value=MagicMock( + exit_code=0, + stdout="6.2.0-66", + command="grep . /opt/rocm/.info/version-rocm", + ) + ) + result, data = collector.collect_data() -def test_all_good_data(analyzer, model_obj): - args = RocmAnalyzerArgs(exp_rocm=["6.2.0-66"]) - result = analyzer.analyze_data(model_obj, args=args) assert result.status == ExecutionStatus.OK - assert result.message == "ROCm version matches expected" - assert all( - event.priority not in [EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL] - for event in result.events + assert data is not None + assert data.rocm_version == "6.2.0-66" + assert "ROCm version: 6.2.0-66" in result.message + + +def test_collect_rocm_version_fallback(collector): + """Test fallback to version file when version-rocm fails""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock(exit_code=1, stdout="", command="grep . /opt/rocm/.info/version-rocm"), + MagicMock(exit_code=0, stdout="6.2.0-66", command="grep . /opt/rocm/.info/version"), + ] ) + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_version == "6.2.0-66" -def test_no_config_data(analyzer, model_obj): - result = analyzer.analyze_data(model_obj) - assert result.status == ExecutionStatus.NOT_RAN +def test_collect_rocm_version_not_found(collector): + """Test when ROCm version cannot be found""" + collector._run_sut_cmd = MagicMock( + return_value=MagicMock( + exit_code=1, + stdout="", + stderr="No such file or directory", + command="grep . /opt/rocm/.info/version-rocm", + ) + ) -def test_invalid_rocm_version(analyzer, model_obj): - modified_model = copy.deepcopy(model_obj) - modified_model.rocm_version = "some_invalid_version" - args = RocmAnalyzerArgs(exp_rocm=["6.2.0-66"]) - result = analyzer.analyze_data(modified_model, args=args) + result, data = collector.collect_data() assert result.status == ExecutionStatus.ERROR - assert "ROCm version mismatch!" in result.message - for event in result.events: - assert event.priority == EventPriority.CRITICAL - assert event.category == EventCategory.SW_DRIVER.value + assert data is None + assert "ROCm version not found" in result.message + assert any(event.category == EventCategory.OS.value for event in result.events) -def test_unexpected_rocm_version(analyzer, model_obj): - args = RocmAnalyzerArgs(exp_rocm=["9.8.7-65", "1.2.3-45"]) - result = analyzer.analyze_data(model_obj, args=args) +def test_collect_all_rocm_data(collector): + """Test collection of all ROCm data including tech support commands""" + # Mock all command outputs in sequence + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # Latest versioned path + MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), + # All ROCm paths + MagicMock(exit_code=0, stdout="/opt/rocm\n/opt/rocm-6.2.0\n/opt/rocm-7.1.0"), + # rocminfo output + MagicMock( + exit_code=0, + stdout="ROCk module is loaded\nAgent 1\n Name: AMD Instinct MI300X\n Marketing Name: MI300X", + ), + # ld.so.conf entries + MagicMock( + exit_code=0, + stdout="/etc/ld.so.conf.d/rocm.conf:/opt/rocm/lib\n/etc/ld.so.conf.d/rocm.conf:/opt/rocm/lib64", + ), + # ROCm libraries from ldconfig + MagicMock( + exit_code=0, + stdout="librocm_smi64.so.7 (libc6,x86-64) => /opt/rocm/lib/librocm_smi64.so.7\nlibhsa-runtime64.so.1 (libc6,x86-64) => /opt/rocm/lib/libhsa-runtime64.so.1", + ), + # Environment variables + MagicMock( + exit_code=0, + stdout="ROCM_PATH=/opt/rocm\nHIP_VISIBLE_DEVICES=0,1\nHSA_OVERRIDE_GFX_VERSION=11.0.0", + ), + # clinfo output + MagicMock( + exit_code=0, + stdout="Platform #0: AMD Accelerated Parallel Processing\n Device #0: gfx942:sramecc+:xnack-", + ), + # KFD process list + MagicMock(exit_code=0, stdout="1234\n5678"), + ] + ) - assert result.status == ExecutionStatus.ERROR - assert "ROCm version mismatch!" in result.message - for event in result.events: - assert event.priority == EventPriority.CRITICAL - assert event.category == EventCategory.SW_DRIVER.value + result, data = collector.collect_data() + + # Verify result status + assert result.status == ExecutionStatus.OK + assert data is not None + + # Verify ROCm version + assert data.rocm_version == "6.2.0-66" + + # Verify ROCm latest path + assert data.rocm_latest_versioned_path == "/opt/rocm-7.1.0" + + # Verify all ROCm paths + assert data.rocm_all_paths == ["/opt/rocm", "/opt/rocm-6.2.0", "/opt/rocm-7.1.0"] + + # Verify rocminfo output + assert len(data.rocminfo) == 4 + assert "ROCk module is loaded" in data.rocminfo[0] + assert "AMD Instinct MI300X" in data.rocminfo[2] + + # Verify ld.so.conf entries + assert len(data.ld_conf_rocm) == 2 + assert "/etc/ld.so.conf.d/rocm.conf:/opt/rocm/lib" in data.ld_conf_rocm + + # Verify ROCm libraries + assert len(data.rocm_libs) == 2 + assert any("librocm_smi64" in lib for lib in data.rocm_libs) + assert any("libhsa-runtime64" in lib for lib in data.rocm_libs) + + # Verify environment variables + assert len(data.env_vars) == 3 + assert "ROCM_PATH=/opt/rocm" in data.env_vars + assert "HIP_VISIBLE_DEVICES=0,1" in data.env_vars + + # Verify clinfo output + assert len(data.clinfo) == 2 + assert "AMD Accelerated Parallel Processing" in data.clinfo[0] + + # Verify KFD process list + assert len(data.kfd_proc) == 2 + assert "1234" in data.kfd_proc + assert "5678" in data.kfd_proc + + # Verify artifact was created + assert len(result.artifacts) == 1 + assert result.artifacts[0].filename == "rocminfo.log" + assert "ROCMNFO OUTPUT" in result.artifacts[0].contents + assert "CLINFO OUTPUT" in result.artifacts[0].contents + + +def test_collect_with_ansi_codes_stripped(collector): + """Test that ANSI escape codes are stripped from rocminfo and clinfo""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # Latest versioned path + MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), + # All ROCm paths + MagicMock(exit_code=0, stdout="/opt/rocm"), + # rocminfo with ANSI codes + MagicMock(exit_code=0, stdout="\x1b[32mAgent 1\x1b[0m\n\x1b[33mName: MI300X\x1b[0m"), + # Other commands return empty + MagicMock(exit_code=1, stdout=""), # ld.so.conf + MagicMock(exit_code=1, stdout=""), # rocm_libs + MagicMock(exit_code=1, stdout=""), # env_vars + # clinfo with ANSI codes + MagicMock(exit_code=0, stdout="\x1b[1mPlatform #0\x1b[0m: AMD"), + MagicMock(exit_code=1, stdout=""), # kfd_proc + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + + # Verify ANSI codes are stripped from rocminfo + assert len(data.rocminfo) == 2 + assert "\x1b" not in data.rocminfo[0] + assert "\x1b" not in data.rocminfo[1] + assert "Agent 1" in data.rocminfo[0] + assert "Name: MI300X" in data.rocminfo[1] + # Verify ANSI codes are stripped from clinfo + assert len(data.clinfo) == 1 + assert "\x1b" not in data.clinfo[0] + assert "Platform #0: AMD" in data.clinfo[0] + + +def test_collect_with_clinfo_failure(collector): + """Test that clinfo failure is handled gracefully and captured in artifact""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # Latest versioned path + MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), + # All ROCm paths + MagicMock(exit_code=0, stdout="/opt/rocm"), + # rocminfo success + MagicMock(exit_code=0, stdout="ROCk module loaded"), + # Other commands + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + # clinfo failure + MagicMock( + exit_code=1, + stdout="", + stderr="clinfo: command not found", + command="/opt/rocm-7.1.0/opencl/bin/*/clinfo", + ), + # kfd_proc + MagicMock(exit_code=0, stdout=""), + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data.clinfo == [] + + # Verify artifact contains error information + assert len(result.artifacts) == 1 + artifact_content = result.artifacts[0].contents + assert "CLINFO OUTPUT" in artifact_content + assert "Exit Code: 1" in artifact_content + assert "clinfo: command not found" in artifact_content + + +def test_collect_minimal_data(collector): + """Test collection when only version is available""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # All subsequent commands fail + MagicMock(exit_code=1, stdout=""), # latest path + MagicMock(exit_code=1, stdout=""), # all paths + MagicMock(exit_code=1, stdout=""), # rocminfo + MagicMock(exit_code=1, stdout=""), # ld.so.conf + MagicMock(exit_code=1, stdout=""), # rocm_libs + MagicMock(exit_code=1, stdout=""), # env_vars + MagicMock(exit_code=1, stdout=""), # clinfo + MagicMock(exit_code=1, stdout=""), # kfd_proc + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_version == "6.2.0-66" + + # Verify optional fields have default values + assert data.rocm_latest_versioned_path == "" + assert data.rocm_all_paths == [] + assert data.rocminfo == [] + assert data.ld_conf_rocm == [] + assert data.rocm_libs == [] + assert data.env_vars == [] + assert data.clinfo == [] + assert data.kfd_proc == [] + + +def test_collect_empty_list_handling(collector): + """Test that empty stdout results in empty lists, not lists with empty strings""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + # ROCm version + MagicMock(exit_code=0, stdout="6.2.0-66"), + # Latest versioned path (empty) + MagicMock(exit_code=0, stdout=""), + # All ROCm paths (empty) + MagicMock(exit_code=0, stdout=""), + # rocminfo (empty) + MagicMock(exit_code=0, stdout=""), + # Other commands return empty strings + MagicMock(exit_code=0, stdout=""), + MagicMock(exit_code=0, stdout=""), + MagicMock(exit_code=0, stdout=""), + MagicMock(exit_code=0, stdout=""), + MagicMock(exit_code=0, stdout=""), + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + + # Verify empty lists don't contain empty strings + assert data.rocm_all_paths == [] + assert data.rocminfo == [] + assert data.ld_conf_rocm == [] + assert data.rocm_libs == [] + assert data.env_vars == [] + assert data.clinfo == [] + assert data.kfd_proc == [] + + +def test_rocm_path_determination(collector): + """Test that rocm_path is correctly determined for subsequent commands""" + # Test with latest versioned path available + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock(exit_code=0, stdout="6.2.0-66"), + MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), + MagicMock(exit_code=0, stdout="/opt/rocm"), + MagicMock(exit_code=0, stdout="rocminfo output"), + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=0, stdout="clinfo output"), + MagicMock(exit_code=1, stdout=""), + ] + ) + + result, data = collector.collect_data() + + # The collector should use /opt/rocm-7.1.0 as rocm_path + # We can verify this by checking that rocminfo and clinfo were successfully collected + assert data.rocm_latest_versioned_path == "/opt/rocm-7.1.0" + assert len(data.rocminfo) > 0 + assert len(data.clinfo) > 0 + + +def test_invalid_rocm_version_format(collector): + """Test that invalid ROCm version format raises validation error""" + collector._run_sut_cmd = MagicMock( + return_value=MagicMock( + exit_code=0, + stdout="invalid_version_format", + ) + ) -def test_invalid_user_config(analyzer, model_obj): - result = analyzer.analyze_data(model_obj, None) - assert result.status == ExecutionStatus.NOT_RAN + # Should raise ValueError due to pydantic validation + with pytest.raises(ValueError, match="ROCm version has invalid format"): + result, data = collector.collect_data() From 688deeecc22a607d85ef12391eeea4b707ef6962 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Thu, 27 Nov 2025 11:35:03 -0600 Subject: [PATCH 7/8] addressed review comments --- .../plugins/inband/rocm/rocm_analyzer.py | 2 +- .../plugins/inband/rocm/rocm_collector.py | 80 ++++----- nodescraper/utils.py | 15 ++ test/unit/plugin/test_rocm_collector.py | 156 ++++-------------- 4 files changed, 93 insertions(+), 160 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocm_analyzer.py b/nodescraper/plugins/inband/rocm/rocm_analyzer.py index e9a344f6..1131d665 100644 --- a/nodescraper/plugins/inband/rocm/rocm_analyzer.py +++ b/nodescraper/plugins/inband/rocm/rocm_analyzer.py @@ -77,7 +77,7 @@ def analyze_data( ) return self.result - # validate rocm_latest if provided in args (only if version check passed) + # validate rocm_latest if provided in args if args.exp_rocm_latest: if data.rocm_latest_versioned_path != args.exp_rocm_latest: self.result.message = f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}" diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index 0b22b23c..27acd8d4 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -30,6 +30,17 @@ from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult +from nodescraper.utils import ( + CMD_CLINFO, + CMD_ENV_VARS, + CMD_KFD_PROC, + CMD_LD_CONF, + CMD_ROCM_DIRS, + CMD_ROCM_LATEST, + CMD_ROCM_LIBS, + CMD_ROCMINFO, + CMD_VERSION_PATHS, +) from .rocmdata import RocmDataModel @@ -40,18 +51,6 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} DATA_MODEL = RocmDataModel - CMD_VERSION_PATHS = [ - "/opt/rocm/.info/version-rocm", - "/opt/rocm/.info/version", - ] - CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" - CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" - CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" - CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*" - CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'" - CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" - CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo" - CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/" @staticmethod def _strip_ansi_codes(text: str) -> str: @@ -73,29 +72,36 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: Returns: tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available. """ - version_paths = [ - "/opt/rocm/.info/version-rocm", - "/opt/rocm/.info/version", - ] - rocm_data = None - for path in self.CMD_VERSION_PATHS: + for path in CMD_VERSION_PATHS: res = self._run_sut_cmd(f"grep . {path}") if res.exit_code == 0: - rocm_data = RocmDataModel(rocm_version=res.stdout) - self._log_event( - category="ROCM_VERSION_READ", - description="ROCm version data collected", - data=rocm_data.model_dump(include={"rocm_version"}), - priority=EventPriority.INFO, - ) - self.result.message = f"ROCm version: {rocm_data.rocm_version}" - self.result.status = ExecutionStatus.OK - break + try: + rocm_data = RocmDataModel(rocm_version=res.stdout) + self._log_event( + category="ROCM_VERSION_READ", + description="ROCm version data collected", + data=rocm_data.model_dump(include={"rocm_version"}), + priority=EventPriority.INFO, + ) + self.result.message = f"ROCm version: {rocm_data.rocm_version}" + self.result.status = ExecutionStatus.OK + break + except ValueError as e: + self._log_event( + category=EventCategory.OS, + description=f"Invalid ROCm version format: {res.stdout}", + data={"version": res.stdout, "error": str(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + self.result.message = f"Invalid ROCm version format: {res.stdout}" + self.result.status = ExecutionStatus.ERROR + return self.result, None else: self._log_event( category=EventCategory.OS, - description=f"Unable to read ROCm version from {version_paths}", + description=f"Unable to read ROCm version from {CMD_VERSION_PATHS}", data={"raw_output": res.stdout}, priority=EventPriority.ERROR, ) @@ -103,12 +109,12 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: # Collect additional ROCm data if version was found if rocm_data: # Collect latest versioned ROCm path (rocm-[3-7]*) - versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST) + versioned_path_res = self._run_sut_cmd(CMD_ROCM_LATEST) if versioned_path_res.exit_code == 0: rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip() # Collect all ROCm paths as list - all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS) + all_paths_res = self._run_sut_cmd(CMD_ROCM_DIRS) if all_paths_res.exit_code == 0: rocm_data.rocm_all_paths = [ path.strip() @@ -120,7 +126,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm" # Collect rocminfo output as list of lines with ANSI codes stripped - rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path) + rocminfo_cmd = CMD_ROCMINFO.format(rocm_path=rocm_path) rocminfo_res = self._run_sut_cmd(rocminfo_cmd) rocminfo_artifact_content = "" if rocminfo_res.exit_code == 0: @@ -134,14 +140,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: rocminfo_artifact_content += rocminfo_res.stdout # Collect ld.so.conf ROCm entries - ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF) + ld_conf_res = self._run_sut_cmd(CMD_LD_CONF) if ld_conf_res.exit_code == 0: rocm_data.ld_conf_rocm = [ line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip() ] # Collect ROCm libraries from ldconfig - rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS) + rocm_libs_res = self._run_sut_cmd(CMD_ROCM_LIBS) if rocm_libs_res.exit_code == 0: rocm_data.rocm_libs = [ line.strip() @@ -150,14 +156,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: ] # Collect ROCm-related environment variables - env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS) + env_vars_res = self._run_sut_cmd(CMD_ENV_VARS) if env_vars_res.exit_code == 0: rocm_data.env_vars = [ line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip() ] # Collect clinfo output - clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path) + clinfo_cmd = CMD_CLINFO.format(rocm_path=rocm_path) clinfo_res = self._run_sut_cmd(clinfo_cmd) # Always append clinfo section to artifact, even if empty or failed @@ -188,7 +194,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: ) # Collect KFD process list - kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC) + kfd_proc_res = self._run_sut_cmd(CMD_KFD_PROC) if kfd_proc_res.exit_code == 0: rocm_data.kfd_proc = [ proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip() diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 9b1fb88c..3c26984d 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -201,3 +201,18 @@ def nice_rotated_name(path: str, stem: str, prefix: str = "rotated_") -> str: middle = base[:-3] if base.endswith(".gz") else base return f"{prefix}{middle}.log" + + +# ROCm Plugin Command Constants +CMD_VERSION_PATHS = [ + "/opt/rocm/.info/version-rocm", + "/opt/rocm/.info/version", +] +CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" +CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" +CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" +CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*" +CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'" +CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" +CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo" +CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/" diff --git a/test/unit/plugin/test_rocm_collector.py b/test/unit/plugin/test_rocm_collector.py index 9be93f1a..2b419ad1 100644 --- a/test/unit/plugin/test_rocm_collector.py +++ b/test/unit/plugin/test_rocm_collector.py @@ -66,6 +66,15 @@ def test_collect_rocm_version_fallback(collector): side_effect=[ MagicMock(exit_code=1, stdout="", command="grep . /opt/rocm/.info/version-rocm"), MagicMock(exit_code=0, stdout="6.2.0-66", command="grep . /opt/rocm/.info/version"), + # Additional commands after finding version + MagicMock(exit_code=1, stdout=""), # latest path + MagicMock(exit_code=1, stdout=""), # all paths + MagicMock(exit_code=1, stdout=""), # rocminfo + MagicMock(exit_code=1, stdout=""), # ld.so.conf + MagicMock(exit_code=1, stdout=""), # rocm_libs + MagicMock(exit_code=1, stdout=""), # env_vars + MagicMock(exit_code=1, stdout=""), # clinfo + MagicMock(exit_code=1, stdout=""), # kfd_proc ] ) @@ -103,18 +112,18 @@ def test_collect_all_rocm_data(collector): # ROCm version MagicMock(exit_code=0, stdout="6.2.0-66"), # Latest versioned path - MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), + MagicMock(exit_code=0, stdout="/opt/rocm-1.1.0"), # All ROCm paths - MagicMock(exit_code=0, stdout="/opt/rocm\n/opt/rocm-6.2.0\n/opt/rocm-7.1.0"), + MagicMock(exit_code=0, stdout="/opt/rocm\n/opt/rocm-1.2.3\n/opt/rocm-5.6.0"), # rocminfo output MagicMock( exit_code=0, - stdout="ROCk module is loaded\nAgent 1\n Name: AMD Instinct MI300X\n Marketing Name: MI300X", + stdout="ROCk module is loaded\nAgent 1\n Name: AMD Instinct MI1234XYZ\n Marketing Name: MI1234XYZ", ), # ld.so.conf entries MagicMock( exit_code=0, - stdout="/etc/ld.so.conf.d/rocm.conf:/opt/rocm/lib\n/etc/ld.so.conf.d/rocm.conf:/opt/rocm/lib64", + stdout="/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib\n/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib64", ), # ROCm libraries from ldconfig MagicMock( @@ -124,12 +133,12 @@ def test_collect_all_rocm_data(collector): # Environment variables MagicMock( exit_code=0, - stdout="ROCM_PATH=/opt/rocm\nHIP_VISIBLE_DEVICES=0,1\nHSA_OVERRIDE_GFX_VERSION=11.0.0", + stdout="ROCM_PATH=/opt/rocm\nSLURM_MPI_TYPE=pmi2\n__LMOD_REF_COUNT_MODULEPATH=/share/contrib-modules/.mfiles/Core:1\nMODULEPATH=/share/contrib-modules/", ), # clinfo output MagicMock( exit_code=0, - stdout="Platform #0: AMD Accelerated Parallel Processing\n Device #0: gfx942:sramecc+:xnack-", + stdout="Number of platforms: 1\nPlatform Name: AMD Accelerated Parallel Processing\nPlatform Vendor: Advanced Micro Devices, Inc.\nPlatform Version: OpenCL 2.0 AMD-APP (XXXX.X)\nPlatform Profile: FULL_PROFILE\nPlatform Extensions: cl_khr_icd cl_khr_il_program", ), # KFD process list MagicMock(exit_code=0, stdout="1234\n5678"), @@ -146,19 +155,20 @@ def test_collect_all_rocm_data(collector): assert data.rocm_version == "6.2.0-66" # Verify ROCm latest path - assert data.rocm_latest_versioned_path == "/opt/rocm-7.1.0" + assert data.rocm_latest_versioned_path == "/opt/rocm-1.1.0" # Verify all ROCm paths - assert data.rocm_all_paths == ["/opt/rocm", "/opt/rocm-6.2.0", "/opt/rocm-7.1.0"] + assert data.rocm_all_paths == ["/opt/rocm", "/opt/rocm-1.2.3", "/opt/rocm-5.6.0"] # Verify rocminfo output assert len(data.rocminfo) == 4 assert "ROCk module is loaded" in data.rocminfo[0] - assert "AMD Instinct MI300X" in data.rocminfo[2] + assert "AMD Instinct MI1234XYZ" in data.rocminfo[2] # Verify ld.so.conf entries assert len(data.ld_conf_rocm) == 2 - assert "/etc/ld.so.conf.d/rocm.conf:/opt/rocm/lib" in data.ld_conf_rocm + assert "/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib" in data.ld_conf_rocm + assert "/etc/ld.so.conf.d/10-rocm-opencl.conf:/opt/rocm-7.0.0/lib64" in data.ld_conf_rocm # Verify ROCm libraries assert len(data.rocm_libs) == 2 @@ -166,13 +176,13 @@ def test_collect_all_rocm_data(collector): assert any("libhsa-runtime64" in lib for lib in data.rocm_libs) # Verify environment variables - assert len(data.env_vars) == 3 + assert len(data.env_vars) == 4 assert "ROCM_PATH=/opt/rocm" in data.env_vars - assert "HIP_VISIBLE_DEVICES=0,1" in data.env_vars + assert "MODULEPATH=/share/contrib-modules/" in data.env_vars # Verify clinfo output - assert len(data.clinfo) == 2 - assert "AMD Accelerated Parallel Processing" in data.clinfo[0] + assert len(data.clinfo) == 6 + assert "AMD Accelerated Parallel Processing" in data.clinfo[1] # Verify KFD process list assert len(data.kfd_proc) == 2 @@ -186,45 +196,6 @@ def test_collect_all_rocm_data(collector): assert "CLINFO OUTPUT" in result.artifacts[0].contents -def test_collect_with_ansi_codes_stripped(collector): - """Test that ANSI escape codes are stripped from rocminfo and clinfo""" - collector._run_sut_cmd = MagicMock( - side_effect=[ - # ROCm version - MagicMock(exit_code=0, stdout="6.2.0-66"), - # Latest versioned path - MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), - # All ROCm paths - MagicMock(exit_code=0, stdout="/opt/rocm"), - # rocminfo with ANSI codes - MagicMock(exit_code=0, stdout="\x1b[32mAgent 1\x1b[0m\n\x1b[33mName: MI300X\x1b[0m"), - # Other commands return empty - MagicMock(exit_code=1, stdout=""), # ld.so.conf - MagicMock(exit_code=1, stdout=""), # rocm_libs - MagicMock(exit_code=1, stdout=""), # env_vars - # clinfo with ANSI codes - MagicMock(exit_code=0, stdout="\x1b[1mPlatform #0\x1b[0m: AMD"), - MagicMock(exit_code=1, stdout=""), # kfd_proc - ] - ) - - result, data = collector.collect_data() - - assert result.status == ExecutionStatus.OK - - # Verify ANSI codes are stripped from rocminfo - assert len(data.rocminfo) == 2 - assert "\x1b" not in data.rocminfo[0] - assert "\x1b" not in data.rocminfo[1] - assert "Agent 1" in data.rocminfo[0] - assert "Name: MI300X" in data.rocminfo[1] - - # Verify ANSI codes are stripped from clinfo - assert len(data.clinfo) == 1 - assert "\x1b" not in data.clinfo[0] - assert "Platform #0: AMD" in data.clinfo[0] - - def test_collect_with_clinfo_failure(collector): """Test that clinfo failure is handled gracefully and captured in artifact""" collector._run_sut_cmd = MagicMock( @@ -243,9 +214,9 @@ def test_collect_with_clinfo_failure(collector): MagicMock(exit_code=1, stdout=""), # clinfo failure MagicMock( - exit_code=1, + exit_code=127, stdout="", - stderr="clinfo: command not found", + stderr="No such file or directory", command="/opt/rocm-7.1.0/opencl/bin/*/clinfo", ), # kfd_proc @@ -262,8 +233,8 @@ def test_collect_with_clinfo_failure(collector): assert len(result.artifacts) == 1 artifact_content = result.artifacts[0].contents assert "CLINFO OUTPUT" in artifact_content - assert "Exit Code: 1" in artifact_content - assert "clinfo: command not found" in artifact_content + assert "Exit Code: 127" in artifact_content + assert "No such file or directory" in artifact_content def test_collect_minimal_data(collector): @@ -301,69 +272,8 @@ def test_collect_minimal_data(collector): assert data.kfd_proc == [] -def test_collect_empty_list_handling(collector): - """Test that empty stdout results in empty lists, not lists with empty strings""" - collector._run_sut_cmd = MagicMock( - side_effect=[ - # ROCm version - MagicMock(exit_code=0, stdout="6.2.0-66"), - # Latest versioned path (empty) - MagicMock(exit_code=0, stdout=""), - # All ROCm paths (empty) - MagicMock(exit_code=0, stdout=""), - # rocminfo (empty) - MagicMock(exit_code=0, stdout=""), - # Other commands return empty strings - MagicMock(exit_code=0, stdout=""), - MagicMock(exit_code=0, stdout=""), - MagicMock(exit_code=0, stdout=""), - MagicMock(exit_code=0, stdout=""), - MagicMock(exit_code=0, stdout=""), - ] - ) - - result, data = collector.collect_data() - - assert result.status == ExecutionStatus.OK - - # Verify empty lists don't contain empty strings - assert data.rocm_all_paths == [] - assert data.rocminfo == [] - assert data.ld_conf_rocm == [] - assert data.rocm_libs == [] - assert data.env_vars == [] - assert data.clinfo == [] - assert data.kfd_proc == [] - - -def test_rocm_path_determination(collector): - """Test that rocm_path is correctly determined for subsequent commands""" - # Test with latest versioned path available - collector._run_sut_cmd = MagicMock( - side_effect=[ - MagicMock(exit_code=0, stdout="6.2.0-66"), - MagicMock(exit_code=0, stdout="/opt/rocm-7.1.0"), - MagicMock(exit_code=0, stdout="/opt/rocm"), - MagicMock(exit_code=0, stdout="rocminfo output"), - MagicMock(exit_code=1, stdout=""), - MagicMock(exit_code=1, stdout=""), - MagicMock(exit_code=1, stdout=""), - MagicMock(exit_code=0, stdout="clinfo output"), - MagicMock(exit_code=1, stdout=""), - ] - ) - - result, data = collector.collect_data() - - # The collector should use /opt/rocm-7.1.0 as rocm_path - # We can verify this by checking that rocminfo and clinfo were successfully collected - assert data.rocm_latest_versioned_path == "/opt/rocm-7.1.0" - assert len(data.rocminfo) > 0 - assert len(data.clinfo) > 0 - - def test_invalid_rocm_version_format(collector): - """Test that invalid ROCm version format raises validation error""" + """Test that invalid ROCm version format is handled gracefully""" collector._run_sut_cmd = MagicMock( return_value=MagicMock( exit_code=0, @@ -371,6 +281,8 @@ def test_invalid_rocm_version_format(collector): ) ) - # Should raise ValueError due to pydantic validation - with pytest.raises(ValueError, match="ROCm version has invalid format"): - result, data = collector.collect_data() + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.ERROR + assert data is None + assert len(result.events) >= 1 From 748f3a53a766cf3022d1ab54f5d7050b24a3ebc4 Mon Sep 17 00:00:00 2001 From: Jaspal Singh Date: Mon, 1 Dec 2025 09:44:41 -0600 Subject: [PATCH 8/8] cmd fixed variables and utils --- .../plugins/inband/rocm/rocm_collector.py | 63 ++++++++----------- nodescraper/utils.py | 25 ++++---- 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index 27acd8d4..f7692e45 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -23,24 +23,13 @@ # SOFTWARE. # ############################################################################### -import re from typing import Optional from nodescraper.base import InBandDataCollector from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult -from nodescraper.utils import ( - CMD_CLINFO, - CMD_ENV_VARS, - CMD_KFD_PROC, - CMD_LD_CONF, - CMD_ROCM_DIRS, - CMD_ROCM_LATEST, - CMD_ROCM_LIBS, - CMD_ROCMINFO, - CMD_VERSION_PATHS, -) +from nodescraper.utils import strip_ansi_codes from .rocmdata import RocmDataModel @@ -51,20 +40,18 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} DATA_MODEL = RocmDataModel - - @staticmethod - def _strip_ansi_codes(text: str) -> str: - """ - Remove ANSI escape codes from text. - - Args: - text (str): The text string containing ANSI escape codes. - - Returns: - str: The text with ANSI escape codes removed. - """ - ansi_escape = re.compile(r"\x1b\[[0-9;]*m") - return ansi_escape.sub("", text) + CMD_VERSION_PATHS = [ + "/opt/rocm/.info/version-rocm", + "/opt/rocm/.info/version", + ] + CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" + CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" + CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" + CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*" + CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'" + CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" + CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo" + CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/" def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: """Collect ROCm version data from the system. @@ -73,7 +60,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available. """ rocm_data = None - for path in CMD_VERSION_PATHS: + for path in self.CMD_VERSION_PATHS: res = self._run_sut_cmd(f"grep . {path}") if res.exit_code == 0: try: @@ -101,7 +88,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: else: self._log_event( category=EventCategory.OS, - description=f"Unable to read ROCm version from {CMD_VERSION_PATHS}", + description=f"Unable to read ROCm version from {self.CMD_VERSION_PATHS}", data={"raw_output": res.stdout}, priority=EventPriority.ERROR, ) @@ -109,12 +96,12 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: # Collect additional ROCm data if version was found if rocm_data: # Collect latest versioned ROCm path (rocm-[3-7]*) - versioned_path_res = self._run_sut_cmd(CMD_ROCM_LATEST) + versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST) if versioned_path_res.exit_code == 0: rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip() # Collect all ROCm paths as list - all_paths_res = self._run_sut_cmd(CMD_ROCM_DIRS) + all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS) if all_paths_res.exit_code == 0: rocm_data.rocm_all_paths = [ path.strip() @@ -126,13 +113,13 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm" # Collect rocminfo output as list of lines with ANSI codes stripped - rocminfo_cmd = CMD_ROCMINFO.format(rocm_path=rocm_path) + rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path) rocminfo_res = self._run_sut_cmd(rocminfo_cmd) rocminfo_artifact_content = "" if rocminfo_res.exit_code == 0: # Split into lines and strip ANSI codes from each line rocm_data.rocminfo = [ - self._strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n") + strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n") ] rocminfo_artifact_content += "=" * 80 + "\n" rocminfo_artifact_content += "ROCMNFO OUTPUT\n" @@ -140,14 +127,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: rocminfo_artifact_content += rocminfo_res.stdout # Collect ld.so.conf ROCm entries - ld_conf_res = self._run_sut_cmd(CMD_LD_CONF) + ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF) if ld_conf_res.exit_code == 0: rocm_data.ld_conf_rocm = [ line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip() ] # Collect ROCm libraries from ldconfig - rocm_libs_res = self._run_sut_cmd(CMD_ROCM_LIBS) + rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS) if rocm_libs_res.exit_code == 0: rocm_data.rocm_libs = [ line.strip() @@ -156,14 +143,14 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: ] # Collect ROCm-related environment variables - env_vars_res = self._run_sut_cmd(CMD_ENV_VARS) + env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS) if env_vars_res.exit_code == 0: rocm_data.env_vars = [ line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip() ] # Collect clinfo output - clinfo_cmd = CMD_CLINFO.format(rocm_path=rocm_path) + clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path) clinfo_res = self._run_sut_cmd(clinfo_cmd) # Always append clinfo section to artifact, even if empty or failed @@ -175,7 +162,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: if clinfo_res.exit_code == 0: rocm_data.clinfo = [ - self._strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n") + strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n") ] rocminfo_artifact_content += clinfo_res.stdout else: @@ -194,7 +181,7 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: ) # Collect KFD process list - kfd_proc_res = self._run_sut_cmd(CMD_KFD_PROC) + kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC) if kfd_proc_res.exit_code == 0: rocm_data.kfd_proc = [ proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip() diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 3c26984d..8e5461bb 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -203,16 +203,15 @@ def nice_rotated_name(path: str, stem: str, prefix: str = "rotated_") -> str: return f"{prefix}{middle}.log" -# ROCm Plugin Command Constants -CMD_VERSION_PATHS = [ - "/opt/rocm/.info/version-rocm", - "/opt/rocm/.info/version", -] -CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" -CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" -CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" -CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*" -CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'" -CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" -CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo" -CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/" +def strip_ansi_codes(text: str) -> str: + """ + Remove ANSI escape codes from text. + + Args: + text (str): The text string containing ANSI escape codes. + + Returns: + str: The text with ANSI escape codes removed. + """ + ansi_escape = re.compile(r"\x1b\[[0-9;]*m") + return ansi_escape.sub("", text)