Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nodescraper/plugins/inband/rocm/analyzer_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

class RocmAnalyzerArgs(AnalyzerArgs):
exp_rocm: Union[str, list] = Field(default_factory=list)
exp_rocm_latest: str = Field(default="")

@field_validator("exp_rocm", mode="before")
@classmethod
Expand Down
45 changes: 34 additions & 11 deletions nodescraper/plugins/inband/rocm/rocm_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,40 @@ def analyze_data(
if data.rocm_version == rocm_version:
self.result.message = "ROCm version matches expected"
self.result.status = ExecutionStatus.OK
break
else:
# No matching version found
self.result.message = (
f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}"
)
self.result.status = ExecutionStatus.ERROR
self._log_event(
category=EventCategory.SW_DRIVER,
description=f"{self.result.message}",
data={"expected": args.exp_rocm, "actual": data.rocm_version},
priority=EventPriority.CRITICAL,
console_log=True,
)
return self.result

# validate rocm_latest if provided in args
if args.exp_rocm_latest:
if data.rocm_latest_versioned_path != args.exp_rocm_latest:
self.result.message = f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}"
self.result.status = ExecutionStatus.ERROR
self._log_event(
category=EventCategory.SW_DRIVER,
description=f"{self.result.message}",
data={
"expected": args.exp_rocm_latest,
"actual": data.rocm_latest_versioned_path,
},
priority=EventPriority.CRITICAL,
console_log=True,
)
return self.result
else:
# Update message to include rocm_latest validation result
self.result.message = f"ROCm version matches expected. ROCm latest path validated: {data.rocm_latest_versioned_path}"

self.result.message = (
f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}"
)
self.result.status = ExecutionStatus.ERROR
self._log_event(
category=EventCategory.SW_DRIVER,
description=f"{self.result.message}",
data={"expected": args.exp_rocm, "actual": data.rocm_version},
priority=EventPriority.CRITICAL,
console_log=True,
)
return self.result
143 changes: 127 additions & 16 deletions nodescraper/plugins/inband/rocm/rocm_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
from typing import Optional

from nodescraper.base import InBandDataCollector
from nodescraper.connection.inband import TextFileArtifact
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
from nodescraper.models import TaskResult
from nodescraper.utils import strip_ansi_codes

from .rocmdata import RocmDataModel

Expand All @@ -42,40 +44,149 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]):
"/opt/rocm/.info/version-rocm",
"/opt/rocm/.info/version",
]
CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"

def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
"""Collect ROCm version data from the system.

Returns:
tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
"""
version_paths = [
"/opt/rocm/.info/version-rocm",
"/opt/rocm/.info/version",
]

rocm_data = None
for path in self.CMD_VERSION_PATHS:
res = self._run_sut_cmd(f"grep . {path}")
if res.exit_code == 0:
rocm_data = RocmDataModel(rocm_version=res.stdout)
self._log_event(
category="ROCM_VERSION_READ",
description="ROCm version data collected",
data=rocm_data.model_dump(),
priority=EventPriority.INFO,
)
self.result.message = f"ROCm: {rocm_data.model_dump()}"
self.result.status = ExecutionStatus.OK
break
try:
rocm_data = RocmDataModel(rocm_version=res.stdout)
self._log_event(
category="ROCM_VERSION_READ",
description="ROCm version data collected",
data=rocm_data.model_dump(include={"rocm_version"}),
priority=EventPriority.INFO,
)
self.result.message = f"ROCm version: {rocm_data.rocm_version}"
self.result.status = ExecutionStatus.OK
break
except ValueError as e:
self._log_event(
category=EventCategory.OS,
description=f"Invalid ROCm version format: {res.stdout}",
data={"version": res.stdout, "error": str(e)},
priority=EventPriority.ERROR,
console_log=True,
)
self.result.message = f"Invalid ROCm version format: {res.stdout}"
self.result.status = ExecutionStatus.ERROR
return self.result, None
else:
self._log_event(
category=EventCategory.OS,
description=f"Unable to read ROCm version from {version_paths}",
description=f"Unable to read ROCm version from {self.CMD_VERSION_PATHS}",
data={"raw_output": res.stdout},
priority=EventPriority.ERROR,
)

# Collect additional ROCm data if version was found
if rocm_data:
# Collect latest versioned ROCm path (rocm-[3-7]*)
versioned_path_res = self._run_sut_cmd(self.CMD_ROCM_LATEST)
if versioned_path_res.exit_code == 0:
rocm_data.rocm_latest_versioned_path = versioned_path_res.stdout.strip()

# Collect all ROCm paths as list
all_paths_res = self._run_sut_cmd(self.CMD_ROCM_DIRS)
if all_paths_res.exit_code == 0:
rocm_data.rocm_all_paths = [
path.strip()
for path in all_paths_res.stdout.strip().split("\n")
if path.strip()
]

# Determine ROCm path for commands that need it
rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm"

# Collect rocminfo output as list of lines with ANSI codes stripped
rocminfo_cmd = self.CMD_ROCMINFO.format(rocm_path=rocm_path)
rocminfo_res = self._run_sut_cmd(rocminfo_cmd)
rocminfo_artifact_content = ""
if rocminfo_res.exit_code == 0:
# Split into lines and strip ANSI codes from each line
rocm_data.rocminfo = [
strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n")
]
rocminfo_artifact_content += "=" * 80 + "\n"
rocminfo_artifact_content += "ROCMNFO OUTPUT\n"
rocminfo_artifact_content += "=" * 80 + "\n\n"
rocminfo_artifact_content += rocminfo_res.stdout

# Collect ld.so.conf ROCm entries
ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF)
if ld_conf_res.exit_code == 0:
rocm_data.ld_conf_rocm = [
line.strip() for line in ld_conf_res.stdout.strip().split("\n") if line.strip()
]

# Collect ROCm libraries from ldconfig
rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS)
if rocm_libs_res.exit_code == 0:
rocm_data.rocm_libs = [
line.strip()
for line in rocm_libs_res.stdout.strip().split("\n")
if line.strip()
]

# Collect ROCm-related environment variables
env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS)
if env_vars_res.exit_code == 0:
rocm_data.env_vars = [
line.strip() for line in env_vars_res.stdout.strip().split("\n") if line.strip()
]

# Collect clinfo output
clinfo_cmd = self.CMD_CLINFO.format(rocm_path=rocm_path)
clinfo_res = self._run_sut_cmd(clinfo_cmd)

# Always append clinfo section to artifact, even if empty or failed
if rocminfo_artifact_content:
rocminfo_artifact_content += "\n\n"
rocminfo_artifact_content += "=" * 80 + "\n"
rocminfo_artifact_content += "CLINFO OUTPUT\n"
rocminfo_artifact_content += "=" * 80 + "\n\n"

if clinfo_res.exit_code == 0:
rocm_data.clinfo = [
strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n")
]
rocminfo_artifact_content += clinfo_res.stdout
else:
# Add error information if clinfo failed
rocminfo_artifact_content += f"Command: {clinfo_res.command}\n"
rocminfo_artifact_content += f"Exit Code: {clinfo_res.exit_code}\n"
if clinfo_res.stderr:
rocminfo_artifact_content += f"Error: {clinfo_res.stderr}\n"
if clinfo_res.stdout:
rocminfo_artifact_content += f"Output: {clinfo_res.stdout}\n"

# Add combined rocminfo and clinfo output as a text file artifact
if rocminfo_artifact_content:
self.result.artifacts.append(
TextFileArtifact(filename="rocminfo.log", contents=rocminfo_artifact_content)
)

# Collect KFD process list
kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC)
if kfd_proc_res.exit_code == 0:
rocm_data.kfd_proc = [
proc.strip() for proc in kfd_proc_res.stdout.strip().split("\n") if proc.strip()
]

if not rocm_data:
self._log_event(
category=EventCategory.OS,
Expand Down
9 changes: 9 additions & 0 deletions nodescraper/plugins/inband/rocm/rocmdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#
###############################################################################
import re
from typing import List

from pydantic import field_validator

Expand All @@ -32,6 +33,14 @@

class RocmDataModel(DataModel):
rocm_version: str
rocminfo: List[str] = []
rocm_latest_versioned_path: str = ""
rocm_all_paths: List[str] = []
ld_conf_rocm: List[str] = []
rocm_libs: List[str] = []
env_vars: List[str] = []
clinfo: List[str] = []
kfd_proc: List[str] = []

@field_validator("rocm_version")
@classmethod
Expand Down
14 changes: 14 additions & 0 deletions nodescraper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,17 @@ def nice_rotated_name(path: str, stem: str, prefix: str = "rotated_") -> str:

middle = base[:-3] if base.endswith(".gz") else base
return f"{prefix}{middle}.log"


# Matches any ANSI CSI sequence: ESC "[", optional parameter bytes (0x30-0x3F),
# optional intermediate bytes (0x20-0x2F) and exactly one final byte (0x40-0x7E).
# This covers colour/SGR codes ("...m") as well as cursor movement, erase-line
# and private-mode sequences such as "\x1b[2K" or "\x1b[?25l".
# Compiled once at import time so per-line callers do not repeat the lookup.
_ANSI_CSI_PATTERN = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")


def strip_ansi_codes(text: str) -> str:
    """
    Remove ANSI CSI escape sequences from text.

    Every sequence the previous SGR-only pattern removed is still removed;
    non-colour CSI sequences (cursor control, erase, private modes) are now
    stripped as well instead of leaking into the cleaned output.

    Args:
        text (str): The text string possibly containing ANSI escape codes.

    Returns:
        str: The text with ANSI CSI escape sequences removed. Text without
            any escape sequence (including the empty string) is returned
            unchanged.
    """
    return _ANSI_CSI_PATTERN.sub("", text)
21 changes: 18 additions & 3 deletions test/unit/plugin/test_rocm_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,22 +42,24 @@ def analyzer(system_info):

@pytest.fixture
def model_obj():
    """Provide a ROCm data model with a version and a latest versioned path.

    Returns:
        RocmDataModel: model reporting version "6.2.0-66" and latest
            versioned path "/opt/rocm-7.1.0".
    """
    # A single return: a preceding version-only return made this construction
    # unreachable, leaving rocm_latest_versioned_path unset for tests that
    # compare it against an expected path.
    return RocmDataModel(rocm_version="6.2.0-66", rocm_latest_versioned_path="/opt/rocm-7.1.0")


@pytest.fixture
def config():
    """Supply the expected-value settings used by the analyzer tests.

    Returns:
        dict: the expected ROCm version list, an "invalid" placeholder entry
            and the expected latest versioned ROCm path.
    """
    expected_settings = {}
    expected_settings["rocm_version"] = ["6.2.0-66"]
    expected_settings["invalid"] = "invalid"
    expected_settings["rocm_latest"] = "/opt/rocm-7.1.0"
    return expected_settings


def test_all_good_data(analyzer, model_obj, config):
args = RocmAnalyzerArgs(exp_rocm=config["rocm_version"])
args = RocmAnalyzerArgs(exp_rocm=config["rocm_version"], exp_rocm_latest=config["rocm_latest"])
result = analyzer.analyze_data(model_obj, args)
assert result.status == ExecutionStatus.OK
assert result.message == "ROCm version matches expected"
assert "ROCm version matches expected" in result.message
assert "ROCm latest path validated" in result.message
assert all(
event.priority not in {EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL}
for event in result.events
Expand Down Expand Up @@ -94,3 +96,16 @@ def test_unexpected_rocm_version(analyzer, model_obj):
def test_invalid_user_config(analyzer, model_obj, config):
    """Check that analyzing with no analyzer args reports NOT_RAN."""
    # None stands in for a missing user configuration.
    outcome = analyzer.analyze_data(model_obj, None)
    observed_status = outcome.status
    assert observed_status == ExecutionStatus.NOT_RAN


def test_rocm_latest_path_mismatch(analyzer, model_obj):
    """Test that rocm_latest path mismatch is detected and logged"""
    # The version matches, so only the latest-path check can fail here.
    args = RocmAnalyzerArgs(exp_rocm=["6.2.0-66"], exp_rocm_latest="/opt/rocm-6.2.0")
    result = analyzer.analyze_data(model_obj, args)
    assert result.status == ExecutionStatus.ERROR
    assert "ROCm latest path mismatch" in result.message
    assert "/opt/rocm-6.2.0" in result.message  # expected
    assert "/opt/rocm-7.1.0" in result.message  # actual
    # Without this check the loop below passes vacuously when nothing is logged.
    # NOTE(review): assumes the analyzer's logged events are collected in
    # result.events -- confirm against the task base class.
    assert result.events
    for event in result.events:
        assert event.priority == EventPriority.CRITICAL
        assert event.category == EventCategory.SW_DRIVER.value
Loading