From f08d57477c7378c565ae40cc2a94aecf2662868a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 20 Aug 2025 14:52:30 -0500 Subject: [PATCH 01/38] admi_smi folder --- nodescraper/plugins/inband/amdsmi/__init__.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 nodescraper/plugins/inband/amdsmi/__init__.py diff --git a/nodescraper/plugins/inband/amdsmi/__init__.py b/nodescraper/plugins/inband/amdsmi/__init__.py new file mode 100644 index 00000000..2b18caf7 --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/__init__.py @@ -0,0 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .dmesg_plugin import AmdsmiPlugin + +__all__ = ["AmdsmiPlugin"] From 1eac292827a83c63e804bf7cecf44c0418a0c645 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 20 Aug 2025 15:49:47 -0500 Subject: [PATCH 02/38] need to fix collect_data call --- nodescraper/plugins/inband/amdsmi/__init__.py | 2 +- .../plugins/inband/amdsmi/amdsmi_collector.py | 566 ++++++++++ .../plugins/inband/amdsmi/amdsmi_plugin.py | 37 + .../plugins/inband/amdsmi/amdsmidata.py | 988 ++++++++++++++++++ nodescraper/utils.py | 46 +- 5 files changed, 1637 insertions(+), 2 deletions(-) create mode 100644 nodescraper/plugins/inband/amdsmi/amdsmi_collector.py create mode 100644 nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py create mode 100644 nodescraper/plugins/inband/amdsmi/amdsmidata.py diff --git a/nodescraper/plugins/inband/amdsmi/__init__.py b/nodescraper/plugins/inband/amdsmi/__init__.py index 2b18caf7..ec4a6f86 100644 --- a/nodescraper/plugins/inband/amdsmi/__init__.py +++ b/nodescraper/plugins/inband/amdsmi/__init__.py @@ -23,6 +23,6 @@ # SOFTWARE. # ############################################################################### -from .dmesg_plugin import AmdsmiPlugin +from .amdsmi_plugin import AmdsmiPlugin __all__ = ["AmdsmiPlugin"] diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py new file mode 100644 index 00000000..d31982ea --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -0,0 +1,566 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +import io +import json +import re +from typing import TypeVar + +from packaging.version import Version as PackageVersion +from pydantic import BaseModel, ValidationError + +from nodescraper.base.inbandcollectortask import InBandDataCollector +from nodescraper.connection.inband.inband import BaseFileArtifact, CommandArtifact +from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.models import TaskResult +from nodescraper.models.datamodel import FileModel +from nodescraper.plugins.inband.amdsmi.amdsmidata import ( + AmdSmiData, + AmdSmiListItem, + AmdSmiMetric, + AmdSmiStatic, + AmdSmiTstData, + AmdSmiVersion, + BadPages, + Fw, + Partition, + Processes, + Topo, + XgmiLinks, + XgmiMetrics, +) +from nodescraper.utils import get_exception_details, get_exception_traceback + +T = TypeVar("T", bound=BaseModel) + + +class AmdSmiCollector(InBandDataCollector[AmdSmiData, None]): + """class for collection of inband tool amd-smi data.""" + + AMD_SMI_EXE = "amd-smi" + + SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} + + DATA_MODEL = AmdSmiData + + def _check_amdsmi_installed(self) -> bool: + """Return if amd-smi is installed""" + + cmd_ret: CommandArtifact = self._run_system_command("which amd-smi") + return bool(cmd_ret.exit_code == 0 and "no amd-smi in" not in cmd_ret.stdout) + + def _check_command_supported(self, command: str) -> bool: + """Log an event if the command is missing""" + if command not in self.amd_smi_commands: + self._log_event( + category=EventCategory.APPLICATION, + description=f"amd-smi does not support command: `{command}`, it was not found in the help output", + priority=EventPriority.INFO, + ) + return False + return True + + def build_amdsmi_sub_data( + self, amd_smi_data_model: type[T], json_data: list[dict] | None + ) -> list[T] | T | None: + try: + if json_data is None: + self._log_event( + category=EventCategory.APPLICATION, + description="No data returned from amd-smi sub command", + priority=EventPriority.ERROR, + ) + return None + validated_data = [] + if isinstance(json_data, list): + for data in json_data: + if not isinstance(data, dict): + self._log_event( + category=EventCategory.APPLICATION, + description="Invalid data type for amd-smi sub data", + data={ + "data_type": type(data).__name__, + "model_name": amd_smi_data_model.__name__, + }, + priority=EventPriority.WARNING, + ) + return None + validated_data.append(amd_smi_data_model(**data)) + elif isinstance(json_data, dict): + return amd_smi_data_model(**json_data) + else: + raise ValidationError( + f"Invalid data type for amd-smi sub data: {type(json_data).__name__}", + model=amd_smi_data_model, + ) + return validated_data + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Failed to build amd-smi model {amd_smi_data_model.__name__}", + data=get_exception_traceback(e), + priority=EventPriority.WARNING, + ) + return None + + def _get_amdsmi_data(self) -> AmdSmiData | None: + """Returns amd-smi tool data formatted as a AmdSmiData object + + Returns None if tool is not installed or if drivers are not loaded + + Returns: + Union[AmdSmiData, None]: AmdSmiData object or None on failure + """ + if not self._check_amdsmi_installed(): + self._log_event( + category=EventCategory.APPLICATION, + description="amd-smi is not installed", + priority=EventPriority.WARNING, + console_log=True, + ) + self.result.status = ExecutionStatus.NOT_RAN + return None + try: + version = self._get_amdsmi_version() + bad_pages = self.get_bad_pages() + processes = self.get_process() + partition = self.get_partition() + firmware = self.get_firmware() + topology = self.get_topology() + amdsmi_metric = self.get_metric() + amdsmi_static = self.get_static() + gpu_list = self.get_gpu_list() + xgmi_metric = self.get_xgmi_data_metric() + if xgmi_metric is None: + xgmi_metric = {"metric": {}, "link": {}} + cper_data = self.get_cper_data() + except Exception as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi sub commands", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + self.result.status = ExecutionStatus.EXECUTION_FAILURE + return None + + gpu_list_model = self.build_amdsmi_sub_data(AmdSmiListItem, gpu_list) + topo_data_model = self.build_amdsmi_sub_data(Topo, topology) + bad_pages_model = self.build_amdsmi_sub_data(BadPages, bad_pages) + partition_data_model = self.build_amdsmi_sub_data(Partition, partition) + process_data_model = self.build_amdsmi_sub_data(Processes, processes) + firmware_model = self.build_amdsmi_sub_data(Fw, firmware) + amdsmi_metric_model = self.build_amdsmi_sub_data(AmdSmiMetric, amdsmi_metric) + amdsmi_static_model = self.build_amdsmi_sub_data(AmdSmiStatic, amdsmi_static) + xgmi_metric_model = self.build_amdsmi_sub_data(XgmiMetrics, xgmi_metric["metric"]) + xgmi_link_model = self.build_amdsmi_sub_data(XgmiLinks, xgmi_metric["link"]) + try: + amd_smi_data = AmdSmiData( + version=version, + gpu_list=gpu_list_model, + process=process_data_model, + partition=partition_data_model, + topology=topo_data_model, + static=amdsmi_static_model, + metric=amdsmi_metric_model, + firmware=firmware_model, + bad_pages=bad_pages_model, + amdsmitst_data=self.get_amdsmitst_data(version), + xgmi_link=xgmi_link_model, + xgmi_metric=xgmi_metric_model, + cper_data=cper_data, + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiData model", + data=get_exception_details(e), + priority=EventPriority.ERROR, + ) + return None + + return amd_smi_data + + def _get_amdsmi_version(self) -> AmdSmiVersion | None: + """Get amdsmi version and data.""" + ret = self._run_amd_smi_dict("version") + version_data = self.build_amdsmi_sub_data(AmdSmiVersion, ret) + if version_data: + return version_data[0] + return None + + def _run_amd_smi_dict( + self, cmd: str, sudo: bool = False, raise_event=True + ) -> dict | list[dict] | None: + """Run amd-smi command with json output. + + Args: + ---- + cmd (str): command to run + + Returns: + ------- + dict: dict of output + """ + cmd += " --json" + cmd_ret = self._run_amd_smi(cmd, sudo=sudo) + if cmd_ret: + try: + return json.loads(cmd_ret) + except json.JSONDecodeError as e: + if raise_event: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Error parsing command: `{cmd}` json data", + data={"cmd": cmd, "exception": get_exception_traceback(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + else: + return None + + def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: + """Run amd-smi command + + Args: + ---- + cmd (str): command to run + + Returns: + ------- + str: str of output + """ + cmd_ret: CommandArtifact = self._run_system_command(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) + if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi command", + data={ + "command": cmd, + "exit_code": cmd_ret.exit_code, + "stderr": cmd_ret.stderr, + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + else: + return cmd_ret.stdout + + def get_gpu_list(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi list""" + LIST_CMD = "list" + if not self._check_command_supported(LIST_CMD): + # If the command is not supported, return None + return None + return self._run_amd_smi_dict(LIST_CMD) + + def get_process(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi process""" + PROCESS_CMD = "process" + if not self._check_command_supported(PROCESS_CMD): + # If the command is not supported, return None + return None + return self._run_amd_smi_dict(PROCESS_CMD) + + def get_partition(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi process""" + PARTITION_CMD = "partition" + if not self._check_command_supported(PARTITION_CMD): + # If the command is not supported, return None + return None + return self._run_amd_smi_dict(PARTITION_CMD) + + def get_topology(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi topology""" + TOPO_CMD = "topology" + if not self._check_command_supported(TOPO_CMD): + # If the command is not supported, return None + return None + return self._run_amd_smi_dict(TOPO_CMD) + + def get_static(self) -> list[dict] | None: + """Get data in dict format from cmd: amdsmi static""" + STATIC_CMD = "static" + if not self._check_command_supported(STATIC_CMD): + # If the command is not supported, return None + return None + static_data = self._run_amd_smi_dict(f"{STATIC_CMD} -g all") + if static_data is None: + return None + if "gpu_data" in static_data: + static_data = static_data["gpu_data"] + static_data_gpus = [] + for static in static_data: + if "gpu" in static: + static_data_gpus.append(static) + return static_data_gpus + + def get_metric(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi metric""" + METRIC_CMD = "metric" + if not self._check_command_supported(METRIC_CMD): + # If the command is not supported, return None + return None + metric_data = self._run_amd_smi_dict(f"{METRIC_CMD} -g all") + if metric_data is None: + return None + if "gpu_data" in metric_data: + metric_data = metric_data["gpu_data"] + metric_data_gpus = [] + for metric in metric_data: + if "gpu" in metric: + metric_data_gpus.append(metric) + return metric_data_gpus + + def get_firmware(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi firmware""" + FW_CMD = "firmware" + if not self._check_command_supported(FW_CMD): + # If the command is not supported, return None + return None + return self._run_amd_smi_dict(FW_CMD) + + def get_bad_pages(self) -> list[dict] | None: + """Get data as a list of dict from cmd: amdsmi bad-pages""" + BAD_PAGE_CMD = "bad-pages" + if self._check_command_supported(BAD_PAGE_CMD): + # If the command is supported, run it + return self._run_amd_smi_dict(BAD_PAGE_CMD) + return None + + def get_xgmi_data_metric(self) -> dict[str, list[dict]] | None: + """Get data as a list of dict from cmd: amdsmi xgmi""" + XGMI_CMD = "xgmi" + if not self._check_command_supported(XGMI_CMD): + # If the command is not supported, return None + return None + xgmi_metric_data = self._run_amd_smi_dict(f"{XGMI_CMD} -m") + if xgmi_metric_data is None: + xgmi_metric_data = [] + elif "xgmi_metric" in xgmi_metric_data: + xgmi_metric_data = xgmi_metric_data["xgmi_metric"] + if len(xgmi_metric_data) == 1: + xgmi_metric_data = xgmi_metric_data[0] + xgmi_link_data = self._run_amd_smi_dict(f"{XGMI_CMD} -l", raise_event=False) + if isinstance(xgmi_link_data, dict) and "link_status" in xgmi_link_data: + xgmi_link_data = xgmi_link_data["link_status"] + if xgmi_link_data is None: + xgmi_link_data_str = self._run_amd_smi(f"{XGMI_CMD} -l --json") + if xgmi_link_data_str is None: + return { + "metric": xgmi_metric_data, + "link": [], + } + invalid_json_start = xgmi_link_data_str.find("]\n[") + if invalid_json_start != -1: + xgmi_link_data_str = xgmi_link_data_str[invalid_json_start + 2 :] + try: + xgmi_link_data = json.loads(xgmi_link_data_str) + except json.JSONDecodeError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Error parsing xgmi link data", + data={ + "xgmi_link_data": xgmi_link_data_str, + "exception": get_exception_traceback(e), + }, + priority=EventPriority.WARNING, + console_log=True, + ) + xgmi_metric_data = [] + return { + "metric": xgmi_metric_data, + "link": xgmi_link_data, + } + + def get_cper_data(self) -> list[FileModel]: + CPER_CMD = "ras" + if not self._check_command_supported(CPER_CMD): + # If the command is not supported, return an empty list + return [] + AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" + # Ensure the cper folder exists but is empty + self._run_system_command( + f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json", + sudo=False, + ) + cper_cmd = self._run_amd_smi(f"{CPER_CMD} --cper --folder={AMD_SMI_CPER_FOLDER}", sudo=True) + if cper_cmd is None: + # Error was already logged in _run_amd_smi + return [] + # search that a CPER is actually created here + regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd) + if not regex_cper_search: + # Early exit if no CPER files were created + return [] + # tar the cper folder + self._run_system_command( + f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", + sudo=True, + ) + # Load teh tar files + cper_zip: BaseFileArtifact = self.ib_interface.read_file( + f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False + ) + self._log_file_artifact( + cper_zip.filename, + cper_zip.contents, + ) + io_bytes = io.BytesIO(cper_zip.contents) + del cper_zip # Free memory after reading the file + try: + with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file: + cper_data = [] + for member in tar_file.getmembers(): + if member.isfile() and member.name.endswith(".cper"): + file_content = tar_file.extractfile(member) + if file_content is not None: + # Decode the content, ignoring errors to avoid issues with binary data + # that may not be valid UTF-8 + file_content_bytes = file_content.read() + else: + file_content_bytes = b"" + cper_data.append( + FileModel(file_contents=file_content_bytes, file_name=member.name) + ) + # Since we do not log the cper data in the data model create an invent informing the user if CPER created + if cper_data: + self._log_event( + category=EventCategory.APPLICATION, + description="CPER data has been extracted from amd-smi", + data={ + "cper_count": len(cper_data), + }, + priority=EventPriority.INFO, + ) + except Exception as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Error extracting cper data", + data={ + "exception": get_exception_traceback(e), + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return [] + return cper_data + + def get_amdsmitst_data(self, amdsmi_version: AmdSmiVersion | None) -> AmdSmiTstData: + """Get data in dict format from cmd: amdsmi amdsmitst""" + MIN_FUNCTIONAL_AMDSMITST_ROCM_VERSION = PackageVersion("6.4.2") + amdsmitst_data = AmdSmiTstData() + if self.system_interaction_level != SystemInteractionLevel.DISRUPTIVE: + return amdsmitst_data + # This test is disruptive, so we only run it if the system interaction level is set to DISRUPTIVE + if ( + amdsmi_version is None + or amdsmi_version.rocm_version is None + or MIN_FUNCTIONAL_AMDSMITST_ROCM_VERSION > PackageVersion(amdsmi_version.rocm_version) + ): + # In versions of ROCm prior to 6.4.1, the amdsmitst had a bug that would cause the sclk to get pinned + # To a constant value, so we do not run the test for older rocm see: SWDEV-496150 + self.logger.info("Skipping amdsmitst test due to Version incompatibility") + return amdsmitst_data + amdsmitst_cmd: str = "/opt/rocm/share/amd_smi/tests/amdsmitst" + cmd_ret: CommandArtifact = self._run_system_command(amdsmitst_cmd, sudo=True) + if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amdsmitst command", + data={ + "command": amdsmitst_cmd, + "exit_code": cmd_ret.exit_code, + "stderr": cmd_ret.stderr, + }, + priority=EventPriority.WARNING, + console_log=True, + ) + return amdsmitst_data + + passed_test_pat = r"\[\s+OK\s+\] (.*?) \(\d+ ms\)" + skipped_test_pat = r"\[\s+SKIPPED\s+\] (.*?) \(\d+ ms\)" + failed_test_pat = r"\[\s+FAILED\s+\] (.*?) \(\d+ ms\)" + + for ret_line in cmd_ret.stdout.splitlines(): + if match := re.match(passed_test_pat, ret_line): + amdsmitst_data.passed_tests.append(match.group(1)) + elif match := re.match(skipped_test_pat, ret_line): + amdsmitst_data.skipped_tests.append(match.group(1)) + elif match := re.match(failed_test_pat, ret_line): + amdsmitst_data.failed_tests.append(match.group(1)) + + amdsmitst_data.passed_test_count = len(amdsmitst_data.passed_tests) + amdsmitst_data.skipped_test_count = len(amdsmitst_data.skipped_tests) + amdsmitst_data.failed_test_count = len(amdsmitst_data.failed_tests) + + return amdsmitst_data + + def detect_amdsmi_commands(self) -> set[str]: + r"""Runs the help command to determine if a amd-smi command can be used. + + Uses the regex `^\s{4}(\w+)\s` to find all commands in the help output. + + Returns: + set[str]: _description_ + """ + command_pattern = re.compile(r"^\s{4}([\w\-]+)\s", re.MULTILINE) + + # run command with help + help_output = self._run_amd_smi("-h") + if help_output is None: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi help command", + priority=EventPriority.ERROR, + console_log=True, + ) + return set() + # Find all matches in the provided output + commands = command_pattern.findall(help_output) + return set(commands) + + def collect_data( + self, + **kwargs, + ) -> tuple[TaskResult, AmdSmiData | None]: + try: + self.amd_smi_commands = self.detect_amdsmi_commands() + amd_smi_data = self._get_amdsmi_data() + return self.result, amd_smi_data + except Exception as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi collector", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + self.result.status = ExecutionStatus.EXECUTION_FAILURE + return self.result, None diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py new file mode 100644 index 00000000..66e011ef --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py @@ -0,0 +1,37 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from nodescraper.base import InBandDataPlugin + +from .amdsmi_collector import AmdSmiCollector +from .amdsmidata import AmdSmiData + + +class AmdsmiPlugin(InBandDataPlugin[AmdSmiData, None, None]): + """Plugin for collection and analysis of amdsmi data""" + + DATA_MODEL = AmdSmiData + + COLLECTOR = AmdSmiCollector diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py new file mode 100644 index 00000000..712bba0e --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -0,0 +1,988 @@ +from enum import Enum +from typing import Any, List, Optional + +from pydantic import ( + AliasChoices, + BaseModel, + ConfigDict, + Field, + NonNegativeFloat, + NonNegativeInt, + computed_field, + field_validator, +) + +from nodescraper.models.datamodel import DataModel, FileModel +from nodescraper.utils import find_annotation_in_container + + +def na_to_none(values: int | str): + if values == "N/A": + return None + return values + + +def na_to_none_list(values: list[int | str]) -> List[int | str | None]: + ret_list: List[int | str | None] = values.copy() + for i in range(len(ret_list)): + if ret_list[i] == "N/A": + ret_list[i] = None + return ret_list + + +def na_to_none_dict(values: dict[str, int | str]) -> dict[str, int | str | None]: + ret_dict: dict[str, int | str | None] = values.copy() + for key in ret_dict: + if ret_dict[key] == "N/A": + ret_dict[key] = None + return ret_dict + + +class AmdSmiBaseModel(BaseModel): + """Base model for AMD SMI data models. + + This is used to ensure that all AMD SMI data models have the same + configuration and validation. + """ + + model_config = ConfigDict( + str_min_length=1, + str_strip_whitespace=True, + populate_by_name=True, + extra="forbid", # Forbid extra fields not defined in the model + ) + + # During building if a field contains a ValueUnit in its tuple, convert input into a ValueUnit + def __init__(self, **data): + # Convert all fields that are supposed to be ValueUnit to ValueUnit if they are int | str | float + for field_name, field_type in self.model_fields.items(): + annotation = field_type.annotation + target_type, container = find_annotation_in_container(annotation, ValueUnit) + if target_type is None: + continue + + if field_name in data and isinstance(data[field_name], (int, str, float)): + # If the field is a primitive type, convert it to ValueUnit dict and let validtor handle it + data[field_name] = { + "value": data[field_name], + "unit": "", + } + + super().__init__(**data) + + +class ValueUnit(BaseModel): + """A model for a value with a unit.""" + + value: int | str | float + # value: int | str | float + unit: str = "" + + +# a = ValueUnit(23) + + +class EccState(Enum): + ENABLED = "ENABLED" + DISABLED = "DISABLED" + NONE = "NONE" + PARITY = "PARITY" + SING_C = "SING_C" + MULT_UC = "MULT_UC" + POISON = "POISON" + NA = "N/A" + + +### STATIC DATA ### + + +class StaticAsic(BaseModel): + market_name: str + vendor_id: str + vendor_name: str + subvendor_id: str + device_id: str + subsystem_id: str + rev_id: str + asic_serial: str + oam_id: int + num_compute_units: int + target_graphics_version: str + + +class StaticBus(AmdSmiBaseModel): + bdf: str + max_pcie_width: ValueUnit + max_pcie_speed: ValueUnit + pcie_interface_version: str + slot_type: str + + +class StaticVbios(BaseModel): + name: str + build_date: str + part_number: str + version: str + + +class StaticLimit(AmdSmiBaseModel): + max_power: ValueUnit | None + min_power: ValueUnit | None + socket_power: ValueUnit | None + slowdown_edge_temperature: ValueUnit | None + slowdown_hotspot_temperature: ValueUnit | None + slowdown_vram_temperature: ValueUnit | None + shutdown_edge_temperature: ValueUnit | None + shutdown_hotspot_temperature: ValueUnit | None + shutdown_vram_temperature: ValueUnit | None + na_validator = field_validator( + "max_power", + "min_power", + "socket_power", + "slowdown_edge_temperature", + "slowdown_hotspot_temperature", + "slowdown_vram_temperature", + "shutdown_edge_temperature", + "shutdown_hotspot_temperature", + "shutdown_vram_temperature", + mode="before", + )(na_to_none) + + +class StaticDriver(BaseModel): + name: str + version: str + + +class StaticBoard(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + + amdsmi_model_number: str = Field( + alias="model_number" + ) # Model number is a reserved keyword for pydantic + product_serial: str + fru_id: str + product_name: str + manufacturer_name: str + + +class StaticRas(BaseModel): + eeprom_version: str + parity_schema: EccState + single_bit_schema: EccState + double_bit_schema: EccState + poison_schema: EccState + ecc_block_state: dict[str, EccState] + + +class StaticPartition(BaseModel): + # The name for compute_partition has changed we will support both for now + + compute_partition: str = Field( + validation_alias=AliasChoices("compute_partition", "accelerator_partition") + ) + memory_partition: str + partition_id: int + + +class StaticPolicy(BaseModel): + policy_id: int + policy_description: str + + +class StaticSocPstate(BaseModel): + num_supported: int + current_id: int + policies: List[StaticPolicy] + + +class StaticXgmiPlpd(BaseModel): + num_supported: int + current_id: int + plpds: List[StaticPolicy] + + +class StaticNuma(BaseModel): + node: int + affinity: int + + +class StaticVram(AmdSmiBaseModel): + type: str + vendor: str | None + size: ValueUnit | None + bit_width: ValueUnit | None + max_bandwidth: ValueUnit | None = None + na_validator = field_validator("vendor", "size", "bit_width", "max_bandwidth", mode="before")( + na_to_none + ) + + +class StaticCacheInfoItem(AmdSmiBaseModel): + cache: ValueUnit + cache_properties: List[str] + cache_size: ValueUnit | None + cache_level: ValueUnit + max_num_cu_shared: ValueUnit + num_cache_instance: ValueUnit + na_validator = field_validator("cache_size", mode="before")(na_to_none) + + +class StaticFrequencyLevels(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + + Level_0: str = Field(..., alias="Level 0") + Level_1: str | None = Field(default=None, alias="Level 1") + Level_2: str | None = Field(default=None, alias="Level 2") + + +class StaticClockData(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + frequency_levels: StaticFrequencyLevels + + current_level: int | None = Field(..., alias="current level") + na_validator = field_validator("current_level", mode="before")(na_to_none) + + +class AmdSmiStatic(BaseModel): + gpu: int + asic: StaticAsic + bus: StaticBus + vbios: StaticVbios | None + limit: StaticLimit | None + driver: StaticDriver + board: StaticBoard + ras: StaticRas + soc_pstate: StaticSocPstate | None + xgmi_plpd: StaticXgmiPlpd | None + process_isolation: str + numa: StaticNuma + vram: StaticVram + cache_info: List[StaticCacheInfoItem] + partition: StaticPartition | None = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ + clock: dict[str, StaticClockData | None] | None = None + na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) + na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( + na_to_none + ) + + +### Metric Data ### + + +class MetricUsage(BaseModel): + gfx_activity: ValueUnit | None + umc_activity: ValueUnit | None + mm_activity: ValueUnit | None + vcn_activity: list[ValueUnit | str | None] + jpeg_activity: list[ValueUnit | str | None] + gfx_busy_inst: dict[str, list[ValueUnit | str | None]] | None + jpeg_busy: dict[str, list[ValueUnit | str | None]] | None + vcn_busy: dict[str, list[ValueUnit | str | None]] | None + na_validator_list = field_validator("vcn_activity", "jpeg_activity", mode="before")( + na_to_none_list + ) + na_validator = field_validator( + "gfx_activity", + "umc_activity", + "mm_activity", + "gfx_busy_inst", + "jpeg_busy", + "vcn_busy", + mode="before", + )(na_to_none) + + +class MetricPower(BaseModel): + socket_power: ValueUnit | None + gfx_voltage: ValueUnit | None + soc_voltage: ValueUnit | None + mem_voltage: ValueUnit | None + throttle_status: str | None + power_management: str | None + na_validator = field_validator( + "socket_power", + "gfx_voltage", + "soc_voltage", + "mem_voltage", + "throttle_status", + "power_management", + mode="before", + )(na_to_none) + + +class MetricClockData(BaseModel): + clk: ValueUnit | None + min_clk: ValueUnit | None + max_clk: ValueUnit | None + clk_locked: int | str | dict | None + deep_sleep: int | str | dict | None + na_validator = field_validator( + "clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before" + )(na_to_none) + + +class MetricTemperature(BaseModel): + edge: ValueUnit | None + hotspot: ValueUnit | None + mem: ValueUnit | None + na_validator = field_validator("edge", "hotspot", "mem", mode="before")(na_to_none) + + +class MetricPcie(BaseModel): + width: int | None + speed: ValueUnit | None + bandwidth: ValueUnit | None + replay_count: int | None + l0_to_recovery_count: int | None + replay_roll_over_count: int | None + nak_sent_count: int | None + nak_received_count: int | None + current_bandwidth_sent: int | None + current_bandwidth_received: int | None + max_packet_size: int | None + lc_perf_other_end_recovery: int | None + na_validator = field_validator( + "width", + "speed", + "bandwidth", + "replay_count", + "l0_to_recovery_count", + "replay_roll_over_count", + "nak_sent_count", + "nak_received_count", + "current_bandwidth_sent", + "current_bandwidth_received", + "max_packet_size", + "lc_perf_other_end_recovery", + mode="before", + )(na_to_none) + + +class MetricEccTotals(BaseModel): + total_correctable_count: int | None + total_uncorrectable_count: int | None + total_deferred_count: int | None + cache_correctable_count: int | None + cache_uncorrectable_count: int | None + na_validator = field_validator( + "total_correctable_count", + "total_uncorrectable_count", + "total_deferred_count", + "cache_correctable_count", + "cache_uncorrectable_count", + mode="before", + )(na_to_none) + + +class MetricErrorCounts(BaseModel): + correctable_count: str | None + uncorrectable_count: str | None + deferred_count: str | None + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class MetricFan(BaseModel): + speed: ValueUnit | None + max: ValueUnit | None + rpm: ValueUnit | None + usage: ValueUnit | None + na_validator = field_validator("speed", "max", "rpm", "usage", mode="before")(na_to_none) + + +class MetricVoltageCurve(BaseModel): + point_0_frequency: ValueUnit | None + point_0_voltage: ValueUnit | None + point_1_frequency: ValueUnit | None + point_1_voltage: ValueUnit | None + point_2_frequency: ValueUnit | None + point_2_voltage: ValueUnit | None + + na_validator = field_validator( + "point_0_frequency", + "point_0_voltage", + "point_1_frequency", + "point_1_voltage", + "point_2_frequency", + "point_2_voltage", + mode="before", + )(na_to_none) + + +class MetricEnergy(BaseModel): + total_energy_consumption: ValueUnit | None + na_validator = field_validator("total_energy_consumption", mode="before")(na_to_none) + + +class MetricMemUsage(BaseModel): + total_vram: ValueUnit | None + used_vram: ValueUnit | None + free_vram: ValueUnit | None + total_visible_vram: ValueUnit | None + used_visible_vram: ValueUnit | None + free_visible_vram: ValueUnit | None + total_gtt: ValueUnit | None + used_gtt: ValueUnit | None + free_gtt: ValueUnit | None + na_validator = field_validator( + "total_vram", + "used_vram", + "free_vram", + "total_visible_vram", + "used_visible_vram", + "free_visible_vram", + "total_gtt", + "used_gtt", + "free_gtt", + mode="before", + )(na_to_none) + + +class MetricThrottleVu(BaseModel): + value: dict[str, list[int | str]] + unit: str = "" + + +class MetricThrottle(AmdSmiBaseModel): + # At some point in time these changed from being int -> ValueUnit + + accumulation_counter: MetricThrottleVu | ValueUnit | None = None + + gfx_clk_below_host_limit_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_violation_accumulated: MetricThrottleVu | ValueUnit | None = ( + None + ) + gfx_clk_below_host_limit_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_accumulated: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_activity: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_status: MetricThrottleVu | ValueUnit | None = None + ppt_accumulated: MetricThrottleVu | ValueUnit | None = None + ppt_violation_activity: MetricThrottleVu | ValueUnit | None = None + ppt_violation_status: MetricThrottleVu | ValueUnit | None = None + prochot_accumulated: MetricThrottleVu | ValueUnit | None = None + prochot_violation_activity: MetricThrottleVu | ValueUnit | None = None + prochot_violation_status: MetricThrottleVu | ValueUnit | None = None + socket_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + socket_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + socket_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + vr_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + vr_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + vr_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + + na_validator = field_validator( + "accumulation_counter", + "gfx_clk_below_host_limit_accumulated", + "gfx_clk_below_host_limit_power_accumulated", + "gfx_clk_below_host_limit_power_violation_activity", + "gfx_clk_below_host_limit_power_violation_status", + "gfx_clk_below_host_limit_violation_activity", + "gfx_clk_below_host_limit_violation_accumulated", + "gfx_clk_below_host_limit_violation_status", + "gfx_clk_below_host_limit_thermal_violation_accumulated", + "gfx_clk_below_host_limit_thermal_violation_activity", + "gfx_clk_below_host_limit_thermal_violation_status", + "hbm_thermal_accumulated", + "hbm_thermal_violation_activity", + "hbm_thermal_violation_status", + "low_utilization_violation_accumulated", + "low_utilization_violation_activity", + "low_utilization_violation_status", + "ppt_accumulated", + "ppt_violation_activity", + "ppt_violation_status", + "prochot_accumulated", + "prochot_violation_activity", + "prochot_violation_status", + "socket_thermal_accumulated", + "socket_thermal_violation_activity", + "socket_thermal_violation_status", + "vr_thermal_accumulated", + "vr_thermal_violation_activity", + "vr_thermal_violation_status", + mode="before", + )(na_to_none) + + +class EccData(BaseModel): + "ECC counts collected per ecc block" + + correctable_count: int | None = 0 + uncorrectable_count: int | None = 0 + deferred_count: int | None = 0 + + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class AmdSmiMetric(BaseModel): + gpu: int + usage: MetricUsage + power: MetricPower + clock: dict[str, MetricClockData] + temperature: MetricTemperature + pcie: MetricPcie + ecc: MetricEccTotals + ecc_blocks: dict[str, EccData] | str + fan: MetricFan + voltage_curve: MetricVoltageCurve + perf_level: str | dict | None + xgmi_err: str | dict | None + energy: MetricEnergy | None + mem_usage: MetricMemUsage + throttle: MetricThrottle + + na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) + + @field_validator("ecc_blocks", mode="before") + @classmethod + def validate_ecc_blocks(cls, value: dict[str, EccData] | str) -> dict[str, EccData]: + """Validate the ecc_blocks field.""" + if isinstance(value, str): + # If it's a string, we assume it's "N/A" and return an empty dict + return {} + return value + + @field_validator("energy", mode="before") + @classmethod + def validate_energy(cls, value: Any | None) -> MetricEnergy | None: + """Validate the energy field.""" + if value == "N/A" or value is None: + return None + return value + + +### LINK DATA ### + + +class LinkStatusTable(Enum): + UP = "U" + DOWN = "D" + DISABLED = "X" + + +class BiDirectionalTable(Enum): + SELF = "SELF" + TRUE = "T" + + +class DmaTable(Enum): + SELF = "SELF" + TRUE = "T" + + +class AtomicsTable(Enum): + SELF = "SELF" + TRUE = "64,32" + THIRTY_TWO = "32" + SIXTY_FOUR = "64" + + +class LinkTypes(Enum): + XGMI = "XGMI" + PCIE = "PCIE" + SELF = "SELF" + + +class AccessTable(Enum): + ENABLED = "ENABLED" + DISABLED = "DISABLED" + + +# XGMI +class XgmiLink(BaseModel): + gpu: int + bdf: str + read: ValueUnit | None + write: ValueUnit | None + na_validator = field_validator("read", "write", mode="before")(na_to_none) + + +class XgmiLinkMetrics(BaseModel): + bit_rate: ValueUnit | None + max_bandwidth: ValueUnit | None + link_type: str + links: List[XgmiLink] + na_validator = field_validator("max_bandwidth", "bit_rate", mode="before")(na_to_none) + + +class XgmiMetrics(BaseModel): + gpu: int + bdf: str + link_metrics: XgmiLinkMetrics + + +class XgmiLinks(BaseModel): + gpu: int + bdf: str + link_status: list[LinkStatusTable] + + +class CoherentTable(Enum): + COHERANT = "C" + NON_COHERANT = "NC" + SELF = "SELF" + + +# TOPO + + +class TopoLink(BaseModel): + gpu: int + bdf: str + weight: int + link_status: AccessTable + link_type: LinkTypes + num_hops: int + bandwidth: str + # The below fields are sometimes missing, so we use Optional + coherent: CoherentTable | None = None + atomics: AtomicsTable | None = None + dma: DmaTable | None = None + bi_dir: BiDirectionalTable | None = None + + @computed_field + @property + def bandwidth_from(self) -> int | None: + """Get the bandwidth from the link.""" + bw_split = self.bandwidth.split("-") + if len(bw_split) == 2: + return int(bw_split[0]) + else: + # If the bandwidth is not in the expected format, return None + return None + + @computed_field + @property + def bandwidth_to(self) -> int | None: + """Get the bandwidth to the link.""" + bw_split = self.bandwidth.split("-") + if len(bw_split) == 2: + return int(bw_split[1]) + else: + # If the bandwidth is not in the expected format, return None + return None + + +class Topo(BaseModel): + gpu: int + bdf: str + links: List[TopoLink] + + +# PROCESS DATA +class ProcessMemoryUsage(BaseModel): + gtt_mem: ValueUnit | None + cpu_mem: ValueUnit | None + vram_mem: ValueUnit | None + na_validator = field_validator("gtt_mem", "cpu_mem", "vram_mem", mode="before")(na_to_none) + + +class ProcessUsage(BaseModel): + gfx: ValueUnit | None + enc: ValueUnit | None + na_validator = field_validator("gfx", "enc", mode="before")(na_to_none) + + +class ProcessInfo(BaseModel): + name: str + pid: int + memory_usage: ProcessMemoryUsage + mem_usage: ValueUnit | None + usage: ProcessUsage + na_validator = field_validator("mem_usage", mode="before")(na_to_none) + + +class ProcessListItem(BaseModel): + process_info: ProcessInfo | str + + +class Processes(BaseModel): + gpu: int + process_list: List[ProcessListItem] + + +# FW +class FwListItem(BaseModel): + fw_id: str + fw_version: str + + +class Fw(BaseModel): + gpu: int + fw_list: List[FwListItem] + + +# AMD SMI LIST +class AmdSmiListItem(BaseModel): + gpu: int + bdf: str + uuid: str + kfd_id: int + node_id: int + partition_id: int + + +# PAGES +class PageData(BaseModel): + page_address: int | str + page_size: int | str + status: str + + +class BadPages(BaseModel): + gpu: int + retired: str | PageData | list[PageData] + pending: str | PageData | list[PageData] + un_res: str | PageData | list[PageData] + + +class AmdSmiMetricPcieData(BaseModel): + "Data in pcie subfield of metrics command" + + width: NonNegativeInt + speed: NonNegativeFloat + bandwidth: Optional[NonNegativeFloat] = 0 + replay_count: Optional[int] = 0 + l0_to_recovery_count: Optional[int] = 0 + replay_roll_over_count: Optional[int] = 0 + nak_sent_count: Optional[int] = 0 + nak_received_count: Optional[int] = 0 + + +class AmdSmiMetricEccData(BaseModel): + "ECC info collected per ecc block" + + umc: EccData = EccData() + sdma: EccData = EccData() + gfx: EccData = EccData() + mmhub: EccData = EccData() + pcie_bif: EccData = EccData() + hdp: EccData = EccData() + xgmi_wafl: EccData = EccData() + + +class AmdSmiTstData(BaseModel): + "Summary of amdsmitst results, with list and count of passing/skipped/failed tests" + + passed_tests: list[str] = Field(default_factory=list) + skipped_tests: list[str] = Field(default_factory=list) + failed_tests: list[str] = Field(default_factory=list) + passed_test_count: int = 0 + skipped_test_count: int = 0 + failed_test_count: int = 0 + + +class AmdSmiVersion(BaseModel): + """Contains the versioning info for amd-smi""" + + tool: str | None = None + version: str | None = None + amdsmi_library_version: str | None = None + rocm_version: str | None = None + amdgpu_version: str | None = None + amd_hsmp_driver_version: str | None = None + + +class PartitionCurrent(BaseModel): + """Contains the Current Partition data for the GPUs""" + + gpu_id: int + memory: str | None = None + accelerator_type: str | None = None + accelerator_profile_index: str | int | None = None + partition_id: str | int | None = None # Right now this is a string but it looks like an int + + +class PartitionMemory(BaseModel): + """Memory Partition data""" + + gpu_id: int + memory_partition_caps: str | None = None + current_partition_id: str | None = None + + +class PartitionProfiles(AmdSmiBaseModel): + """Partition Profiles data""" + + gpu_id: int + profile_index: str | None = None + memory_partition_caps: str | None = None + accelerator_type: str | None = None + partition_id: str | None = None + num_partitions: str | None = None + num_resources: str | None = None + resource_index: str | None = None + resource_type: str | None = None + resource_instances: str | None = None + resources_shared: str | None = None + + +class PartitionResources(AmdSmiBaseModel): + """Partition Resources""" + + # This does not have gpu_id field for some reason. + # gpu_id: int + resource_index: str | None = None + resource_type: str | None = None + resource_instances: str | None = None + resources_shared: str | None = None + + +# Partition info +class Partition(BaseModel): + """Contains the partition info for amd-smi""" + + current_partition: list[PartitionCurrent] = Field(default_factory=list) + memory_partition: list[PartitionMemory] = Field(default_factory=list) + # Right now partition_profiles and partition_resources is all N/A by amd-smi so placeholder dict until better defined + partition_profiles: list[dict] = Field(default_factory=list) + partition_resources: list[dict] = Field(default_factory=list) + + +class AmdSmiData(DataModel): + """Data model for amd-smi data. + + Optionals are used to allow for the data to be missing, + This makes the data class more flexible for the analyzer + which consumes only the required data. If any more data is + required for the analyzer then they should not be set to + default. + """ + + model_config = ConfigDict( + str_min_length=1, + str_strip_whitespace=True, + populate_by_name=True, + ) + + version: AmdSmiVersion | None = None + gpu_list: list[AmdSmiListItem] | None = Field(default_factory=list) + partition: Partition | None = None + process: list[Processes] | None = Field(default_factory=list) + topology: list[Topo] | None = Field(default_factory=list) + static: list[AmdSmiStatic] | None = Field(default_factory=list) + metric: list[AmdSmiMetric] | None = Field(default_factory=list) + firmware: list[Fw] | None = Field(default_factory=list) + bad_pages: list[BadPages] | None = Field(default_factory=list) + xgmi_metric: list[XgmiMetrics] | None = Field(default_factory=list) + xgmi_link: list[XgmiLinks] | None = Field(default_factory=list) + cper_data: list[FileModel] | None = Field(default_factory=list) + amdsmitst_data: AmdSmiTstData + + def get_list(self, gpu: int) -> AmdSmiListItem | None: + """Get the gpu list item for the given gpu id.""" + if self.gpu_list is None: + return None + for item in self.gpu_list: + if item.gpu == gpu: + return item + return None + + def get_static(self, gpu: int) -> AmdSmiStatic | None: + """Get the static data for the given gpu id.""" + if self.static is None: + return None + for item in self.static: + if item.gpu == gpu: + return item + return None + + def get_metric(self, gpu: int) -> AmdSmiMetric | None: + """Get the metric data for the given gpu id.""" + if self.metric is None: + return None + for item in self.metric: + if item.gpu == gpu: + return item + return None + + def get_process(self, gpu: int) -> Processes | None: + """Get the process data for the given gpu id.""" + if self.process is None: + return None + for item in self.process: + if item.gpu == gpu: + return item + return None + + def get_topology(self, gpu: int) -> Topo | None: + """Get the topology data for the given gpu id.""" + if self.topology is None: + return None + for item in self.topology: + if item.gpu == gpu: + return item + return None + + def get_firmware(self, gpu: int) -> Fw | None: + """Get the firmware data for the given gpu id.""" + if self.firmware is None: + return None + for item in self.firmware: + if item.gpu == gpu: + return item + return None + + def get_bad_pages(self, gpu: int) -> BadPages | None: + """Get the bad pages data for the given gpu id.""" + if self.bad_pages is None: + return None + for item in self.bad_pages: + if item.gpu == gpu: + return item + return None + + @property + def amdsmimetricpcie_data(self) -> dict[int, AmdSmiMetricPcieData]: + """Get the pcie data for the given gpu id.""" + return { + item.gpu: AmdSmiMetricPcieData( + width=item.pcie.width if item.pcie.width else 0, + speed=float(item.pcie.speed.value) if item.pcie.speed else 0, + bandwidth=float(item.pcie.bandwidth.value) if item.pcie.bandwidth else 0, + replay_count=item.pcie.replay_count, + l0_to_recovery_count=item.pcie.l0_to_recovery_count, + replay_roll_over_count=item.pcie.replay_roll_over_count, + nak_sent_count=item.pcie.nak_sent_count, + nak_received_count=item.pcie.nak_received_count, + ) + for item in self.metric or [] + } + + @property + def amdsmimetricecc_data(self) -> dict[int, AmdSmiMetricEccData]: + """Get the ecc data for the given gpu id.""" + amdsmimetric_ret = {} + for item in self.metric or []: + if isinstance(item.ecc_blocks, str): + # If ecc_blocks is a string, it means no ECC data is available + continue + amdsmimetric_ret[item.gpu] = AmdSmiMetricEccData( + umc=item.ecc_blocks.get("UMC", EccData()), + sdma=item.ecc_blocks.get("SDMA", EccData()), + gfx=item.ecc_blocks.get("GFX", EccData()), + mmhub=item.ecc_blocks.get("MMHUB", EccData()), + pcie_bif=item.ecc_blocks.get("PCIE_BIF", EccData()), + hdp=item.ecc_blocks.get("HDP", EccData()), + xgmi_wafl=item.ecc_blocks.get("XGMI_WAFL", EccData()), + ) + return amdsmimetric_ret diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 3118e6e3..0e208d4a 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -27,7 +27,7 @@ import re import traceback from enum import Enum -from typing import TypeVar +from typing import Any, TypeVar T = TypeVar("T") @@ -169,3 +169,47 @@ def bytes_to_human_readable(input_bytes: int) -> str: gb = round(mb / 1000, 2) return f"{gb}GB" + + +def find_annotation_in_container( + annotation, target_type +) -> tuple[Any, list[Any]] | tuple[None, list[Any]]: + """Recursively search for a target type in an annotation and return the target type and the containers + supported container types are generic types, Callable, Tuple, Union, Literal, Final, ClassVar + and Annotated. If the target type is not found then None is returned. + + Examples: + find_annotation_in_container(Union[int, str], int) -> int, [Union[int, str]] + find_annotation_in_container(int | dict[str, list[MyClass]], MyClass) -> MyClass, [list,dict,union] + find_annotation_in_container(Union[int, str], MyClass) -> None, [] + + Parameters + ---------- + annotation : type + A type annotation to search for the target type in. + target_type : type + The target type to search for. + + Returns + ------- + tuple[Any, list[Any]] | tuple[None, []] + The target type and the containers if found, otherwise None and an empty list. + """ + containers: list[Any] = [] + origin = get_origin(annotation) + args = get_args(annotation) + if len(args) == 0 and issubclass(annotation, target_type): + return annotation, containers + if isinstance(args, tuple): + for item in args: + item_args = get_args(item) + if len(item_args) > 0: + result, container = find_annotation_in_container(item, target_type) + containers += container + if result: + containers.append(origin) + return result, containers + if len(get_args(item)) == 0 and issubclass(item, target_type): + containers.append(origin) + return item, containers + return None, [] From b6be391a9405e2e411b238a58248de4af15542ee Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 21 Aug 2025 13:25:36 -0500 Subject: [PATCH 03/38] updated --- .../plugins/inband/amdsmi/amdsmi_collector.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index d31982ea..68588543 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -26,17 +26,20 @@ import io import json import re +from tarfile import TarFile from typing import TypeVar from packaging.version import Version as PackageVersion from pydantic import BaseModel, ValidationError from nodescraper.base.inbandcollectortask import InBandDataCollector + +# from nodescraper.models.datamodel import FileModel +from nodescraper.connection.inband import BinaryFileArtifact, TextFileArtifact from nodescraper.connection.inband.inband import BaseFileArtifact, CommandArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.models import TaskResult -from nodescraper.models.datamodel import FileModel from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiData, AmdSmiListItem, @@ -69,7 +72,7 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiData, None]): def _check_amdsmi_installed(self) -> bool: """Return if amd-smi is installed""" - cmd_ret: CommandArtifact = self._run_system_command("which amd-smi") + cmd_ret: CommandArtifact = self._run_sut_cmd("which amd-smi") return bool(cmd_ret.exit_code == 0 and "no amd-smi in" not in cmd_ret.stdout) def _check_command_supported(self, command: str) -> bool: @@ -255,7 +258,7 @@ def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: ------- str: str of output """ - cmd_ret: CommandArtifact = self._run_system_command(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) + cmd_ret: CommandArtifact = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: self._log_event( category=EventCategory.APPLICATION, @@ -399,14 +402,14 @@ def get_xgmi_data_metric(self) -> dict[str, list[dict]] | None: "link": xgmi_link_data, } - def get_cper_data(self) -> list[FileModel]: + def get_cper_data(self) -> list[TextFileArtifact]: CPER_CMD = "ras" if not self._check_command_supported(CPER_CMD): # If the command is not supported, return an empty list return [] AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" # Ensure the cper folder exists but is empty - self._run_system_command( + self._run_sut_cmd( f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json", sudo=False, ) @@ -420,7 +423,7 @@ def get_cper_data(self) -> list[FileModel]: # Early exit if no CPER files were created return [] # tar the cper folder - self._run_system_command( + self._run_sut_cmd( f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", sudo=True, ) @@ -447,7 +450,7 @@ def get_cper_data(self) -> list[FileModel]: else: file_content_bytes = b"" cper_data.append( - FileModel(file_contents=file_content_bytes, file_name=member.name) + BinaryFileArtifact(filename=member.name, contents=file_content_bytes) ) # Since we do not log the cper data in the data model create an invent informing the user if CPER created if cper_data: @@ -489,7 +492,7 @@ def get_amdsmitst_data(self, amdsmi_version: AmdSmiVersion | None) -> AmdSmiTstD self.logger.info("Skipping amdsmitst test due to Version incompatibility") return amdsmitst_data amdsmitst_cmd: str = "/opt/rocm/share/amd_smi/tests/amdsmitst" - cmd_ret: CommandArtifact = self._run_system_command(amdsmitst_cmd, sudo=True) + cmd_ret: CommandArtifact = self._run_sut_cmd(amdsmitst_cmd, sudo=True) if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: self._log_event( category=EventCategory.APPLICATION, @@ -548,7 +551,7 @@ def detect_amdsmi_commands(self) -> set[str]: def collect_data( self, - **kwargs, + args=None, ) -> tuple[TaskResult, AmdSmiData | None]: try: self.amd_smi_commands = self.detect_amdsmi_commands() From dc62a1f42029b8a39d9a4b1ce260a3a2c658a77f Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 21 Aug 2025 15:54:40 -0500 Subject: [PATCH 04/38] added sudo for all subcmds --- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 68588543..20fb59ce 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -230,7 +230,7 @@ def _run_amd_smi_dict( dict: dict of output """ cmd += " --json" - cmd_ret = self._run_amd_smi(cmd, sudo=sudo) + cmd_ret = self._run_amd_smi(cmd, sudo=True) if cmd_ret: try: return json.loads(cmd_ret) @@ -259,7 +259,8 @@ def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: str: str of output """ cmd_ret: CommandArtifact = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) - if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: + + if cmd_ret.exit_code != 0: self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi command", @@ -272,8 +273,8 @@ def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: console_log=True, ) return None - else: - return cmd_ret.stdout + + return cmd_ret.stdout or "" def get_gpu_list(self) -> list[dict] | None: """Get data as a list of dict from cmd: amdsmi list""" From 6d03cd2446c8a850640729f0a68695027e855214 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 21 Aug 2025 16:07:41 -0500 Subject: [PATCH 05/38] fixed utils --- nodescraper/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 0e208d4a..d9987ccc 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -27,7 +27,7 @@ import re import traceback from enum import Enum -from typing import Any, TypeVar +from typing import Any, TypeVar, get_args, get_origin T = TypeVar("T") From 058da99f9b6bfcfe3221599206c6889de1dd97b2 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 22 Aug 2025 15:30:48 -0500 Subject: [PATCH 06/38] moved utesdt --- test/unit/plugin/test_amdsmi_collector.py | 309 ++++++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 test/unit/plugin/test_amdsmi_collector.py diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py new file mode 100644 index 00000000..22bb0ea8 --- /dev/null +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -0,0 +1,309 @@ +import json +from pathlib import Path +from unittest.mock import MagicMock + +from errorscraper.config.config import SystemInteractionLevel +from errorscraper.datacollector.inband.amdsmi import ( + AmdSmiCollector, + AmdSmiData, + AmdSmiTstData, +) +from errorscraper.datamodel.inband.amdsmidata import ( + AmdSmiListItem, + AmdSmiMetric, + AmdSmiStatic, + AmdSmiVersion, + BadPages, + Fw, + Processes, + Topo, +) +from errorscraper.interfaces.inband import CommandArtifact, FileArtifact +from errorscraper.taskresult import TaskStatus +from scraper_test_base import ScraperTestBase + + +class TestAmdSmiCollection(ScraperTestBase): + """Test the amdsmi collector""" + + def setUp(self) -> None: + super().setUp() + json_text = (Path(self.fixtures_path) / "test_amdsmi_collector_mi355.json").read_text() + self.fixture_dict = json.loads(json_text) + self.test_collector = AmdSmiCollector( + system_info=self.system_info_mi300x, + system_interaction_level=SystemInteractionLevel.STANDARD, + ib_interface=self.ib_interface, + ) + + self.ib_interface.run_command = MagicMock() + self.ib_interface.run_command.side_effect = self.mock_with_fixture + + def mock_with_fixture(self, *args, **kwargs): + """Mock the interface to return the fixture data""" + for artifact in self.fixture_dict: + a_cmd = artifact["command"] + a_cmd_sudo_pass = f"sudo -S -p '' {kwargs['command']}" + a_cmd_sudo = f"sudo {kwargs['command']}" + if a_cmd == kwargs["command"] or a_cmd_sudo == a_cmd or a_cmd == a_cmd_sudo_pass: + return CommandArtifact(**artifact) + + def test_data_collection_config(self) -> None: + """Test checks for the tool being installed and that the check aborts with the proper + task status when amd-smi is not installed""" + self.ib_interface.run_command.return_value = CommandArtifact( + command="which amd-smi", + stdout="/usr/bin/amd-smi", + stderr="", + exit_code=0, + ) + is_installed = self.test_collector._check_amdsmi_installed() + self.assertTrue(is_installed) + self.ib_interface.run_command.side_effect = None + self.ib_interface.run_command.return_value = CommandArtifact( + command="which amd-smi", + stdout="", + stderr="command not found", + exit_code=1, + ) + is_installed = self.test_collector._check_amdsmi_installed() + self.assertFalse(is_installed) + + res, data = self.test_collector.collect_data() + self.assertEqual(res.status, TaskStatus.NOT_RAN) + self.assertIsNone(data) + + def test_amd_smi_data_and_commands(self) -> None: + """Test basic AMD SMI data collection that all methods return correct types""" + amd_smi_return_dict_cmds = { + "gpu_list": (self.test_collector.get_gpu_list, AmdSmiListItem), + "process": (self.test_collector.get_process, Processes), + "topology": (self.test_collector.get_topology, Topo), + "static": (self.test_collector.get_static, AmdSmiStatic), + "metric": (self.test_collector.get_metric, AmdSmiMetric), + "firmware": (self.test_collector.get_firmware, Fw), + "bad_pages": (self.test_collector.get_bad_pages, BadPages), + } + result_data = {} + self.test_collector.amd_smi_commands = self.test_collector.detect_amdsmi_commands() + for cmd_name, amd_smi_cmd_obj in amd_smi_return_dict_cmds.items(): + result_data[cmd_name] = amd_smi_cmd_obj[0]() + + data = amd_smi_cmd_obj[1](**result_data[cmd_name][0]) + self.assertIsInstance(data, amd_smi_cmd_obj[1]) + self.assertIsNotNone(result_data[cmd_name]) + + def test_amd_smi_mi325(self): + json_text = (Path(self.fixtures_path) / "test_amdsmi_collector_mi325.json").read_text() + self.fixture_dict = json.loads(json_text) + + res, data = self.test_collector.collect_data() + self.assertEqual(res.status, TaskStatus.OK) + self.assertIsInstance(data, AmdSmiData) + # Check + self.assertEqual(data.gpu_list[0].bdf, "0000:09:00.0") + self.assertEqual( + data.process[0].process_list[0].process_info.name, + "rvs", + ) + self.assertEqual( + data.process[0].process_list[0].process_info.pid, + 206506, + ) + self.assertEqual(data.get_topology(0).links[0].num_hops, 0) + self.assertEqual(data.get_static(0).asic.device_id, "0x74a5") + self.assertEqual(data.metric[0].pcie.width, 16) + self.assertEqual(data.firmware[0].fw_list[0].fw_version, "177") + self.assertEqual(data.bad_pages[0].retired, "No bad pages found.") + self.assertEqual(data.xgmi_link[0].bdf, "0000:09:00.0") + self.assertEqual(data.xgmi_metric[0].link_metrics.bit_rate.value, 32) + + def test_amd_smi_tst_data(self) -> None: + """Test the AMD SMI test data collection, ensure it can built list and counts of tests of each status""" + # Example takes pertinent snippets from actual full output + self.test_collector.system_interaction_level = SystemInteractionLevel.DISRUPTIVE + version_data_pass = AmdSmiVersion( + tool="AMDSMI Tool", + version="25.5.1+c11e6492", + amdsmi_library_version="25.5.1", + rocm_version="6.4.2", + amdgpu_version="6.12.12", + amd_hsmp_driver_version="N/A", + ) + version_data_old = AmdSmiVersion( + tool="AMDSMI Tool", + version="25.5.1+c11e6492", + amdsmi_library_version="25.5.1", + rocm_version="6.4.0", + amdgpu_version="6.12.12", + amd_hsmp_driver_version="N/A", + ) + + amdsmitst_data = self.test_collector.get_amdsmitst_data(version_data_old) + self.assertIsInstance(amdsmitst_data, AmdSmiTstData) + self.assertEqual(amdsmitst_data.passed_test_count, 0) + self.assertEqual(amdsmitst_data.failed_test_count, 0) + self.assertEqual(amdsmitst_data.skipped_test_count, 0) + amdsmitst_data = self.test_collector.get_amdsmitst_data(version_data_pass) + self.assertIsInstance(amdsmitst_data, AmdSmiTstData) + self.assertEqual(amdsmitst_data.passed_test_count, 3) + self.assertEqual(amdsmitst_data.failed_test_count, 2) + self.assertEqual(amdsmitst_data.skipped_test_count, 1) + self.assertTrue("amdsmitstReadOnly.TestVersionRead" in amdsmitst_data.passed_tests) + self.assertTrue("amdsmitstReadWrite.TestXGMIReadWrite" in amdsmitst_data.skipped_tests) + self.assertTrue("amdsmitstReadWrite.TestPerfDeterminism" in amdsmitst_data.failed_tests) + self.ib_interface.run_command.side_effect = None + + self.ib_interface.run_command.return_value = CommandArtifact( + command="/opt/rocm/share/amd_smi_tests/amdsmitsts/", + stdout="", + stderr="No such file or directory", + exit_code=255, + ) + amdsmitst_data = self.test_collector.get_amdsmitst_data(version_data_pass) + self.assertEqual(amdsmitst_data, AmdSmiTstData()) + + def test_task_body_bad_data_collected(self): + """Test the task body when the data collection fails""" + self.ib_interface.run_command.side_effect = [ + CommandArtifact( + command="which amd-smi", + stdout="/usr/bin/amd-smi", + stderr="", + exit_code=0, + ) + ] * 100 + res, data = self.test_collector.collect_data() + self.assertEqual(res.status, TaskStatus.ERRORS_DETECTED) + self.assertIsInstance(data, AmdSmiData) + self.assertEqual( + res.events[0].description, + "Error parsing command: `version --json` json data", + ) + + def test_amdsmi_collector_350(self): + """Test the AMD SMI collector with a MI350x fixture""" + json_text = (Path(self.fixtures_path) / "test_amdsmi_collector_mi350.json").read_text() + self.fixture_dict = json.loads(json_text) + fixture_tar_file = Path(self.fixtures_path) / "amd_smi_cper.tar.gz" + with open(fixture_tar_file, "rb") as f: + tar_bytes = f.read() + self.ib_interface.read_file.return_value = FileArtifact( + filename="amd_smi_cper.tar.gz", + contents=tar_bytes, + ) + + res, data = self.test_collector.collect_data() + self.assertEqual(res.status, TaskStatus.OK) + self.assertIsInstance(data, AmdSmiData) + self.assertIsNotNone(data.gpu_list) + self.assertIsNotNone(data.process) + self.assertIsNotNone(data.topology) + self.assertIsNotNone(data.static) + self.assertIsNotNone(data.metric) + self.assertIsNotNone(data.firmware) + self.assertIsNotNone(data.bad_pages) + self.assertIsNotNone(data.xgmi_metric) + self.assertIsNotNone(data.xgmi_link) + self.assertIsNotNone(data.cper_data) + + def test_amdsmi_cper_collection(self): + """Test the AMD SMI collector with a MI350x fixture for CPER collection""" + fixture_tar_file = Path(self.fixtures_path) / "amd_smi_cper.tar.gz" + self.ib_interface.run_command.side_effect = [ + CommandArtifact( + command="which amd-smi", + stdout="/usr/bin/amd-smi", + stderr="", + exit_code=0, + ), + CommandArtifact( + command="sudo -S -p '' amd-smi ras --cper --folder=/tmp/amd_smi_cper --afid", + stdout="""Dumping CPER file header entries in folder /tmp/cpers + timestamp gpu_id severity file_name + 2025/06/17 21:45:30 0 corrected corrected_0.cper + """, + stderr="", + exit_code=0, + ), + CommandArtifact( + command="tar -czf /tmp/amd_smi_cper.tar.gz -C /tmp/amd_smi_cper .", + stdout="tar", + stderr="", + exit_code=0, + ), + ] + # read tar file into bytes + with open(fixture_tar_file, "rb") as f: + tar_bytes = f.read() + self.ib_interface.read_file.return_value = FileArtifact( + filename="amd_smi_cper.tar.gz", + contents=tar_bytes, + ) + self.test_collector.amd_smi_commands = {"ras"} + amd_data = self.test_collector.get_cper_data() + + self.assertEqual(len(amd_data), 1) + self.assertEqual(len(amd_data[0].file_contents), 4256) + self.assertEqual(amd_data[0].file_name, "./corrected_0.cper") + + def test_amdsmi_cper_no_cpers(self): + """Test the AMD SMI collector with a MI350x fixture for CPER collection with no CPER data""" + self.ib_interface.run_command.side_effect = [ + CommandArtifact( + command="which amd-smi", + stdout="/usr/bin/amd-smi", + stderr="", + exit_code=0, + ), + CommandArtifact( + command="mkdir -p /tmp/amd_smi_cper && rm /tmp/amd_smi_cper/*.cper && rm /tmp/amd_smi_cper/*.json", + stdout="", + stderr="", + exit_code=0, + ), + CommandArtifact( + command="sudo -S -p '' amd-smi ras --cper --folder=/tmp/amd_smi_cper --afid", + stdout="""Dumping CPER file header entries in folder /tmp/cpers + timestamp gpu_id severity file_name + + """, + stderr="", + exit_code=0, + ), + ] + self.test_collector.amd_smi_commands = {"ras"} + + amd_data = self.test_collector.get_cper_data() + self.assertEqual(len(amd_data), 0) + + def test_detect_amdsmi_commands(self): + """Test the detection of AMD SMI commands""" + self.ib_interface.run_command.side_effect = [ + CommandArtifact( + command="amd-smi -h", + stdout="AMD System Management Interface | Version: 25.3.0+ede62f2 | ROCm version: 6.4.0 |\nPlatform: Linux Baremetal\n\noptions:\n -h, --help show this help message and exit\n\nAMD-SMI Commands:\n Descriptions:\n version Display version information\n list List GPU information\n static Gets static information about the specified GPU\n firmware (ucode) Gets firmware information about the specified GPU\n bad-pages Gets bad page information about the specified GPU\n metric Gets metric/performance information about the specified GPU\n process Lists general process information running on the specified GPU\n event Displays event information for the given GPU\n topology Displays topology information of the devices\n set Set options for devices\n reset Reset options for devices\n monitor (dmon) Monitor metrics for target devices\n xgmi Displays xgmi information of the devices\n partition Displays partition information of the devices\n", + stderr="", + exit_code=0, + ), + ] + commands = self.test_collector.detect_amdsmi_commands() + self.assertEqual( + commands, + { + "version", + "list", + "static", + "firmware", + "bad-pages", + "metric", + "process", + "event", + "topology", + "set", + "reset", + "monitor", + "xgmi", + "partition", + }, + ) From 93541abaaca98e1673ddc1d9c7186a53c7cf03d7 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 26 Aug 2025 09:20:47 -0500 Subject: [PATCH 07/38] updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 106 ++++++++++++++++-- 1 file changed, 94 insertions(+), 12 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 20fb59ce..5d4127e4 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -31,6 +31,29 @@ from packaging.version import Version as PackageVersion from pydantic import BaseModel, ValidationError +from amdsmi import ( + amdsmi_init, + amdsmi_shut_down, + amdsmi_get_processor_handles, + amdsmi_get_lib_version, + amdsmi_get_rocm_version, + amdsmi_get_gpu_device_uuid, + amdsmi_get_gpu_device_bdf, + amdsmi_get_gpu_kfd_info, + amdsmi_get_fw_info, + amdsmi_get_gpu_process_list, + amdsmi_get_gpu_compute_process_info, + amdsmi_get_gpu_bad_page_info, + amdsmi_get_gpu_memory_reserved_pages, + amdsmi_get_gpu_compute_partition, + amdsmi_get_gpu_memory_partition, + amdsmi_get_gpu_accelerator_partition_profile, + amdsmi_get_xgmi_info, + amdsmi_get_gpu_metrics_info, # you can expand mapping later + amdsmi_get_pcie_info, # optional; for deeper metric/topo mapping + amdsmi_get_gpu_cper_entries, + amdsmi_get_afids_from_cper, +) from nodescraper.base.inbandcollectortask import InBandDataCollector @@ -69,6 +92,19 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiData, None]): DATA_MODEL = AmdSmiData + def _get_handles(self): + try: + return amdsmi_get_processor_handles() + except AmdSmiException as e: + self._log_event( + category=EventCategory.APPLICATION, + description="amdsmi_get_processor_handles failed", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + return [] + def _check_amdsmi_installed(self) -> bool: """Return if amd-smi is installed""" @@ -209,12 +245,34 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: return amd_smi_data def _get_amdsmi_version(self) -> AmdSmiVersion | None: - """Get amdsmi version and data.""" - ret = self._run_amd_smi_dict("version") - version_data = self.build_amdsmi_sub_data(AmdSmiVersion, ret) - if version_data: - return version_data[0] - return None + """Get lib/rocm versions.""" + try: + lib_ver = amdsmi_get_lib_version() or "" + rocm_ver = amdsmi_get_rocm_version() or "" + except AmdSmiException as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to read AMD SMI versions", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return None + + return AmdSmiVersion( + tool="amdsmi", + version=lib_ver, + amdsmi_library_version=lib_ver, + rocm_version=rocm_ver, + # amdgpu_version / amd_hsmp_driver_version unavailable via py API??? + ) + + #def _get_amdsmi_version(self) -> AmdSmiVersion | None: + # """Get amdsmi version and data.""" + # ret = self._run_amd_smi_dict("version") + # version_data = self.build_amdsmi_sub_data(AmdSmiVersion, ret) + # if version_data: + # return version_data[0] + # return None def _run_amd_smi_dict( self, cmd: str, sudo: bool = False, raise_event=True @@ -351,12 +409,36 @@ def get_firmware(self) -> list[dict] | None: return self._run_amd_smi_dict(FW_CMD) def get_bad_pages(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi bad-pages""" - BAD_PAGE_CMD = "bad-pages" - if self._check_command_supported(BAD_PAGE_CMD): - # If the command is supported, run it - return self._run_amd_smi_dict(BAD_PAGE_CMD) - return None + devices = self._get_handles() + out: list[dict] = [] + for idx, h in enumerate(devices): + try: + bad = amdsmi_get_gpu_bad_page_info(h) or {} + res = amdsmi_get_gpu_memory_reserved_pages(h) or {} + out.append( + { + "gpu": idx, + "retired": bad.get("retired", "N/A"), + "pending": bad.get("pending", "N/A"), + "un_res": res.get("unres", "N/A"), + } + ) + except AmdSmiException as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Bad pages collection failed", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return out + + #def get_bad_pages(self) -> list[dict] | None: + # """Get data as a list of dict from cmd: amdsmi bad-pages""" + # BAD_PAGE_CMD = "bad-pages" + # if self._check_command_supported(BAD_PAGE_CMD): + # # If the command is supported, run it + # return self._run_amd_smi_dict(BAD_PAGE_CMD) + # return None def get_xgmi_data_metric(self) -> dict[str, list[dict]] | None: """Get data as a list of dict from cmd: amdsmi xgmi""" From cfc9ca4b27378f05c954f14ad75252cb5a5ff4f6 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Wed, 3 Sep 2025 12:51:59 -0500 Subject: [PATCH 08/38] update --- .../plugins/inband/amdsmi/amdsmi_collector.py | 397 +++++++++++++----- .../plugins/inband/amdsmi/amdsmidata.py | 11 + 2 files changed, 292 insertions(+), 116 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 5d4127e4..c5ace29c 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -28,12 +28,17 @@ import re from tarfile import TarFile from typing import TypeVar +import sys from packaging.version import Version as PackageVersion from pydantic import BaseModel, ValidationError +import amdsmi from amdsmi import ( + AmdSmiInitFlags, amdsmi_init, amdsmi_shut_down, + amdsmi_get_gpu_board_info, + amdsmi_get_gpu_asic_info, amdsmi_get_processor_handles, amdsmi_get_lib_version, amdsmi_get_rocm_version, @@ -48,11 +53,7 @@ amdsmi_get_gpu_compute_partition, amdsmi_get_gpu_memory_partition, amdsmi_get_gpu_accelerator_partition_profile, - amdsmi_get_xgmi_info, - amdsmi_get_gpu_metrics_info, # you can expand mapping later - amdsmi_get_pcie_info, # optional; for deeper metric/topo mapping - amdsmi_get_gpu_cper_entries, - amdsmi_get_afids_from_cper, + AmdSmiException, ) from nodescraper.base.inbandcollectortask import InBandDataCollector @@ -93,9 +94,11 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiData, None]): DATA_MODEL = AmdSmiData def _get_handles(self): + """Get processor handles.""" try: return amdsmi_get_processor_handles() - except AmdSmiException as e: + except amdsmi.AmdSmiException as e: + print("Exception1: %s" % e) self._log_event( category=EventCategory.APPLICATION, description="amdsmi_get_processor_handles failed", @@ -123,10 +126,11 @@ def _check_command_supported(self, command: str) -> bool: return True def build_amdsmi_sub_data( - self, amd_smi_data_model: type[T], json_data: list[dict] | None + self, amd_smi_data_model: type[T], json_data: list[dict] | dict | None ) -> list[T] | T | None: try: if json_data is None: + print("JSON is none") self._log_event( category=EventCategory.APPLICATION, description="No data returned from amd-smi sub command", @@ -183,6 +187,7 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: self.result.status = ExecutionStatus.NOT_RAN return None try: + self.amd_smi_commands = self.detect_amdsmi_commands() version = self._get_amdsmi_version() bad_pages = self.get_bad_pages() processes = self.get_process() @@ -197,6 +202,7 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: xgmi_metric = {"metric": {}, "link": {}} cper_data = self.get_cper_data() except Exception as e: + print(e) self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi sub commands", @@ -263,32 +269,14 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None: version=lib_ver, amdsmi_library_version=lib_ver, rocm_version=rocm_ver, - # amdgpu_version / amd_hsmp_driver_version unavailable via py API??? ) - #def _get_amdsmi_version(self) -> AmdSmiVersion | None: - # """Get amdsmi version and data.""" - # ret = self._run_amd_smi_dict("version") - # version_data = self.build_amdsmi_sub_data(AmdSmiVersion, ret) - # if version_data: - # return version_data[0] - # return None - def _run_amd_smi_dict( self, cmd: str, sudo: bool = False, raise_event=True ) -> dict | list[dict] | None: - """Run amd-smi command with json output. - - Args: - ---- - cmd (str): command to run - - Returns: - ------- - dict: dict of output - """ + """Run amd-smi command with json output.""" cmd += " --json" - cmd_ret = self._run_amd_smi(cmd, sudo=True) + cmd_ret = self._run_amd_smi(cmd, sudo=True if sudo else False) if cmd_ret: try: return json.loads(cmd_ret) @@ -306,16 +294,7 @@ def _run_amd_smi_dict( return None def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: - """Run amd-smi command - - Args: - ---- - cmd (str): command to run - - Returns: - ------- - str: str of output - """ + """Run amd-smi command""" cmd_ret: CommandArtifact = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) if cmd_ret.exit_code != 0: @@ -335,123 +314,219 @@ def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: return cmd_ret.stdout or "" def get_gpu_list(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi list""" - LIST_CMD = "list" - if not self._check_command_supported(LIST_CMD): - # If the command is not supported, return None - return None - return self._run_amd_smi_dict(LIST_CMD) + devices = self._get_handles() + out: list[dict] = [] + for idx, h in enumerate(devices): + try: + uuid = amdsmi_get_gpu_device_uuid(h) or "" + bdf = amdsmi_get_gpu_device_bdf(h) or "" + kfd = amdsmi_get_gpu_kfd_info(h) or {} + + # Name via board/ASIC info + name = None + try: + board = amdsmi_get_gpu_board_info(h) or {} + name = board.get("product_name") # preferred + except amdsmi.AmdSmiException: + pass + if not name: + try: + asic = amdsmi_get_gpu_asic_info(h) or {} + name = asic.get("market_name") # fallback + except amdsmi.AmdSmiException: + pass + + out.append({ + "gpu": idx, + "name": name or "unknown", + "bdf": bdf, + "uuid": uuid, + "kfd_id": int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, + "node_id": int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, + "partition_id": 0, + }) + except AmdSmiException as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build gpu list entry from API", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return out def get_process(self) -> list[dict] | None: """Get data as a list of dict from cmd: amdsmi process""" - PROCESS_CMD = "process" - if not self._check_command_supported(PROCESS_CMD): - # If the command is not supported, return None - return None - return self._run_amd_smi_dict(PROCESS_CMD) + devices = self._get_handles() + out: list[dict] = [] + for idx, h in enumerate(devices): + try: + pids = amdsmi_get_gpu_process_list(h) or [] + plist = [] + for pid in pids: + try: + pinfo = amdsmi_get_gpu_compute_process_info(h, pid) or {} + plist.append({"process_info": { + "name": pinfo.get("name", str(pid)), + "pid": int(pid), + "memory_usage": { + "gtt_mem": {"value": pinfo.get("gtt_mem", 0), "unit": "B"}, + "cpu_mem": {"value": pinfo.get("cpu_mem", 0), "unit": "B"}, + "vram_mem": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, + }, + "mem_usage": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, + "usage": { + "gfx": {"value": pinfo.get("gfx", 0), "unit": "%"}, + "enc": {"value": pinfo.get("enc", 0), "unit": "%"}, + }, + }}) + except AmdSmiException: + plist.append({"process_info": str(pid)}) + out.append({"gpu": idx, "process_list": plist}) + except AmdSmiException as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Process collection failed", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return out - def get_partition(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi process""" - PARTITION_CMD = "partition" - if not self._check_command_supported(PARTITION_CMD): - # If the command is not supported, return None - return None - return self._run_amd_smi_dict(PARTITION_CMD) + def get_partition(self) -> dict | None: + """Collect partition info via AMDSMI; degrade gracefully if unsupported.""" + devices = self._get_handles() + current: list[dict] = [] + memparts: list[dict] = [] + profiles: list[dict] = [] + resources: list[dict] = [] + for idx, h in enumerate(devices): + c = self._smi_try(amdsmi_get_gpu_compute_partition, h, default={}) or {} + m = self._smi_try(amdsmi_get_gpu_memory_partition, h, default={}) or {} + p = self._smi_try(amdsmi_get_gpu_accelerator_partition_profile, h, default={}) or {} + c_dict = c if isinstance(c, dict) else {} + m_dict = m if isinstance(m, dict) else {} + profiles.append(p if isinstance(p, dict) else {}) + current.append({ + "gpu_id": idx, + "memory": c_dict.get("memory"), + "accelerator_type": c_dict.get("accelerator_type"), + "accelerator_profile_index": c_dict.get("accelerator_profile_index"), + "partition_id": c_dict.get("partition_id"), + }) + memparts.append({ + "gpu_id": idx, + "memory_partition_caps": m_dict.get("memory_partition_caps"), + "current_partition_id": m_dict.get("current_partition_id"), + }) + return { + "current_partition": current, + "memory_partition": memparts, + "partition_profiles": profiles, + "partition_resources": resources, + } def get_topology(self) -> list[dict] | None: """Get data as a list of dict from cmd: amdsmi topology""" TOPO_CMD = "topology" + if not hasattr(self, "amd_smi_commands"): + self.amd_smi_commands = self.detect_amdsmi_commands() if not self._check_command_supported(TOPO_CMD): - # If the command is not supported, return None return None return self._run_amd_smi_dict(TOPO_CMD) def get_static(self) -> list[dict] | None: """Get data in dict format from cmd: amdsmi static""" STATIC_CMD = "static" + if not hasattr(self, "amd_smi_commands"): + self.amd_smi_commands = self.detect_amdsmi_commands() if not self._check_command_supported(STATIC_CMD): - # If the command is not supported, return None return None static_data = self._run_amd_smi_dict(f"{STATIC_CMD} -g all") if static_data is None: return None - if "gpu_data" in static_data: + if isinstance(static_data, dict) and "gpu_data" in static_data: static_data = static_data["gpu_data"] static_data_gpus = [] for static in static_data: - if "gpu" in static: + if isinstance(static, dict) and "gpu" in static: static_data_gpus.append(static) return static_data_gpus def get_metric(self) -> list[dict] | None: """Get data as a list of dict from cmd: amdsmi metric""" METRIC_CMD = "metric" + if not hasattr(self, "amd_smi_commands"): + self.amd_smi_commands = self.detect_amdsmi_commands() if not self._check_command_supported(METRIC_CMD): - # If the command is not supported, return None return None metric_data = self._run_amd_smi_dict(f"{METRIC_CMD} -g all") if metric_data is None: return None - if "gpu_data" in metric_data: + if isinstance(metric_data, dict) and "gpu_data" in metric_data: metric_data = metric_data["gpu_data"] metric_data_gpus = [] for metric in metric_data: - if "gpu" in metric: + if isinstance(metric, dict) and "gpu" in metric: metric_data_gpus.append(metric) return metric_data_gpus def get_firmware(self) -> list[dict] | None: """Get data as a list of dict from cmd: amdsmi firmware""" - FW_CMD = "firmware" - if not self._check_command_supported(FW_CMD): - # If the command is not supported, return None - return None - return self._run_amd_smi_dict(FW_CMD) - - def get_bad_pages(self) -> list[dict] | None: devices = self._get_handles() out: list[dict] = [] for idx, h in enumerate(devices): try: - bad = amdsmi_get_gpu_bad_page_info(h) or {} - res = amdsmi_get_gpu_memory_reserved_pages(h) or {} + fw_list = amdsmi_get_fw_info(h) or [] out.append( { "gpu": idx, - "retired": bad.get("retired", "N/A"), - "pending": bad.get("pending", "N/A"), - "un_res": res.get("unres", "N/A"), + "fw_list": [{"fw_id": f.get("fw_id", ""), "fw_version": f.get("fw_version", "")} for f in fw_list if isinstance(f, dict)], } ) except AmdSmiException as e: self._log_event( category=EventCategory.APPLICATION, - description="Bad pages collection failed", + description="amdsmi_get_fw_info failed", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, ) return out - #def get_bad_pages(self) -> list[dict] | None: - # """Get data as a list of dict from cmd: amdsmi bad-pages""" - # BAD_PAGE_CMD = "bad-pages" - # if self._check_command_supported(BAD_PAGE_CMD): - # # If the command is supported, run it - # return self._run_amd_smi_dict(BAD_PAGE_CMD) - # return None + def get_bad_pages(self) -> list[dict] | None: + devices = self._get_handles() + print("devices: %s" % (devices,)) + out: list[dict] = [] + for idx, h in enumerate(devices): + bad_list = self._smi_try(amdsmi_get_gpu_bad_page_info, h, default=[]) or [] + res_list = self._smi_try(amdsmi_get_gpu_memory_reserved_pages, h, default=[]) or [] + + retired = sum(1 for b in bad_list if isinstance(b, dict) and str(b.get("status", "")).lower() == "retired") + pending = sum(1 for b in bad_list if isinstance(b, dict) and str(b.get("status", "")).lower() == "pending") + + out.append( + { + "gpu": idx, + "retired": retired, + "pending": pending, + "un_res": len(res_list), + "bad_pages": bad_list, + "reserved_pages": res_list, + } + ) + return out def get_xgmi_data_metric(self) -> dict[str, list[dict]] | None: """Get data as a list of dict from cmd: amdsmi xgmi""" XGMI_CMD = "xgmi" + if not hasattr(self, "amd_smi_commands"): + self.amd_smi_commands = self.detect_amdsmi_commands() if not self._check_command_supported(XGMI_CMD): - # If the command is not supported, return None return None xgmi_metric_data = self._run_amd_smi_dict(f"{XGMI_CMD} -m") if xgmi_metric_data is None: xgmi_metric_data = [] - elif "xgmi_metric" in xgmi_metric_data: + elif isinstance(xgmi_metric_data, dict) and "xgmi_metric" in xgmi_metric_data: xgmi_metric_data = xgmi_metric_data["xgmi_metric"] - if len(xgmi_metric_data) == 1: + if isinstance(xgmi_metric_data, list) and len(xgmi_metric_data) == 1: xgmi_metric_data = xgmi_metric_data[0] xgmi_link_data = self._run_amd_smi_dict(f"{XGMI_CMD} -l", raise_event=False) if isinstance(xgmi_link_data, dict) and "link_status" in xgmi_link_data: @@ -487,30 +562,25 @@ def get_xgmi_data_metric(self) -> dict[str, list[dict]] | None: def get_cper_data(self) -> list[TextFileArtifact]: CPER_CMD = "ras" + if not hasattr(self, "amd_smi_commands"): + self.amd_smi_commands = self.detect_amdsmi_commands() if not self._check_command_supported(CPER_CMD): - # If the command is not supported, return an empty list return [] AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" - # Ensure the cper folder exists but is empty self._run_sut_cmd( f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json", sudo=False, ) cper_cmd = self._run_amd_smi(f"{CPER_CMD} --cper --folder={AMD_SMI_CPER_FOLDER}", sudo=True) if cper_cmd is None: - # Error was already logged in _run_amd_smi return [] - # search that a CPER is actually created here regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd) if not regex_cper_search: - # Early exit if no CPER files were created return [] - # tar the cper folder self._run_sut_cmd( f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", sudo=True, ) - # Load teh tar files cper_zip: BaseFileArtifact = self.ib_interface.read_file( f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False ) @@ -519,7 +589,7 @@ def get_cper_data(self) -> list[TextFileArtifact]: cper_zip.contents, ) io_bytes = io.BytesIO(cper_zip.contents) - del cper_zip # Free memory after reading the file + del cper_zip try: with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file: cper_data = [] @@ -527,15 +597,12 @@ def get_cper_data(self) -> list[TextFileArtifact]: if member.isfile() and member.name.endswith(".cper"): file_content = tar_file.extractfile(member) if file_content is not None: - # Decode the content, ignoring errors to avoid issues with binary data - # that may not be valid UTF-8 file_content_bytes = file_content.read() else: file_content_bytes = b"" cper_data.append( BinaryFileArtifact(filename=member.name, contents=file_content_bytes) ) - # Since we do not log the cper data in the data model create an invent informing the user if CPER created if cper_data: self._log_event( category=EventCategory.APPLICATION, @@ -564,14 +631,11 @@ def get_amdsmitst_data(self, amdsmi_version: AmdSmiVersion | None) -> AmdSmiTstD amdsmitst_data = AmdSmiTstData() if self.system_interaction_level != SystemInteractionLevel.DISRUPTIVE: return amdsmitst_data - # This test is disruptive, so we only run it if the system interaction level is set to DISRUPTIVE if ( amdsmi_version is None or amdsmi_version.rocm_version is None or MIN_FUNCTIONAL_AMDSMITST_ROCM_VERSION > PackageVersion(amdsmi_version.rocm_version) ): - # In versions of ROCm prior to 6.4.1, the amdsmitst had a bug that would cause the sclk to get pinned - # To a constant value, so we do not run the test for older rocm see: SWDEV-496150 self.logger.info("Skipping amdsmitst test due to Version incompatibility") return amdsmitst_data amdsmitst_cmd: str = "/opt/rocm/share/amd_smi/tests/amdsmitst" @@ -595,12 +659,17 @@ def get_amdsmitst_data(self, amdsmi_version: AmdSmiVersion | None) -> AmdSmiTstD failed_test_pat = r"\[\s+FAILED\s+\] (.*?) \(\d+ ms\)" for ret_line in cmd_ret.stdout.splitlines(): - if match := re.match(passed_test_pat, ret_line): - amdsmitst_data.passed_tests.append(match.group(1)) - elif match := re.match(skipped_test_pat, ret_line): - amdsmitst_data.skipped_tests.append(match.group(1)) - elif match := re.match(failed_test_pat, ret_line): - amdsmitst_data.failed_tests.append(match.group(1)) + m = re.match(passed_test_pat, ret_line) + if m: + amdsmitst_data.passed_tests.append(m.group(1)) + continue + m = re.match(skipped_test_pat, ret_line) + if m: + amdsmitst_data.skipped_tests.append(m.group(1)) + continue + m = re.match(failed_test_pat, ret_line) + if m: + amdsmitst_data.failed_tests.append(m.group(1)) amdsmitst_data.passed_test_count = len(amdsmitst_data.passed_tests) amdsmitst_data.skipped_test_count = len(amdsmitst_data.skipped_tests) @@ -609,16 +678,8 @@ def get_amdsmitst_data(self, amdsmi_version: AmdSmiVersion | None) -> AmdSmiTstD return amdsmitst_data def detect_amdsmi_commands(self) -> set[str]: - r"""Runs the help command to determine if a amd-smi command can be used. - - Uses the regex `^\s{4}(\w+)\s` to find all commands in the help output. - - Returns: - set[str]: _description_ - """ + r"""Runs the help command to determine if a amd-smi command can be used.""" command_pattern = re.compile(r"^\s{4}([\w\-]+)\s", re.MULTILINE) - - # run command with help help_output = self._run_amd_smi("-h") if help_output is None: self._log_event( @@ -628,19 +689,117 @@ def detect_amdsmi_commands(self) -> set[str]: console_log=True, ) return set() - # Find all matches in the provided output commands = command_pattern.findall(help_output) return set(commands) + def _smi_try(self, fn, *a, default=None, **kw): + """Call an AMDSMI function and normalize common library errors. + Extracts numeric ret_code from exceptions that don't expose a .status enum. + """ + try: + return fn(*a, **kw) + except AmdSmiException as e: + code = getattr(e, "ret_code", None) + if code is None: + try: + code = int(e.args[0]) if getattr(e, "args", None) else None + except Exception: + code = None + CODE2NAME = { + 1: "AMDSMI_STATUS_SUCCESS", + 2: "AMDSMI_STATUS_NOT_SUPPORTED", + 3: "AMDSMI_STATUS_PERMISSION", + 4: "AMDSMI_STATUS_OUT_OF_RESOURCES", + 5: "AMDSMI_STATUS_INIT_ERROR", + 6: "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS", + 7: "AMDSMI_STATUS_NOT_FOUND", + } + name = CODE2NAME.get(code, "unknown") + + if name == "AMDSMI_STATUS_NOT_SUPPORTED" or name == "AMDSMI_STATUS_NOT_FOUND": + self._log_event( + category=EventCategory.APPLICATION, + description=f"{fn.__name__} not supported on this device/mode (status={name}, code={code})", + priority=EventPriority.WARNING, + ) + return default + if name == "AMDSMI_STATUS_PERMISSION": + self._log_event( + category=EventCategory.APPLICATION, + description=f"{fn.__name__} permission denied (need access to /dev/kfd & render nodes, or root for RAS). status={name}, code={code}", + priority=EventPriority.WARNING, + ) + return default + # Generic case + self._log_event( + category=EventCategory.APPLICATION, + description=f"{fn.__name__} failed (status={name}, code={code})", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return default + if name == "AMDSMI_STATUS_PERMISSION": + self._log_event( + category=EventCategory.APPLICATION, + description=f"{fn.__name__} permission denied (need access to /dev/kfd and render nodes). status={name}, code={code}", + priority=EventPriority.WARNING, + ) + return default + self._log_event( + category=EventCategory.APPLICATION, + description=f"{fn.__name__} failed (status={name or 'unknown'}, code={code})", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return default + self._log_event( + category=EventCategory.APPLICATION, + description=f"{fn.__name__} failed", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return default + def collect_data( self, args=None, ) -> tuple[TaskResult, AmdSmiData | None]: try: - self.amd_smi_commands = self.detect_amdsmi_commands() + amdsmi_init(AmdSmiInitFlags.INIT_AMD_GPUS) + + for h in self._get_handles(): + board = self._smi_try(amdsmi_get_gpu_board_info, h, default={}) or {} + asic = self._smi_try(amdsmi_get_gpu_asic_info, h, default={}) or {} + name = board.get("product_name") or asic.get("market_name") + uuid = self._smi_try(amdsmi_get_gpu_device_uuid, h, default=None) + kfd = self._smi_try(amdsmi_get_gpu_kfd_info, h, default={}) or {} + print({"name": name, "uuid": uuid, "kfd": kfd}) + + amd_smi_data = None + version = self._get_amdsmi_version() + bad_pages = self.get_bad_pages() #call fails, need ras? + processes = self.get_process() + partition = self.get_partition() #call fails + firmware = self.get_firmware() + topology = self.get_topology() + amdsmi_metric = self.get_metric() + amdsmi_static = self.get_static() + gpu_list = self.get_gpu_list() + xgmi_metric = self.get_xgmi_data_metric() + if xgmi_metric is None: + xgmi_metric = {"metric": {}, "link": {}} + cper_data = self.get_cper_data() + amd_smi_data = self._get_amdsmi_data() #fails ras not found + if amd_smi_data is None: + return self.result, None + amd_smi_data = self._get_amdsmi_data() + if amd_smi_data is None: + return self.result, None + return self.result, amd_smi_data except Exception as e: + print(e) self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi collector", @@ -650,3 +809,9 @@ def collect_data( ) self.result.status = ExecutionStatus.EXECUTION_FAILURE return self.result, None + finally: + try: + amdsmi_shut_down() + except Exception: + pass + diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 712bba0e..fc03f295 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -801,6 +801,17 @@ class AmdSmiVersion(BaseModel): amdgpu_version: str | None = None amd_hsmp_driver_version: str | None = None + @field_validator("*", mode="before") + @classmethod + def _stringify(cls, v): + if v is None or isinstance(v, str): + return v + if isinstance(v, (bytes, bytearray)): + return v.decode("utf-8", "ignore") + if isinstance(v, (tuple, list)): + return ".".join(str(x) for x in v) + return str(v) + class PartitionCurrent(BaseModel): """Contains the Current Partition data for the GPUs""" From 629431595d86451588617c9b1c66483eebde1241 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Wed, 3 Sep 2025 12:53:08 -0500 Subject: [PATCH 09/38] update --- .../plugins/inband/amdsmi/amdsmi_collector.py | 143 ++++++++++-------- 1 file changed, 83 insertions(+), 60 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index c5ace29c..501782ec 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -28,33 +28,32 @@ import re from tarfile import TarFile from typing import TypeVar -import sys -from packaging.version import Version as PackageVersion -from pydantic import BaseModel, ValidationError import amdsmi from amdsmi import ( + AmdSmiException, AmdSmiInitFlags, - amdsmi_init, - amdsmi_shut_down, - amdsmi_get_gpu_board_info, - amdsmi_get_gpu_asic_info, - amdsmi_get_processor_handles, - amdsmi_get_lib_version, - amdsmi_get_rocm_version, - amdsmi_get_gpu_device_uuid, - amdsmi_get_gpu_device_bdf, - amdsmi_get_gpu_kfd_info, amdsmi_get_fw_info, - amdsmi_get_gpu_process_list, - amdsmi_get_gpu_compute_process_info, + amdsmi_get_gpu_accelerator_partition_profile, + amdsmi_get_gpu_asic_info, amdsmi_get_gpu_bad_page_info, - amdsmi_get_gpu_memory_reserved_pages, + amdsmi_get_gpu_board_info, amdsmi_get_gpu_compute_partition, + amdsmi_get_gpu_compute_process_info, + amdsmi_get_gpu_device_bdf, + amdsmi_get_gpu_device_uuid, + amdsmi_get_gpu_kfd_info, amdsmi_get_gpu_memory_partition, - amdsmi_get_gpu_accelerator_partition_profile, - AmdSmiException, + amdsmi_get_gpu_memory_reserved_pages, + amdsmi_get_gpu_process_list, + amdsmi_get_lib_version, + amdsmi_get_processor_handles, + amdsmi_get_rocm_version, + amdsmi_init, + amdsmi_shut_down, ) +from packaging.version import Version as PackageVersion +from pydantic import BaseModel, ValidationError from nodescraper.base.inbandcollectortask import InBandDataCollector @@ -336,15 +335,17 @@ def get_gpu_list(self) -> list[dict] | None: except amdsmi.AmdSmiException: pass - out.append({ - "gpu": idx, - "name": name or "unknown", - "bdf": bdf, - "uuid": uuid, - "kfd_id": int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, - "node_id": int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, - "partition_id": 0, - }) + out.append( + { + "gpu": idx, + "name": name or "unknown", + "bdf": bdf, + "uuid": uuid, + "kfd_id": int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, + "node_id": int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, + "partition_id": 0, + } + ) except AmdSmiException as e: self._log_event( category=EventCategory.APPLICATION, @@ -365,20 +366,27 @@ def get_process(self) -> list[dict] | None: for pid in pids: try: pinfo = amdsmi_get_gpu_compute_process_info(h, pid) or {} - plist.append({"process_info": { - "name": pinfo.get("name", str(pid)), - "pid": int(pid), - "memory_usage": { - "gtt_mem": {"value": pinfo.get("gtt_mem", 0), "unit": "B"}, - "cpu_mem": {"value": pinfo.get("cpu_mem", 0), "unit": "B"}, - "vram_mem": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, - }, - "mem_usage": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, - "usage": { - "gfx": {"value": pinfo.get("gfx", 0), "unit": "%"}, - "enc": {"value": pinfo.get("enc", 0), "unit": "%"}, - }, - }}) + plist.append( + { + "process_info": { + "name": pinfo.get("name", str(pid)), + "pid": int(pid), + "memory_usage": { + "gtt_mem": {"value": pinfo.get("gtt_mem", 0), "unit": "B"}, + "cpu_mem": {"value": pinfo.get("cpu_mem", 0), "unit": "B"}, + "vram_mem": { + "value": pinfo.get("vram_mem", 0), + "unit": "B", + }, + }, + "mem_usage": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, + "usage": { + "gfx": {"value": pinfo.get("gfx", 0), "unit": "%"}, + "enc": {"value": pinfo.get("enc", 0), "unit": "%"}, + }, + } + } + ) except AmdSmiException: plist.append({"process_info": str(pid)}) out.append({"gpu": idx, "process_list": plist}) @@ -405,18 +413,22 @@ def get_partition(self) -> dict | None: c_dict = c if isinstance(c, dict) else {} m_dict = m if isinstance(m, dict) else {} profiles.append(p if isinstance(p, dict) else {}) - current.append({ - "gpu_id": idx, - "memory": c_dict.get("memory"), - "accelerator_type": c_dict.get("accelerator_type"), - "accelerator_profile_index": c_dict.get("accelerator_profile_index"), - "partition_id": c_dict.get("partition_id"), - }) - memparts.append({ - "gpu_id": idx, - "memory_partition_caps": m_dict.get("memory_partition_caps"), - "current_partition_id": m_dict.get("current_partition_id"), - }) + current.append( + { + "gpu_id": idx, + "memory": c_dict.get("memory"), + "accelerator_type": c_dict.get("accelerator_type"), + "accelerator_profile_index": c_dict.get("accelerator_profile_index"), + "partition_id": c_dict.get("partition_id"), + } + ) + memparts.append( + { + "gpu_id": idx, + "memory_partition_caps": m_dict.get("memory_partition_caps"), + "current_partition_id": m_dict.get("current_partition_id"), + } + ) return { "current_partition": current, "memory_partition": memparts, @@ -479,7 +491,11 @@ def get_firmware(self) -> list[dict] | None: out.append( { "gpu": idx, - "fw_list": [{"fw_id": f.get("fw_id", ""), "fw_version": f.get("fw_version", "")} for f in fw_list if isinstance(f, dict)], + "fw_list": [ + {"fw_id": f.get("fw_id", ""), "fw_version": f.get("fw_version", "")} + for f in fw_list + if isinstance(f, dict) + ], } ) except AmdSmiException as e: @@ -499,8 +515,16 @@ def get_bad_pages(self) -> list[dict] | None: bad_list = self._smi_try(amdsmi_get_gpu_bad_page_info, h, default=[]) or [] res_list = self._smi_try(amdsmi_get_gpu_memory_reserved_pages, h, default=[]) or [] - retired = sum(1 for b in bad_list if isinstance(b, dict) and str(b.get("status", "")).lower() == "retired") - pending = sum(1 for b in bad_list if isinstance(b, dict) and str(b.get("status", "")).lower() == "pending") + retired = sum( + 1 + for b in bad_list + if isinstance(b, dict) and str(b.get("status", "")).lower() == "retired" + ) + pending = sum( + 1 + for b in bad_list + if isinstance(b, dict) and str(b.get("status", "")).lower() == "pending" + ) out.append( { @@ -777,9 +801,9 @@ def collect_data( amd_smi_data = None version = self._get_amdsmi_version() - bad_pages = self.get_bad_pages() #call fails, need ras? + bad_pages = self.get_bad_pages() # call fails, need ras? processes = self.get_process() - partition = self.get_partition() #call fails + partition = self.get_partition() # call fails firmware = self.get_firmware() topology = self.get_topology() amdsmi_metric = self.get_metric() @@ -789,7 +813,7 @@ def collect_data( if xgmi_metric is None: xgmi_metric = {"metric": {}, "link": {}} cper_data = self.get_cper_data() - amd_smi_data = self._get_amdsmi_data() #fails ras not found + amd_smi_data = self._get_amdsmi_data() # fails ras not found if amd_smi_data is None: return self.result, None @@ -814,4 +838,3 @@ def collect_data( amdsmi_shut_down() except Exception: pass - From 38884aeabbafcd490dcace54be99a3bc03edebdc Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Sat, 13 Sep 2025 17:18:37 -0500 Subject: [PATCH 10/38] cleanup --- .../plugins/inband/amdsmi/amdsmi_collector.py | 514 ++----------- .../plugins/inband/amdsmi/amdsmidata.py | 711 +----------------- 2 files changed, 82 insertions(+), 1143 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 501782ec..cd3d6130 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -23,10 +23,6 @@ # SOFTWARE. # ############################################################################### -import io -import json -import re -from tarfile import TarFile from typing import TypeVar import amdsmi @@ -34,17 +30,12 @@ AmdSmiException, AmdSmiInitFlags, amdsmi_get_fw_info, - amdsmi_get_gpu_accelerator_partition_profile, - amdsmi_get_gpu_asic_info, - amdsmi_get_gpu_bad_page_info, - amdsmi_get_gpu_board_info, amdsmi_get_gpu_compute_partition, amdsmi_get_gpu_compute_process_info, amdsmi_get_gpu_device_bdf, amdsmi_get_gpu_device_uuid, amdsmi_get_gpu_kfd_info, amdsmi_get_gpu_memory_partition, - amdsmi_get_gpu_memory_reserved_pages, amdsmi_get_gpu_process_list, amdsmi_get_lib_version, amdsmi_get_processor_handles, @@ -52,31 +43,19 @@ amdsmi_init, amdsmi_shut_down, ) -from packaging.version import Version as PackageVersion from pydantic import BaseModel, ValidationError from nodescraper.base.inbandcollectortask import InBandDataCollector - -# from nodescraper.models.datamodel import FileModel -from nodescraper.connection.inband import BinaryFileArtifact, TextFileArtifact -from nodescraper.connection.inband.inband import BaseFileArtifact, CommandArtifact +from nodescraper.connection.inband.inband import CommandArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily -from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.models import TaskResult from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiData, AmdSmiListItem, - AmdSmiMetric, - AmdSmiStatic, - AmdSmiTstData, AmdSmiVersion, - BadPages, Fw, Partition, Processes, - Topo, - XgmiLinks, - XgmiMetrics, ) from nodescraper.utils import get_exception_details, get_exception_traceback @@ -97,7 +76,6 @@ def _get_handles(self): try: return amdsmi_get_processor_handles() except amdsmi.AmdSmiException as e: - print("Exception1: %s" % e) self._log_event( category=EventCategory.APPLICATION, description="amdsmi_get_processor_handles failed", @@ -107,29 +85,11 @@ def _get_handles(self): ) return [] - def _check_amdsmi_installed(self) -> bool: - """Return if amd-smi is installed""" - - cmd_ret: CommandArtifact = self._run_sut_cmd("which amd-smi") - return bool(cmd_ret.exit_code == 0 and "no amd-smi in" not in cmd_ret.stdout) - - def _check_command_supported(self, command: str) -> bool: - """Log an event if the command is missing""" - if command not in self.amd_smi_commands: - self._log_event( - category=EventCategory.APPLICATION, - description=f"amd-smi does not support command: `{command}`, it was not found in the help output", - priority=EventPriority.INFO, - ) - return False - return True - def build_amdsmi_sub_data( self, amd_smi_data_model: type[T], json_data: list[dict] | dict | None ) -> list[T] | T | None: try: if json_data is None: - print("JSON is none") self._log_event( category=EventCategory.APPLICATION, description="No data returned from amd-smi sub command", @@ -176,32 +136,13 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: Returns: Union[AmdSmiData, None]: AmdSmiData object or None on failure """ - if not self._check_amdsmi_installed(): - self._log_event( - category=EventCategory.APPLICATION, - description="amd-smi is not installed", - priority=EventPriority.WARNING, - console_log=True, - ) - self.result.status = ExecutionStatus.NOT_RAN - return None try: - self.amd_smi_commands = self.detect_amdsmi_commands() version = self._get_amdsmi_version() - bad_pages = self.get_bad_pages() processes = self.get_process() partition = self.get_partition() firmware = self.get_firmware() - topology = self.get_topology() - amdsmi_metric = self.get_metric() - amdsmi_static = self.get_static() gpu_list = self.get_gpu_list() - xgmi_metric = self.get_xgmi_data_metric() - if xgmi_metric is None: - xgmi_metric = {"metric": {}, "link": {}} - cper_data = self.get_cper_data() except Exception as e: - print(e) self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi sub commands", @@ -212,33 +153,20 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: self.result.status = ExecutionStatus.EXECUTION_FAILURE return None - gpu_list_model = self.build_amdsmi_sub_data(AmdSmiListItem, gpu_list) - topo_data_model = self.build_amdsmi_sub_data(Topo, topology) - bad_pages_model = self.build_amdsmi_sub_data(BadPages, bad_pages) partition_data_model = self.build_amdsmi_sub_data(Partition, partition) process_data_model = self.build_amdsmi_sub_data(Processes, processes) firmware_model = self.build_amdsmi_sub_data(Fw, firmware) - amdsmi_metric_model = self.build_amdsmi_sub_data(AmdSmiMetric, amdsmi_metric) - amdsmi_static_model = self.build_amdsmi_sub_data(AmdSmiStatic, amdsmi_static) - xgmi_metric_model = self.build_amdsmi_sub_data(XgmiMetrics, xgmi_metric["metric"]) - xgmi_link_model = self.build_amdsmi_sub_data(XgmiLinks, xgmi_metric["link"]) + gpu_list_model = self.build_amdsmi_sub_data(AmdSmiListItem, gpu_list) try: amd_smi_data = AmdSmiData( version=version, gpu_list=gpu_list_model, process=process_data_model, partition=partition_data_model, - topology=topo_data_model, - static=amdsmi_static_model, - metric=amdsmi_metric_model, firmware=firmware_model, - bad_pages=bad_pages_model, - amdsmitst_data=self.get_amdsmitst_data(version), - xgmi_link=xgmi_link_model, - xgmi_metric=xgmi_metric_model, - cper_data=cper_data, ) except ValidationError as e: + self.logger.warning("Validation err: %s", e) self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiData model", @@ -270,28 +198,6 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None: rocm_version=rocm_ver, ) - def _run_amd_smi_dict( - self, cmd: str, sudo: bool = False, raise_event=True - ) -> dict | list[dict] | None: - """Run amd-smi command with json output.""" - cmd += " --json" - cmd_ret = self._run_amd_smi(cmd, sudo=True if sudo else False) - if cmd_ret: - try: - return json.loads(cmd_ret) - except json.JSONDecodeError as e: - if raise_event: - self._log_event( - category=EventCategory.APPLICATION, - description=f"Error parsing command: `{cmd}` json data", - data={"cmd": cmd, "exception": get_exception_traceback(e)}, - priority=EventPriority.ERROR, - console_log=True, - ) - return None - else: - return None - def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: """Run amd-smi command""" cmd_ret: CommandArtifact = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) @@ -313,46 +219,41 @@ def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: return cmd_ret.stdout or "" def get_gpu_list(self) -> list[dict] | None: + devices = self._get_handles() out: list[dict] = [] - for idx, h in enumerate(devices): + + def _to_int(x, default=0): try: - uuid = amdsmi_get_gpu_device_uuid(h) or "" - bdf = amdsmi_get_gpu_device_bdf(h) or "" - kfd = amdsmi_get_gpu_kfd_info(h) or {} + return int(x) + except Exception: + return default + + for idx, h in enumerate(devices): + bdf = self._smi_try(amdsmi_get_gpu_device_bdf, h, default="") or "" + uuid = self._smi_try(amdsmi_get_gpu_device_uuid, h, default="") or "" + kfd = self._smi_try(amdsmi_get_gpu_kfd_info, h, default={}) or {} + + partition_id = 0 + cp = self._smi_try(amdsmi_get_gpu_compute_partition, h, default={}) or {} + if isinstance(cp, dict) and cp.get("partition_id") is not None: + partition_id = _to_int(cp.get("partition_id"), 0) + else: + mp = self._smi_try(amdsmi_get_gpu_memory_partition, h, default={}) or {} + if isinstance(mp, dict) and mp.get("current_partition_id") is not None: + partition_id = _to_int(mp.get("current_partition_id"), 0) + + out.append( + { + "gpu": idx, + "bdf": bdf, + "uuid": uuid, + "kfd_id": _to_int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, + "node_id": _to_int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, + "partition_id": partition_id, + } + ) - # Name via board/ASIC info - name = None - try: - board = amdsmi_get_gpu_board_info(h) or {} - name = board.get("product_name") # preferred - except amdsmi.AmdSmiException: - pass - if not name: - try: - asic = amdsmi_get_gpu_asic_info(h) or {} - name = asic.get("market_name") # fallback - except amdsmi.AmdSmiException: - pass - - out.append( - { - "gpu": idx, - "name": name or "unknown", - "bdf": bdf, - "uuid": uuid, - "kfd_id": int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, - "node_id": int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, - "partition_id": 0, - } - ) - except AmdSmiException as e: - self._log_event( - category=EventCategory.APPLICATION, - description="Failed to build gpu list entry from API", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.WARNING, - ) return out def get_process(self) -> list[dict] | None: @@ -365,7 +266,7 @@ def get_process(self) -> list[dict] | None: plist = [] for pid in pids: try: - pinfo = amdsmi_get_gpu_compute_process_info(h, pid) or {} + pinfo = self._smi_try(amdsmi_get_gpu_compute_process_info(h, pid)) or {} plist.append( { "process_info": { @@ -404,15 +305,12 @@ def get_partition(self) -> dict | None: devices = self._get_handles() current: list[dict] = [] memparts: list[dict] = [] - profiles: list[dict] = [] resources: list[dict] = [] for idx, h in enumerate(devices): c = self._smi_try(amdsmi_get_gpu_compute_partition, h, default={}) or {} m = self._smi_try(amdsmi_get_gpu_memory_partition, h, default={}) or {} - p = self._smi_try(amdsmi_get_gpu_accelerator_partition_profile, h, default={}) or {} c_dict = c if isinstance(c, dict) else {} m_dict = m if isinstance(m, dict) else {} - profiles.append(p if isinstance(p, dict) else {}) current.append( { "gpu_id": idx, @@ -432,289 +330,61 @@ def get_partition(self) -> dict | None: return { "current_partition": current, "memory_partition": memparts, - "partition_profiles": profiles, "partition_resources": resources, } - def get_topology(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi topology""" - TOPO_CMD = "topology" - if not hasattr(self, "amd_smi_commands"): - self.amd_smi_commands = self.detect_amdsmi_commands() - if not self._check_command_supported(TOPO_CMD): - return None - return self._run_amd_smi_dict(TOPO_CMD) - - def get_static(self) -> list[dict] | None: - """Get data in dict format from cmd: amdsmi static""" - STATIC_CMD = "static" - if not hasattr(self, "amd_smi_commands"): - self.amd_smi_commands = self.detect_amdsmi_commands() - if not self._check_command_supported(STATIC_CMD): - return None - static_data = self._run_amd_smi_dict(f"{STATIC_CMD} -g all") - if static_data is None: - return None - if isinstance(static_data, dict) and "gpu_data" in static_data: - static_data = static_data["gpu_data"] - static_data_gpus = [] - for static in static_data: - if isinstance(static, dict) and "gpu" in static: - static_data_gpus.append(static) - return static_data_gpus - - def get_metric(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi metric""" - METRIC_CMD = "metric" - if not hasattr(self, "amd_smi_commands"): - self.amd_smi_commands = self.detect_amdsmi_commands() - if not self._check_command_supported(METRIC_CMD): - return None - metric_data = self._run_amd_smi_dict(f"{METRIC_CMD} -g all") - if metric_data is None: - return None - if isinstance(metric_data, dict) and "gpu_data" in metric_data: - metric_data = metric_data["gpu_data"] - metric_data_gpus = [] - for metric in metric_data: - if isinstance(metric, dict) and "gpu" in metric: - metric_data_gpus.append(metric) - return metric_data_gpus - def get_firmware(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi firmware""" devices = self._get_handles() out: list[dict] = [] - for idx, h in enumerate(devices): - try: - fw_list = amdsmi_get_fw_info(h) or [] - out.append( - { - "gpu": idx, - "fw_list": [ - {"fw_id": f.get("fw_id", ""), "fw_version": f.get("fw_version", "")} - for f in fw_list - if isinstance(f, dict) - ], - } - ) - except AmdSmiException as e: - self._log_event( - category=EventCategory.APPLICATION, - description="amdsmi_get_fw_info failed", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.WARNING, - ) - return out - def get_bad_pages(self) -> list[dict] | None: - devices = self._get_handles() - print("devices: %s" % (devices,)) - out: list[dict] = [] for idx, h in enumerate(devices): - bad_list = self._smi_try(amdsmi_get_gpu_bad_page_info, h, default=[]) or [] - res_list = self._smi_try(amdsmi_get_gpu_memory_reserved_pages, h, default=[]) or [] - - retired = sum( - 1 - for b in bad_list - if isinstance(b, dict) and str(b.get("status", "")).lower() == "retired" - ) - pending = sum( - 1 - for b in bad_list - if isinstance(b, dict) and str(b.get("status", "")).lower() == "pending" - ) - - out.append( - { - "gpu": idx, - "retired": retired, - "pending": pending, - "un_res": len(res_list), - "bad_pages": bad_list, - "reserved_pages": res_list, - } - ) - return out - - def get_xgmi_data_metric(self) -> dict[str, list[dict]] | None: - """Get data as a list of dict from cmd: amdsmi xgmi""" - XGMI_CMD = "xgmi" - if not hasattr(self, "amd_smi_commands"): - self.amd_smi_commands = self.detect_amdsmi_commands() - if not self._check_command_supported(XGMI_CMD): - return None - xgmi_metric_data = self._run_amd_smi_dict(f"{XGMI_CMD} -m") - if xgmi_metric_data is None: - xgmi_metric_data = [] - elif isinstance(xgmi_metric_data, dict) and "xgmi_metric" in xgmi_metric_data: - xgmi_metric_data = xgmi_metric_data["xgmi_metric"] - if isinstance(xgmi_metric_data, list) and len(xgmi_metric_data) == 1: - xgmi_metric_data = xgmi_metric_data[0] - xgmi_link_data = self._run_amd_smi_dict(f"{XGMI_CMD} -l", raise_event=False) - if isinstance(xgmi_link_data, dict) and "link_status" in xgmi_link_data: - xgmi_link_data = xgmi_link_data["link_status"] - if xgmi_link_data is None: - xgmi_link_data_str = self._run_amd_smi(f"{XGMI_CMD} -l --json") - if xgmi_link_data_str is None: - return { - "metric": xgmi_metric_data, - "link": [], - } - invalid_json_start = xgmi_link_data_str.find("]\n[") - if invalid_json_start != -1: - xgmi_link_data_str = xgmi_link_data_str[invalid_json_start + 2 :] - try: - xgmi_link_data = json.loads(xgmi_link_data_str) - except json.JSONDecodeError as e: - self._log_event( - category=EventCategory.APPLICATION, - description="Error parsing xgmi link data", - data={ - "xgmi_link_data": xgmi_link_data_str, - "exception": get_exception_traceback(e), - }, - priority=EventPriority.WARNING, - console_log=True, - ) - xgmi_metric_data = [] - return { - "metric": xgmi_metric_data, - "link": xgmi_link_data, - } - - def get_cper_data(self) -> list[TextFileArtifact]: - CPER_CMD = "ras" - if not hasattr(self, "amd_smi_commands"): - self.amd_smi_commands = self.detect_amdsmi_commands() - if not self._check_command_supported(CPER_CMD): - return [] - AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" - self._run_sut_cmd( - f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json", - sudo=False, - ) - cper_cmd = self._run_amd_smi(f"{CPER_CMD} --cper --folder={AMD_SMI_CPER_FOLDER}", sudo=True) - if cper_cmd is None: - return [] - regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd) - if not regex_cper_search: - return [] - self._run_sut_cmd( - f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", - sudo=True, - ) - cper_zip: BaseFileArtifact = self.ib_interface.read_file( - f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False - ) - self._log_file_artifact( - cper_zip.filename, - cper_zip.contents, - ) - io_bytes = io.BytesIO(cper_zip.contents) - del cper_zip - try: - with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file: - cper_data = [] - for member in tar_file.getmembers(): - if member.isfile() and member.name.endswith(".cper"): - file_content = tar_file.extractfile(member) - if file_content is not None: - file_content_bytes = file_content.read() - else: - file_content_bytes = b"" - cper_data.append( - BinaryFileArtifact(filename=member.name, contents=file_content_bytes) - ) - if cper_data: - self._log_event( - category=EventCategory.APPLICATION, - description="CPER data has been extracted from amd-smi", - data={ - "cper_count": len(cper_data), - }, - priority=EventPriority.INFO, - ) - except Exception as e: - self._log_event( - category=EventCategory.APPLICATION, - description="Error extracting cper data", - data={ - "exception": get_exception_traceback(e), - }, - priority=EventPriority.ERROR, - console_log=True, - ) - return [] - return cper_data - - def get_amdsmitst_data(self, amdsmi_version: AmdSmiVersion | None) -> AmdSmiTstData: - """Get data in dict format from cmd: amdsmi amdsmitst""" - MIN_FUNCTIONAL_AMDSMITST_ROCM_VERSION = PackageVersion("6.4.2") - amdsmitst_data = AmdSmiTstData() - if self.system_interaction_level != SystemInteractionLevel.DISRUPTIVE: - return amdsmitst_data - if ( - amdsmi_version is None - or amdsmi_version.rocm_version is None - or MIN_FUNCTIONAL_AMDSMITST_ROCM_VERSION > PackageVersion(amdsmi_version.rocm_version) - ): - self.logger.info("Skipping amdsmitst test due to Version incompatibility") - return amdsmitst_data - amdsmitst_cmd: str = "/opt/rocm/share/amd_smi/tests/amdsmitst" - cmd_ret: CommandArtifact = self._run_sut_cmd(amdsmitst_cmd, sudo=True) - if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: - self._log_event( - category=EventCategory.APPLICATION, - description="Error running amdsmitst command", - data={ - "command": amdsmitst_cmd, - "exit_code": cmd_ret.exit_code, - "stderr": cmd_ret.stderr, - }, - priority=EventPriority.WARNING, - console_log=True, - ) - return amdsmitst_data - - passed_test_pat = r"\[\s+OK\s+\] (.*?) \(\d+ ms\)" - skipped_test_pat = r"\[\s+SKIPPED\s+\] (.*?) \(\d+ ms\)" - failed_test_pat = r"\[\s+FAILED\s+\] (.*?) \(\d+ ms\)" - - for ret_line in cmd_ret.stdout.splitlines(): - m = re.match(passed_test_pat, ret_line) - if m: - amdsmitst_data.passed_tests.append(m.group(1)) + raw = self._smi_try(amdsmi_get_fw_info, h, default=None) + if raw is None: continue - m = re.match(skipped_test_pat, ret_line) - if m: - amdsmitst_data.skipped_tests.append(m.group(1)) - continue - m = re.match(failed_test_pat, ret_line) - if m: - amdsmitst_data.failed_tests.append(m.group(1)) - amdsmitst_data.passed_test_count = len(amdsmitst_data.passed_tests) - amdsmitst_data.skipped_test_count = len(amdsmitst_data.skipped_tests) - amdsmitst_data.failed_test_count = len(amdsmitst_data.failed_tests) + if isinstance(raw, list): + items = raw + elif isinstance(raw, dict): + if isinstance(raw.get("fw_list"), list): + items = raw["fw_list"] + elif raw and all(not isinstance(v, (dict, list, tuple)) for v in raw.values()): + items = [{"fw_id": k, "fw_version": v} for k, v in raw.items()] + else: + items = [raw] + else: + items = [] + + normalized: list[dict] = [] + for e in items: + if isinstance(e, dict): + fid = ( + e.get("fw_id") + or e.get("fw_name") + or e.get("name") + or e.get("block") + or e.get("type") + or e.get("id") + ) + ver = e.get("fw_version") or e.get("version") or e.get("fw_ver") or e.get("ver") + normalized.append( + { + "fw_id": "" if fid is None else str(fid), + "fw_version": "" if ver is None else str(ver), + } + ) + elif isinstance(e, (tuple, list)) and len(e) >= 2: + normalized.append({"fw_id": str(e[0]), "fw_version": str(e[1])}) + else: + self._log_event( + category=EventCategory.APPLICATION, + description="Unrecognized firmware entry shape", + data={"entry_repr": repr(e)}, + priority=EventPriority.INFO, + ) - return amdsmitst_data + out.append({"gpu": idx, "fw_list": normalized}) - def detect_amdsmi_commands(self) -> set[str]: - r"""Runs the help command to determine if a amd-smi command can be used.""" - command_pattern = re.compile(r"^\s{4}([\w\-]+)\s", re.MULTILINE) - help_output = self._run_amd_smi("-h") - if help_output is None: - self._log_event( - category=EventCategory.APPLICATION, - description="Error running amd-smi help command", - priority=EventPriority.ERROR, - console_log=True, - ) - return set() - commands = command_pattern.findall(help_output) - return set(commands) + return out def _smi_try(self, fn, *a, default=None, **kw): """Call an AMDSMI function and normalize common library errors. @@ -723,6 +393,7 @@ def _smi_try(self, fn, *a, default=None, **kw): try: return fn(*a, **kw) except AmdSmiException as e: + self.logger.warning(e) code = getattr(e, "ret_code", None) if code is None: try: @@ -790,40 +461,13 @@ def collect_data( ) -> tuple[TaskResult, AmdSmiData | None]: try: amdsmi_init(AmdSmiInitFlags.INIT_AMD_GPUS) - - for h in self._get_handles(): - board = self._smi_try(amdsmi_get_gpu_board_info, h, default={}) or {} - asic = self._smi_try(amdsmi_get_gpu_asic_info, h, default={}) or {} - name = board.get("product_name") or asic.get("market_name") - uuid = self._smi_try(amdsmi_get_gpu_device_uuid, h, default=None) - kfd = self._smi_try(amdsmi_get_gpu_kfd_info, h, default={}) or {} - print({"name": name, "uuid": uuid, "kfd": kfd}) - - amd_smi_data = None - version = self._get_amdsmi_version() - bad_pages = self.get_bad_pages() # call fails, need ras? - processes = self.get_process() - partition = self.get_partition() # call fails - firmware = self.get_firmware() - topology = self.get_topology() - amdsmi_metric = self.get_metric() - amdsmi_static = self.get_static() - gpu_list = self.get_gpu_list() - xgmi_metric = self.get_xgmi_data_metric() - if xgmi_metric is None: - xgmi_metric = {"metric": {}, "link": {}} - cper_data = self.get_cper_data() amd_smi_data = self._get_amdsmi_data() # fails ras not found - if amd_smi_data is None: - return self.result, None - amd_smi_data = self._get_amdsmi_data() if amd_smi_data is None: return self.result, None return self.result, amd_smi_data except Exception as e: - print(e) self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi collector", diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index fc03f295..341f9333 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1,18 +1,14 @@ from enum import Enum -from typing import Any, List, Optional +from typing import List from pydantic import ( - AliasChoices, BaseModel, ConfigDict, Field, - NonNegativeFloat, - NonNegativeInt, - computed_field, field_validator, ) -from nodescraper.models.datamodel import DataModel, FileModel +from nodescraper.models.datamodel import DataModel from nodescraper.utils import find_annotation_in_container @@ -75,13 +71,9 @@ class ValueUnit(BaseModel): """A model for a value with a unit.""" value: int | str | float - # value: int | str | float unit: str = "" -# a = ValueUnit(23) - - class EccState(Enum): ENABLED = "ENABLED" DISABLED = "DISABLED" @@ -93,483 +85,6 @@ class EccState(Enum): NA = "N/A" -### STATIC DATA ### - - -class StaticAsic(BaseModel): - market_name: str - vendor_id: str - vendor_name: str - subvendor_id: str - device_id: str - subsystem_id: str - rev_id: str - asic_serial: str - oam_id: int - num_compute_units: int - target_graphics_version: str - - -class StaticBus(AmdSmiBaseModel): - bdf: str - max_pcie_width: ValueUnit - max_pcie_speed: ValueUnit - pcie_interface_version: str - slot_type: str - - -class StaticVbios(BaseModel): - name: str - build_date: str - part_number: str - version: str - - -class StaticLimit(AmdSmiBaseModel): - max_power: ValueUnit | None - min_power: ValueUnit | None - socket_power: ValueUnit | None - slowdown_edge_temperature: ValueUnit | None - slowdown_hotspot_temperature: ValueUnit | None - slowdown_vram_temperature: ValueUnit | None - shutdown_edge_temperature: ValueUnit | None - shutdown_hotspot_temperature: ValueUnit | None - shutdown_vram_temperature: ValueUnit | None - na_validator = field_validator( - "max_power", - "min_power", - "socket_power", - "slowdown_edge_temperature", - "slowdown_hotspot_temperature", - "slowdown_vram_temperature", - "shutdown_edge_temperature", - "shutdown_hotspot_temperature", - "shutdown_vram_temperature", - mode="before", - )(na_to_none) - - -class StaticDriver(BaseModel): - name: str - version: str - - -class StaticBoard(BaseModel): - model_config = ConfigDict( - populate_by_name=True, - ) - - amdsmi_model_number: str = Field( - alias="model_number" - ) # Model number is a reserved keyword for pydantic - product_serial: str - fru_id: str - product_name: str - manufacturer_name: str - - -class StaticRas(BaseModel): - eeprom_version: str - parity_schema: EccState - single_bit_schema: EccState - double_bit_schema: EccState - poison_schema: EccState - ecc_block_state: dict[str, EccState] - - -class StaticPartition(BaseModel): - # The name for compute_partition has changed we will support both for now - - compute_partition: str = Field( - validation_alias=AliasChoices("compute_partition", "accelerator_partition") - ) - memory_partition: str - partition_id: int - - -class StaticPolicy(BaseModel): - policy_id: int - policy_description: str - - -class StaticSocPstate(BaseModel): - num_supported: int - current_id: int - policies: List[StaticPolicy] - - -class StaticXgmiPlpd(BaseModel): - num_supported: int - current_id: int - plpds: List[StaticPolicy] - - -class StaticNuma(BaseModel): - node: int - affinity: int - - -class StaticVram(AmdSmiBaseModel): - type: str - vendor: str | None - size: ValueUnit | None - bit_width: ValueUnit | None - max_bandwidth: ValueUnit | None = None - na_validator = field_validator("vendor", "size", "bit_width", "max_bandwidth", mode="before")( - na_to_none - ) - - -class StaticCacheInfoItem(AmdSmiBaseModel): - cache: ValueUnit - cache_properties: List[str] - cache_size: ValueUnit | None - cache_level: ValueUnit - max_num_cu_shared: ValueUnit - num_cache_instance: ValueUnit - na_validator = field_validator("cache_size", mode="before")(na_to_none) - - -class StaticFrequencyLevels(BaseModel): - model_config = ConfigDict( - populate_by_name=True, - ) - - Level_0: str = Field(..., alias="Level 0") - Level_1: str | None = Field(default=None, alias="Level 1") - Level_2: str | None = Field(default=None, alias="Level 2") - - -class StaticClockData(BaseModel): - model_config = ConfigDict( - populate_by_name=True, - ) - frequency_levels: StaticFrequencyLevels - - current_level: int | None = Field(..., alias="current level") - na_validator = field_validator("current_level", mode="before")(na_to_none) - - -class AmdSmiStatic(BaseModel): - gpu: int - asic: StaticAsic - bus: StaticBus - vbios: StaticVbios | None - limit: StaticLimit | None - driver: StaticDriver - board: StaticBoard - ras: StaticRas - soc_pstate: StaticSocPstate | None - xgmi_plpd: StaticXgmiPlpd | None - process_isolation: str - numa: StaticNuma - vram: StaticVram - cache_info: List[StaticCacheInfoItem] - partition: StaticPartition | None = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ - clock: dict[str, StaticClockData | None] | None = None - na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) - na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( - na_to_none - ) - - -### Metric Data ### - - -class MetricUsage(BaseModel): - gfx_activity: ValueUnit | None - umc_activity: ValueUnit | None - mm_activity: ValueUnit | None - vcn_activity: list[ValueUnit | str | None] - jpeg_activity: list[ValueUnit | str | None] - gfx_busy_inst: dict[str, list[ValueUnit | str | None]] | None - jpeg_busy: dict[str, list[ValueUnit | str | None]] | None - vcn_busy: dict[str, list[ValueUnit | str | None]] | None - na_validator_list = field_validator("vcn_activity", "jpeg_activity", mode="before")( - na_to_none_list - ) - na_validator = field_validator( - "gfx_activity", - "umc_activity", - "mm_activity", - "gfx_busy_inst", - "jpeg_busy", - "vcn_busy", - mode="before", - )(na_to_none) - - -class MetricPower(BaseModel): - socket_power: ValueUnit | None - gfx_voltage: ValueUnit | None - soc_voltage: ValueUnit | None - mem_voltage: ValueUnit | None - throttle_status: str | None - power_management: str | None - na_validator = field_validator( - "socket_power", - "gfx_voltage", - "soc_voltage", - "mem_voltage", - "throttle_status", - "power_management", - mode="before", - )(na_to_none) - - -class MetricClockData(BaseModel): - clk: ValueUnit | None - min_clk: ValueUnit | None - max_clk: ValueUnit | None - clk_locked: int | str | dict | None - deep_sleep: int | str | dict | None - na_validator = field_validator( - "clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before" - )(na_to_none) - - -class MetricTemperature(BaseModel): - edge: ValueUnit | None - hotspot: ValueUnit | None - mem: ValueUnit | None - na_validator = field_validator("edge", "hotspot", "mem", mode="before")(na_to_none) - - -class MetricPcie(BaseModel): - width: int | None - speed: ValueUnit | None - bandwidth: ValueUnit | None - replay_count: int | None - l0_to_recovery_count: int | None - replay_roll_over_count: int | None - nak_sent_count: int | None - nak_received_count: int | None - current_bandwidth_sent: int | None - current_bandwidth_received: int | None - max_packet_size: int | None - lc_perf_other_end_recovery: int | None - na_validator = field_validator( - "width", - "speed", - "bandwidth", - "replay_count", - "l0_to_recovery_count", - "replay_roll_over_count", - "nak_sent_count", - "nak_received_count", - "current_bandwidth_sent", - "current_bandwidth_received", - "max_packet_size", - "lc_perf_other_end_recovery", - mode="before", - )(na_to_none) - - -class MetricEccTotals(BaseModel): - total_correctable_count: int | None - total_uncorrectable_count: int | None - total_deferred_count: int | None - cache_correctable_count: int | None - cache_uncorrectable_count: int | None - na_validator = field_validator( - "total_correctable_count", - "total_uncorrectable_count", - "total_deferred_count", - "cache_correctable_count", - "cache_uncorrectable_count", - mode="before", - )(na_to_none) - - -class MetricErrorCounts(BaseModel): - correctable_count: str | None - uncorrectable_count: str | None - deferred_count: str | None - na_validator = field_validator( - "correctable_count", "uncorrectable_count", "deferred_count", mode="before" - )(na_to_none) - - -class MetricFan(BaseModel): - speed: ValueUnit | None - max: ValueUnit | None - rpm: ValueUnit | None - usage: ValueUnit | None - na_validator = field_validator("speed", "max", "rpm", "usage", mode="before")(na_to_none) - - -class MetricVoltageCurve(BaseModel): - point_0_frequency: ValueUnit | None - point_0_voltage: ValueUnit | None - point_1_frequency: ValueUnit | None - point_1_voltage: ValueUnit | None - point_2_frequency: ValueUnit | None - point_2_voltage: ValueUnit | None - - na_validator = field_validator( - "point_0_frequency", - "point_0_voltage", - "point_1_frequency", - "point_1_voltage", - "point_2_frequency", - "point_2_voltage", - mode="before", - )(na_to_none) - - -class MetricEnergy(BaseModel): - total_energy_consumption: ValueUnit | None - na_validator = field_validator("total_energy_consumption", mode="before")(na_to_none) - - -class MetricMemUsage(BaseModel): - total_vram: ValueUnit | None - used_vram: ValueUnit | None - free_vram: ValueUnit | None - total_visible_vram: ValueUnit | None - used_visible_vram: ValueUnit | None - free_visible_vram: ValueUnit | None - total_gtt: ValueUnit | None - used_gtt: ValueUnit | None - free_gtt: ValueUnit | None - na_validator = field_validator( - "total_vram", - "used_vram", - "free_vram", - "total_visible_vram", - "used_visible_vram", - "free_visible_vram", - "total_gtt", - "used_gtt", - "free_gtt", - mode="before", - )(na_to_none) - - -class MetricThrottleVu(BaseModel): - value: dict[str, list[int | str]] - unit: str = "" - - -class MetricThrottle(AmdSmiBaseModel): - # At some point in time these changed from being int -> ValueUnit - - accumulation_counter: MetricThrottleVu | ValueUnit | None = None - - gfx_clk_below_host_limit_accumulated: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_power_accumulated: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_power_violation_activity: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_power_violation_status: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_violation_activity: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_violation_accumulated: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_violation_status: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_thermal_violation_accumulated: MetricThrottleVu | ValueUnit | None = ( - None - ) - gfx_clk_below_host_limit_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None - gfx_clk_below_host_limit_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None - hbm_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None - hbm_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None - hbm_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None - low_utilization_violation_accumulated: MetricThrottleVu | ValueUnit | None = None - low_utilization_violation_activity: MetricThrottleVu | ValueUnit | None = None - low_utilization_violation_status: MetricThrottleVu | ValueUnit | None = None - ppt_accumulated: MetricThrottleVu | ValueUnit | None = None - ppt_violation_activity: MetricThrottleVu | ValueUnit | None = None - ppt_violation_status: MetricThrottleVu | ValueUnit | None = None - prochot_accumulated: MetricThrottleVu | ValueUnit | None = None - prochot_violation_activity: MetricThrottleVu | ValueUnit | None = None - prochot_violation_status: MetricThrottleVu | ValueUnit | None = None - socket_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None - socket_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None - socket_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None - vr_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None - vr_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None - vr_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None - - na_validator = field_validator( - "accumulation_counter", - "gfx_clk_below_host_limit_accumulated", - "gfx_clk_below_host_limit_power_accumulated", - "gfx_clk_below_host_limit_power_violation_activity", - "gfx_clk_below_host_limit_power_violation_status", - "gfx_clk_below_host_limit_violation_activity", - "gfx_clk_below_host_limit_violation_accumulated", - "gfx_clk_below_host_limit_violation_status", - "gfx_clk_below_host_limit_thermal_violation_accumulated", - "gfx_clk_below_host_limit_thermal_violation_activity", - "gfx_clk_below_host_limit_thermal_violation_status", - "hbm_thermal_accumulated", - "hbm_thermal_violation_activity", - "hbm_thermal_violation_status", - "low_utilization_violation_accumulated", - "low_utilization_violation_activity", - "low_utilization_violation_status", - "ppt_accumulated", - "ppt_violation_activity", - "ppt_violation_status", - "prochot_accumulated", - "prochot_violation_activity", - "prochot_violation_status", - "socket_thermal_accumulated", - "socket_thermal_violation_activity", - "socket_thermal_violation_status", - "vr_thermal_accumulated", - "vr_thermal_violation_activity", - "vr_thermal_violation_status", - mode="before", - )(na_to_none) - - -class EccData(BaseModel): - "ECC counts collected per ecc block" - - correctable_count: int | None = 0 - uncorrectable_count: int | None = 0 - deferred_count: int | None = 0 - - na_validator = field_validator( - "correctable_count", "uncorrectable_count", "deferred_count", mode="before" - )(na_to_none) - - -class AmdSmiMetric(BaseModel): - gpu: int - usage: MetricUsage - power: MetricPower - clock: dict[str, MetricClockData] - temperature: MetricTemperature - pcie: MetricPcie - ecc: MetricEccTotals - ecc_blocks: dict[str, EccData] | str - fan: MetricFan - voltage_curve: MetricVoltageCurve - perf_level: str | dict | None - xgmi_err: str | dict | None - energy: MetricEnergy | None - mem_usage: MetricMemUsage - throttle: MetricThrottle - - na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) - - @field_validator("ecc_blocks", mode="before") - @classmethod - def validate_ecc_blocks(cls, value: dict[str, EccData] | str) -> dict[str, EccData]: - """Validate the ecc_blocks field.""" - if isinstance(value, str): - # If it's a string, we assume it's "N/A" and return an empty dict - return {} - return value - - @field_validator("energy", mode="before") - @classmethod - def validate_energy(cls, value: Any | None) -> MetricEnergy | None: - """Validate the energy field.""" - if value == "N/A" or value is None: - return None - return value - - ### LINK DATA ### @@ -607,88 +122,12 @@ class AccessTable(Enum): DISABLED = "DISABLED" -# XGMI -class XgmiLink(BaseModel): - gpu: int - bdf: str - read: ValueUnit | None - write: ValueUnit | None - na_validator = field_validator("read", "write", mode="before")(na_to_none) - - -class XgmiLinkMetrics(BaseModel): - bit_rate: ValueUnit | None - max_bandwidth: ValueUnit | None - link_type: str - links: List[XgmiLink] - na_validator = field_validator("max_bandwidth", "bit_rate", mode="before")(na_to_none) - - -class XgmiMetrics(BaseModel): - gpu: int - bdf: str - link_metrics: XgmiLinkMetrics - - -class XgmiLinks(BaseModel): - gpu: int - bdf: str - link_status: list[LinkStatusTable] - - class CoherentTable(Enum): COHERANT = "C" NON_COHERANT = "NC" SELF = "SELF" -# TOPO - - -class TopoLink(BaseModel): - gpu: int - bdf: str - weight: int - link_status: AccessTable - link_type: LinkTypes - num_hops: int - bandwidth: str - # The below fields are sometimes missing, so we use Optional - coherent: CoherentTable | None = None - atomics: AtomicsTable | None = None - dma: DmaTable | None = None - bi_dir: BiDirectionalTable | None = None - - @computed_field - @property - def bandwidth_from(self) -> int | None: - """Get the bandwidth from the link.""" - bw_split = self.bandwidth.split("-") - if len(bw_split) == 2: - return int(bw_split[0]) - else: - # If the bandwidth is not in the expected format, return None - return None - - @computed_field - @property - def bandwidth_to(self) -> int | None: - """Get the bandwidth to the link.""" - bw_split = self.bandwidth.split("-") - if len(bw_split) == 2: - return int(bw_split[1]) - else: - # If the bandwidth is not in the expected format, return None - return None - - -class Topo(BaseModel): - gpu: int - bdf: str - links: List[TopoLink] - - -# PROCESS DATA class ProcessMemoryUsage(BaseModel): gtt_mem: ValueUnit | None cpu_mem: ValueUnit | None @@ -731,7 +170,6 @@ class Fw(BaseModel): fw_list: List[FwListItem] -# AMD SMI LIST class AmdSmiListItem(BaseModel): gpu: int bdf: str @@ -741,56 +179,6 @@ class AmdSmiListItem(BaseModel): partition_id: int -# PAGES -class PageData(BaseModel): - page_address: int | str - page_size: int | str - status: str - - -class BadPages(BaseModel): - gpu: int - retired: str | PageData | list[PageData] - pending: str | PageData | list[PageData] - un_res: str | PageData | list[PageData] - - -class AmdSmiMetricPcieData(BaseModel): - "Data in pcie subfield of metrics command" - - width: NonNegativeInt - speed: NonNegativeFloat - bandwidth: Optional[NonNegativeFloat] = 0 - replay_count: Optional[int] = 0 - l0_to_recovery_count: Optional[int] = 0 - replay_roll_over_count: Optional[int] = 0 - nak_sent_count: Optional[int] = 0 - nak_received_count: Optional[int] = 0 - - -class AmdSmiMetricEccData(BaseModel): - "ECC info collected per ecc block" - - umc: EccData = EccData() - sdma: EccData = EccData() - gfx: EccData = EccData() - mmhub: EccData = EccData() - pcie_bif: EccData = EccData() - hdp: EccData = EccData() - xgmi_wafl: EccData = EccData() - - -class AmdSmiTstData(BaseModel): - "Summary of amdsmitst results, with list and count of passing/skipped/failed tests" - - passed_tests: list[str] = Field(default_factory=list) - skipped_tests: list[str] = Field(default_factory=list) - failed_tests: list[str] = Field(default_factory=list) - passed_test_count: int = 0 - skipped_test_count: int = 0 - failed_test_count: int = 0 - - class AmdSmiVersion(BaseModel): """Contains the versioning info for amd-smi""" @@ -820,7 +208,7 @@ class PartitionCurrent(BaseModel): memory: str | None = None accelerator_type: str | None = None accelerator_profile_index: str | int | None = None - partition_id: str | int | None = None # Right now this is a string but it looks like an int + partition_id: int | None = None class PartitionMemory(BaseModel): @@ -847,24 +235,11 @@ class PartitionProfiles(AmdSmiBaseModel): resources_shared: str | None = None -class PartitionResources(AmdSmiBaseModel): - """Partition Resources""" - - # This does not have gpu_id field for some reason. - # gpu_id: int - resource_index: str | None = None - resource_type: str | None = None - resource_instances: str | None = None - resources_shared: str | None = None - - -# Partition info class Partition(BaseModel): """Contains the partition info for amd-smi""" current_partition: list[PartitionCurrent] = Field(default_factory=list) memory_partition: list[PartitionMemory] = Field(default_factory=list) - # Right now partition_profiles and partition_resources is all N/A by amd-smi so placeholder dict until better defined partition_profiles: list[dict] = Field(default_factory=list) partition_resources: list[dict] = Field(default_factory=list) @@ -889,15 +264,7 @@ class AmdSmiData(DataModel): gpu_list: list[AmdSmiListItem] | None = Field(default_factory=list) partition: Partition | None = None process: list[Processes] | None = Field(default_factory=list) - topology: list[Topo] | None = Field(default_factory=list) - static: list[AmdSmiStatic] | None = Field(default_factory=list) - metric: list[AmdSmiMetric] | None = Field(default_factory=list) firmware: list[Fw] | None = Field(default_factory=list) - bad_pages: list[BadPages] | None = Field(default_factory=list) - xgmi_metric: list[XgmiMetrics] | None = Field(default_factory=list) - xgmi_link: list[XgmiLinks] | None = Field(default_factory=list) - cper_data: list[FileModel] | None = Field(default_factory=list) - amdsmitst_data: AmdSmiTstData def get_list(self, gpu: int) -> AmdSmiListItem | None: """Get the gpu list item for the given gpu id.""" @@ -908,24 +275,6 @@ def get_list(self, gpu: int) -> AmdSmiListItem | None: return item return None - def get_static(self, gpu: int) -> AmdSmiStatic | None: - """Get the static data for the given gpu id.""" - if self.static is None: - return None - for item in self.static: - if item.gpu == gpu: - return item - return None - - def get_metric(self, gpu: int) -> AmdSmiMetric | None: - """Get the metric data for the given gpu id.""" - if self.metric is None: - return None - for item in self.metric: - if item.gpu == gpu: - return item - return None - def get_process(self, gpu: int) -> Processes | None: """Get the process data for the given gpu id.""" if self.process is None: @@ -935,15 +284,6 @@ def get_process(self, gpu: int) -> Processes | None: return item return None - def get_topology(self, gpu: int) -> Topo | None: - """Get the topology data for the given gpu id.""" - if self.topology is None: - return None - for item in self.topology: - if item.gpu == gpu: - return item - return None - def get_firmware(self, gpu: int) -> Fw | None: """Get the firmware data for the given gpu id.""" if self.firmware is None: @@ -952,48 +292,3 @@ def get_firmware(self, gpu: int) -> Fw | None: if item.gpu == gpu: return item return None - - def get_bad_pages(self, gpu: int) -> BadPages | None: - """Get the bad pages data for the given gpu id.""" - if self.bad_pages is None: - return None - for item in self.bad_pages: - if item.gpu == gpu: - return item - return None - - @property - def amdsmimetricpcie_data(self) -> dict[int, AmdSmiMetricPcieData]: - """Get the pcie data for the given gpu id.""" - return { - item.gpu: AmdSmiMetricPcieData( - width=item.pcie.width if item.pcie.width else 0, - speed=float(item.pcie.speed.value) if item.pcie.speed else 0, - bandwidth=float(item.pcie.bandwidth.value) if item.pcie.bandwidth else 0, - replay_count=item.pcie.replay_count, - l0_to_recovery_count=item.pcie.l0_to_recovery_count, - replay_roll_over_count=item.pcie.replay_roll_over_count, - nak_sent_count=item.pcie.nak_sent_count, - nak_received_count=item.pcie.nak_received_count, - ) - for item in self.metric or [] - } - - @property - def amdsmimetricecc_data(self) -> dict[int, AmdSmiMetricEccData]: - """Get the ecc data for the given gpu id.""" - amdsmimetric_ret = {} - for item in self.metric or []: - if isinstance(item.ecc_blocks, str): - # If ecc_blocks is a string, it means no ECC data is available - continue - amdsmimetric_ret[item.gpu] = AmdSmiMetricEccData( - umc=item.ecc_blocks.get("UMC", EccData()), - sdma=item.ecc_blocks.get("SDMA", EccData()), - gfx=item.ecc_blocks.get("GFX", EccData()), - mmhub=item.ecc_blocks.get("MMHUB", EccData()), - pcie_bif=item.ecc_blocks.get("PCIE_BIF", EccData()), - hdp=item.ecc_blocks.get("HDP", EccData()), - xgmi_wafl=item.ecc_blocks.get("XGMI_WAFL", EccData()), - ) - return amdsmimetric_ret From b3b6352f60aa13e18d1f7af67f23c0e57e7c895d Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Sat, 13 Sep 2025 17:24:36 -0500 Subject: [PATCH 11/38] removed extra utest --- test/unit/plugin/test_amdsmi_collector.py | 309 ---------------------- 1 file changed, 309 deletions(-) delete mode 100644 test/unit/plugin/test_amdsmi_collector.py diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py deleted file mode 100644 index 22bb0ea8..00000000 --- a/test/unit/plugin/test_amdsmi_collector.py +++ /dev/null @@ -1,309 +0,0 @@ -import json -from pathlib import Path -from unittest.mock import MagicMock - -from errorscraper.config.config import SystemInteractionLevel -from errorscraper.datacollector.inband.amdsmi import ( - AmdSmiCollector, - AmdSmiData, - AmdSmiTstData, -) -from errorscraper.datamodel.inband.amdsmidata import ( - AmdSmiListItem, - AmdSmiMetric, - AmdSmiStatic, - AmdSmiVersion, - BadPages, - Fw, - Processes, - Topo, -) -from errorscraper.interfaces.inband import CommandArtifact, FileArtifact -from errorscraper.taskresult import TaskStatus -from scraper_test_base import ScraperTestBase - - -class TestAmdSmiCollection(ScraperTestBase): - """Test the amdsmi collector""" - - def setUp(self) -> None: - super().setUp() - json_text = (Path(self.fixtures_path) / "test_amdsmi_collector_mi355.json").read_text() - self.fixture_dict = json.loads(json_text) - self.test_collector = AmdSmiCollector( - system_info=self.system_info_mi300x, - system_interaction_level=SystemInteractionLevel.STANDARD, - ib_interface=self.ib_interface, - ) - - self.ib_interface.run_command = MagicMock() - self.ib_interface.run_command.side_effect = self.mock_with_fixture - - def mock_with_fixture(self, *args, **kwargs): - """Mock the interface to return the fixture data""" - for artifact in self.fixture_dict: - a_cmd = artifact["command"] - a_cmd_sudo_pass = f"sudo -S -p '' {kwargs['command']}" - a_cmd_sudo = f"sudo {kwargs['command']}" - if a_cmd == kwargs["command"] or a_cmd_sudo == a_cmd or a_cmd == a_cmd_sudo_pass: - return CommandArtifact(**artifact) - - def test_data_collection_config(self) -> None: - """Test checks for the tool being installed and that the check aborts with the proper - task status when amd-smi is not installed""" - self.ib_interface.run_command.return_value = CommandArtifact( - command="which amd-smi", - stdout="/usr/bin/amd-smi", - stderr="", - exit_code=0, - ) - is_installed = self.test_collector._check_amdsmi_installed() - self.assertTrue(is_installed) - self.ib_interface.run_command.side_effect = None - self.ib_interface.run_command.return_value = CommandArtifact( - command="which amd-smi", - stdout="", - stderr="command not found", - exit_code=1, - ) - is_installed = self.test_collector._check_amdsmi_installed() - self.assertFalse(is_installed) - - res, data = self.test_collector.collect_data() - self.assertEqual(res.status, TaskStatus.NOT_RAN) - self.assertIsNone(data) - - def test_amd_smi_data_and_commands(self) -> None: - """Test basic AMD SMI data collection that all methods return correct types""" - amd_smi_return_dict_cmds = { - "gpu_list": (self.test_collector.get_gpu_list, AmdSmiListItem), - "process": (self.test_collector.get_process, Processes), - "topology": (self.test_collector.get_topology, Topo), - "static": (self.test_collector.get_static, AmdSmiStatic), - "metric": (self.test_collector.get_metric, AmdSmiMetric), - "firmware": (self.test_collector.get_firmware, Fw), - "bad_pages": (self.test_collector.get_bad_pages, BadPages), - } - result_data = {} - self.test_collector.amd_smi_commands = self.test_collector.detect_amdsmi_commands() - for cmd_name, amd_smi_cmd_obj in amd_smi_return_dict_cmds.items(): - result_data[cmd_name] = amd_smi_cmd_obj[0]() - - data = amd_smi_cmd_obj[1](**result_data[cmd_name][0]) - self.assertIsInstance(data, amd_smi_cmd_obj[1]) - self.assertIsNotNone(result_data[cmd_name]) - - def test_amd_smi_mi325(self): - json_text = (Path(self.fixtures_path) / "test_amdsmi_collector_mi325.json").read_text() - self.fixture_dict = json.loads(json_text) - - res, data = self.test_collector.collect_data() - self.assertEqual(res.status, TaskStatus.OK) - self.assertIsInstance(data, AmdSmiData) - # Check - self.assertEqual(data.gpu_list[0].bdf, "0000:09:00.0") - self.assertEqual( - data.process[0].process_list[0].process_info.name, - "rvs", - ) - self.assertEqual( - data.process[0].process_list[0].process_info.pid, - 206506, - ) - self.assertEqual(data.get_topology(0).links[0].num_hops, 0) - self.assertEqual(data.get_static(0).asic.device_id, "0x74a5") - self.assertEqual(data.metric[0].pcie.width, 16) - self.assertEqual(data.firmware[0].fw_list[0].fw_version, "177") - self.assertEqual(data.bad_pages[0].retired, "No bad pages found.") - self.assertEqual(data.xgmi_link[0].bdf, "0000:09:00.0") - self.assertEqual(data.xgmi_metric[0].link_metrics.bit_rate.value, 32) - - def test_amd_smi_tst_data(self) -> None: - """Test the AMD SMI test data collection, ensure it can built list and counts of tests of each status""" - # Example takes pertinent snippets from actual full output - self.test_collector.system_interaction_level = SystemInteractionLevel.DISRUPTIVE - version_data_pass = AmdSmiVersion( - tool="AMDSMI Tool", - version="25.5.1+c11e6492", - amdsmi_library_version="25.5.1", - rocm_version="6.4.2", - amdgpu_version="6.12.12", - amd_hsmp_driver_version="N/A", - ) - version_data_old = AmdSmiVersion( - tool="AMDSMI Tool", - version="25.5.1+c11e6492", - amdsmi_library_version="25.5.1", - rocm_version="6.4.0", - amdgpu_version="6.12.12", - amd_hsmp_driver_version="N/A", - ) - - amdsmitst_data = self.test_collector.get_amdsmitst_data(version_data_old) - self.assertIsInstance(amdsmitst_data, AmdSmiTstData) - self.assertEqual(amdsmitst_data.passed_test_count, 0) - self.assertEqual(amdsmitst_data.failed_test_count, 0) - self.assertEqual(amdsmitst_data.skipped_test_count, 0) - amdsmitst_data = self.test_collector.get_amdsmitst_data(version_data_pass) - self.assertIsInstance(amdsmitst_data, AmdSmiTstData) - self.assertEqual(amdsmitst_data.passed_test_count, 3) - self.assertEqual(amdsmitst_data.failed_test_count, 2) - self.assertEqual(amdsmitst_data.skipped_test_count, 1) - self.assertTrue("amdsmitstReadOnly.TestVersionRead" in amdsmitst_data.passed_tests) - self.assertTrue("amdsmitstReadWrite.TestXGMIReadWrite" in amdsmitst_data.skipped_tests) - self.assertTrue("amdsmitstReadWrite.TestPerfDeterminism" in amdsmitst_data.failed_tests) - self.ib_interface.run_command.side_effect = None - - self.ib_interface.run_command.return_value = CommandArtifact( - command="/opt/rocm/share/amd_smi_tests/amdsmitsts/", - stdout="", - stderr="No such file or directory", - exit_code=255, - ) - amdsmitst_data = self.test_collector.get_amdsmitst_data(version_data_pass) - self.assertEqual(amdsmitst_data, AmdSmiTstData()) - - def test_task_body_bad_data_collected(self): - """Test the task body when the data collection fails""" - self.ib_interface.run_command.side_effect = [ - CommandArtifact( - command="which amd-smi", - stdout="/usr/bin/amd-smi", - stderr="", - exit_code=0, - ) - ] * 100 - res, data = self.test_collector.collect_data() - self.assertEqual(res.status, TaskStatus.ERRORS_DETECTED) - self.assertIsInstance(data, AmdSmiData) - self.assertEqual( - res.events[0].description, - "Error parsing command: `version --json` json data", - ) - - def test_amdsmi_collector_350(self): - """Test the AMD SMI collector with a MI350x fixture""" - json_text = (Path(self.fixtures_path) / "test_amdsmi_collector_mi350.json").read_text() - self.fixture_dict = json.loads(json_text) - fixture_tar_file = Path(self.fixtures_path) / "amd_smi_cper.tar.gz" - with open(fixture_tar_file, "rb") as f: - tar_bytes = f.read() - self.ib_interface.read_file.return_value = FileArtifact( - filename="amd_smi_cper.tar.gz", - contents=tar_bytes, - ) - - res, data = self.test_collector.collect_data() - self.assertEqual(res.status, TaskStatus.OK) - self.assertIsInstance(data, AmdSmiData) - self.assertIsNotNone(data.gpu_list) - self.assertIsNotNone(data.process) - self.assertIsNotNone(data.topology) - self.assertIsNotNone(data.static) - self.assertIsNotNone(data.metric) - self.assertIsNotNone(data.firmware) - self.assertIsNotNone(data.bad_pages) - self.assertIsNotNone(data.xgmi_metric) - self.assertIsNotNone(data.xgmi_link) - self.assertIsNotNone(data.cper_data) - - def test_amdsmi_cper_collection(self): - """Test the AMD SMI collector with a MI350x fixture for CPER collection""" - fixture_tar_file = Path(self.fixtures_path) / "amd_smi_cper.tar.gz" - self.ib_interface.run_command.side_effect = [ - CommandArtifact( - command="which amd-smi", - stdout="/usr/bin/amd-smi", - stderr="", - exit_code=0, - ), - CommandArtifact( - command="sudo -S -p '' amd-smi ras --cper --folder=/tmp/amd_smi_cper --afid", - stdout="""Dumping CPER file header entries in folder /tmp/cpers - timestamp gpu_id severity file_name - 2025/06/17 21:45:30 0 corrected corrected_0.cper - """, - stderr="", - exit_code=0, - ), - CommandArtifact( - command="tar -czf /tmp/amd_smi_cper.tar.gz -C /tmp/amd_smi_cper .", - stdout="tar", - stderr="", - exit_code=0, - ), - ] - # read tar file into bytes - with open(fixture_tar_file, "rb") as f: - tar_bytes = f.read() - self.ib_interface.read_file.return_value = FileArtifact( - filename="amd_smi_cper.tar.gz", - contents=tar_bytes, - ) - self.test_collector.amd_smi_commands = {"ras"} - amd_data = self.test_collector.get_cper_data() - - self.assertEqual(len(amd_data), 1) - self.assertEqual(len(amd_data[0].file_contents), 4256) - self.assertEqual(amd_data[0].file_name, "./corrected_0.cper") - - def test_amdsmi_cper_no_cpers(self): - """Test the AMD SMI collector with a MI350x fixture for CPER collection with no CPER data""" - self.ib_interface.run_command.side_effect = [ - CommandArtifact( - command="which amd-smi", - stdout="/usr/bin/amd-smi", - stderr="", - exit_code=0, - ), - CommandArtifact( - command="mkdir -p /tmp/amd_smi_cper && rm /tmp/amd_smi_cper/*.cper && rm /tmp/amd_smi_cper/*.json", - stdout="", - stderr="", - exit_code=0, - ), - CommandArtifact( - command="sudo -S -p '' amd-smi ras --cper --folder=/tmp/amd_smi_cper --afid", - stdout="""Dumping CPER file header entries in folder /tmp/cpers - timestamp gpu_id severity file_name - - """, - stderr="", - exit_code=0, - ), - ] - self.test_collector.amd_smi_commands = {"ras"} - - amd_data = self.test_collector.get_cper_data() - self.assertEqual(len(amd_data), 0) - - def test_detect_amdsmi_commands(self): - """Test the detection of AMD SMI commands""" - self.ib_interface.run_command.side_effect = [ - CommandArtifact( - command="amd-smi -h", - stdout="AMD System Management Interface | Version: 25.3.0+ede62f2 | ROCm version: 6.4.0 |\nPlatform: Linux Baremetal\n\noptions:\n -h, --help show this help message and exit\n\nAMD-SMI Commands:\n Descriptions:\n version Display version information\n list List GPU information\n static Gets static information about the specified GPU\n firmware (ucode) Gets firmware information about the specified GPU\n bad-pages Gets bad page information about the specified GPU\n metric Gets metric/performance information about the specified GPU\n process Lists general process information running on the specified GPU\n event Displays event information for the given GPU\n topology Displays topology information of the devices\n set Set options for devices\n reset Reset options for devices\n monitor (dmon) Monitor metrics for target devices\n xgmi Displays xgmi information of the devices\n partition Displays partition information of the devices\n", - stderr="", - exit_code=0, - ), - ] - commands = self.test_collector.detect_amdsmi_commands() - self.assertEqual( - commands, - { - "version", - "list", - "static", - "firmware", - "bad-pages", - "metric", - "process", - "event", - "topology", - "set", - "reset", - "monitor", - "xgmi", - "partition", - }, - ) From 0d86d3f02ff261d38f893f9d26a1e5f74eeb30ce Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Mon, 15 Sep 2025 13:30:02 -0500 Subject: [PATCH 12/38] utest + import check --- .../plugins/inband/amdsmi/amdsmi_collector.py | 118 ++++++++++-------- .../plugins/inband/amdsmi/amdsmi_plugin.py | 6 +- .../plugins/inband/amdsmi/amdsmidata.py | 2 +- 3 files changed, 71 insertions(+), 55 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index cd3d6130..04b96751 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -25,32 +25,38 @@ ############################################################################### from typing import TypeVar -import amdsmi -from amdsmi import ( - AmdSmiException, - AmdSmiInitFlags, - amdsmi_get_fw_info, - amdsmi_get_gpu_compute_partition, - amdsmi_get_gpu_compute_process_info, - amdsmi_get_gpu_device_bdf, - amdsmi_get_gpu_device_uuid, - amdsmi_get_gpu_kfd_info, - amdsmi_get_gpu_memory_partition, - amdsmi_get_gpu_process_list, - amdsmi_get_lib_version, - amdsmi_get_processor_handles, - amdsmi_get_rocm_version, - amdsmi_init, - amdsmi_shut_down, -) from pydantic import BaseModel, ValidationError +try: + import amdsmi # noqa: F401 + from amdsmi import ( + AmdSmiException, + AmdSmiInitFlags, + amdsmi_get_fw_info, + amdsmi_get_gpu_compute_partition, + amdsmi_get_gpu_compute_process_info, + amdsmi_get_gpu_device_bdf, + amdsmi_get_gpu_device_uuid, + amdsmi_get_gpu_kfd_info, + amdsmi_get_gpu_memory_partition, + amdsmi_get_gpu_process_list, + amdsmi_get_lib_version, + amdsmi_get_processor_handles, + amdsmi_get_rocm_version, + amdsmi_init, + amdsmi_shut_down, + ) + + _AMDSMI_IMPORT_ERROR = None +except Exception as _e: + _AMDSMI_IMPORT_ERROR = _e + from nodescraper.base.inbandcollectortask import InBandDataCollector from nodescraper.connection.inband.inband import CommandArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult from nodescraper.plugins.inband.amdsmi.amdsmidata import ( - AmdSmiData, + AmdSmiDataModel, AmdSmiListItem, AmdSmiVersion, Fw, @@ -62,14 +68,14 @@ T = TypeVar("T", bound=BaseModel) -class AmdSmiCollector(InBandDataCollector[AmdSmiData, None]): +class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): """class for collection of inband tool amd-smi data.""" AMD_SMI_EXE = "amd-smi" SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} - DATA_MODEL = AmdSmiData + DATA_MODEL = AmdSmiDataModel def _get_handles(self): """Get processor handles.""" @@ -128,13 +134,13 @@ def build_amdsmi_sub_data( ) return None - def _get_amdsmi_data(self) -> AmdSmiData | None: - """Returns amd-smi tool data formatted as a AmdSmiData object + def _get_amdsmi_data(self) -> AmdSmiDataModel | None: + """Returns amd-smi tool data formatted as a AmdSmiDataModel object Returns None if tool is not installed or if drivers are not loaded Returns: - Union[AmdSmiData, None]: AmdSmiData object or None on failure + Union[AmdSmiDataModel, None]: AmdSmiDataModel object or None on failure """ try: version = self._get_amdsmi_version() @@ -158,7 +164,7 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: firmware_model = self.build_amdsmi_sub_data(Fw, firmware) gpu_list_model = self.build_amdsmi_sub_data(AmdSmiListItem, gpu_list) try: - amd_smi_data = AmdSmiData( + amd_smi_data = AmdSmiDataModel( version=version, gpu_list=gpu_list_model, process=process_data_model, @@ -169,7 +175,7 @@ def _get_amdsmi_data(self) -> AmdSmiData | None: self.logger.warning("Validation err: %s", e) self._log_event( category=EventCategory.APPLICATION, - description="Failed to build AmdSmiData model", + description="Failed to build AmdSmiDataModel model", data=get_exception_details(e), priority=EventPriority.ERROR, ) @@ -265,31 +271,29 @@ def get_process(self) -> list[dict] | None: pids = amdsmi_get_gpu_process_list(h) or [] plist = [] for pid in pids: - try: - pinfo = self._smi_try(amdsmi_get_gpu_compute_process_info(h, pid)) or {} - plist.append( - { - "process_info": { - "name": pinfo.get("name", str(pid)), - "pid": int(pid), - "memory_usage": { - "gtt_mem": {"value": pinfo.get("gtt_mem", 0), "unit": "B"}, - "cpu_mem": {"value": pinfo.get("cpu_mem", 0), "unit": "B"}, - "vram_mem": { - "value": pinfo.get("vram_mem", 0), - "unit": "B", - }, - }, - "mem_usage": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, - "usage": { - "gfx": {"value": pinfo.get("gfx", 0), "unit": "%"}, - "enc": {"value": pinfo.get("enc", 0), "unit": "%"}, - }, - } - } - ) - except AmdSmiException: + pinfo = self._smi_try(amdsmi_get_gpu_compute_process_info, h, pid, default=None) + if not isinstance(pinfo, dict): plist.append({"process_info": str(pid)}) + continue + + plist.append( + { + "process_info": { + "name": pinfo.get("name", str(pid)), + "pid": int(pid), + "memory_usage": { + "gtt_mem": {"value": pinfo.get("gtt_mem", 0), "unit": "B"}, + "cpu_mem": {"value": pinfo.get("cpu_mem", 0), "unit": "B"}, + "vram_mem": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, + }, + "mem_usage": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, + "usage": { + "gfx": {"value": pinfo.get("gfx", 0), "unit": "%"}, + "enc": {"value": pinfo.get("enc", 0), "unit": "%"}, + }, + } + } + ) out.append({"gpu": idx, "process_list": plist}) except AmdSmiException as e: self._log_event( @@ -458,7 +462,19 @@ def _smi_try(self, fn, *a, default=None, **kw): def collect_data( self, args=None, - ) -> tuple[TaskResult, AmdSmiData | None]: + ) -> tuple[TaskResult, AmdSmiDataModel | None]: + + if _AMDSMI_IMPORT_ERROR is not None: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to import amdsmi Python bindings", + data={"exception": get_exception_traceback(_AMDSMI_IMPORT_ERROR)}, + priority=EventPriority.ERROR, + console_log=True, + ) + self.result.status = ExecutionStatus.NOT_RAN + return self.result, None + try: amdsmi_init(AmdSmiInitFlags.INIT_AMD_GPUS) amd_smi_data = self._get_amdsmi_data() # fails ras not found diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py index 66e011ef..77e5c735 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py @@ -26,12 +26,12 @@ from nodescraper.base import InBandDataPlugin from .amdsmi_collector import AmdSmiCollector -from .amdsmidata import AmdSmiData +from .amdsmidata import AmdSmiDataModel -class AmdsmiPlugin(InBandDataPlugin[AmdSmiData, None, None]): +class AmdsmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, None]): """Plugin for collection and analysis of amdsmi data""" - DATA_MODEL = AmdSmiData + DATA_MODEL = AmdSmiDataModel COLLECTOR = AmdSmiCollector diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 341f9333..18b0b980 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -244,7 +244,7 @@ class Partition(BaseModel): partition_resources: list[dict] = Field(default_factory=list) -class AmdSmiData(DataModel): +class AmdSmiDataModel(DataModel): """Data model for amd-smi data. Optionals are used to allow for the data to be missing, From c7b3344ecb0ddbd3f30161cddf51c71fb106916f Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Mon, 15 Sep 2025 15:50:31 -0500 Subject: [PATCH 13/38] adding analyzer --- nodescraper/plugins/inband/amdsmi/__init__.py | 4 +- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 249 ++++++++++++++++++ .../plugins/inband/amdsmi/amdsmi_collector.py | 235 ++++++++++++++--- .../plugins/inband/amdsmi/amdsmidata.py | 243 ++++++++++++++++- .../plugins/inband/amdsmi/analyzer_args.py | 50 ++++ test/unit/plugin/test_amdsmi_collector.py | 218 +++++++++++++++ 6 files changed, 963 insertions(+), 36 deletions(-) create mode 100644 nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py create mode 100644 nodescraper/plugins/inband/amdsmi/analyzer_args.py create mode 100644 test/unit/plugin/test_amdsmi_collector.py diff --git a/nodescraper/plugins/inband/amdsmi/__init__.py b/nodescraper/plugins/inband/amdsmi/__init__.py index ec4a6f86..f117a9fd 100644 --- a/nodescraper/plugins/inband/amdsmi/__init__.py +++ b/nodescraper/plugins/inband/amdsmi/__init__.py @@ -23,6 +23,6 @@ # SOFTWARE. # ############################################################################### -from .amdsmi_plugin import AmdsmiPlugin +from .amdsmi_plugin import AmdSmiPlugin -__all__ = ["AmdsmiPlugin"] +__all__ = ["AmdSmiPlugin"] diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py new file mode 100644 index 00000000..617d0f01 --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -0,0 +1,249 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + + +from nodescraper.enums import EventCategory, EventPriority +from nodescraper.interfaces import DataAnalyzer +from nodescraper.models import TaskResult + +from .amdsmidata import AmdSmiDataModel + + +class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): + """""" + + DATA_MODEL = AmdSmiDataModel + + def expected_gpu_processes( + self, processes_data: list[Processes] | None, max_num_processes: int + ): + """Check the number of GPU processes running. If the number of processes is greater than the expected + number of processes, log an error event""" + gpu_exceeds_num_processes: dict[int, int] = {} + if processes_data is None or len(processes_data) == 0: + self._log_event( + category=EventCategory.PLATFORM, + description="No GPU processes data available", + priority=EventPriority.WARNING, + data={"processes_data": processes_data}, + console_log=True, + ) + return + for process in processes_data: + if len(process.process_list) == 0 or isinstance( + process.process_list[0].process_info, str + ): + # Skip if there are no processes or the process info is a string which indicates no processes + continue + + process_count = len(process.process_list) # Number of processes for GPU + if process_count > max_num_processes: + # Log an error event if the number of processes is greater than the expected number log event + gpu_exceeds_num_processes[process.gpu] = process_count + + if gpu_exceeds_num_processes: + self._log_event( + category=EventCategory.PLATFORM, + description="Number of processes exceeds max processes", + priority=EventPriority.ERROR, + data={ + "gpu_exceeds_num_processes": gpu_exceeds_num_processes, + }, + console_log=True, + ) + + def check_expected_memory_partition_mode( + self, + partition_data: Partition | None, + expected_memory_partition_mode: str | None, + expected_compute_partition_mode: str | None, + ): + if partition_data is None: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI Partition data not available", + priority=EventPriority.WARNING, + ) + return + bad_memory_partition_mode_gpus = [] + for partition_current in partition_data.current_partition: + if ( + expected_memory_partition_mode is not None + and partition_current.memory != expected_memory_partition_mode + ) or ( + expected_compute_partition_mode is not None + and partition_current.accelerator_type != expected_compute_partition_mode + ): + bad_memory_partition_mode_gpus.append( + { + "gpu_id": partition_current.gpu_id, + "compute_partition_mode": partition_current.accelerator_type, + "memory_partition_mode": partition_current.memory, + } + ) + if bad_memory_partition_mode_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Partition Mode Mismatch", + priority=EventPriority.ERROR, + data={ + "actual_partition_data": bad_memory_partition_mode_gpus, + "expected_memory_partition_mode": expected_memory_partition_mode, + "expected_compute_partition_mode": expected_compute_partition_mode, + }, + ) + + def check_expected_memory_partition_mode( + self, + partition_data: Partition | None, + expected_memory_partition_mode: str | None, + expected_compute_partition_mode: str | None, + ): + if partition_data is None: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI Partition data not available", + priority=EventPriority.WARNING, + ) + return + bad_memory_partition_mode_gpus = [] + for partition_current in partition_data.current_partition: + if ( + expected_memory_partition_mode is not None + and partition_current.memory != expected_memory_partition_mode + ) or ( + expected_compute_partition_mode is not None + and partition_current.accelerator_type != expected_compute_partition_mode + ): + bad_memory_partition_mode_gpus.append( + { + "gpu_id": partition_current.gpu_id, + "compute_partition_mode": partition_current.accelerator_type, + "memory_partition_mode": partition_current.memory, + } + ) + if bad_memory_partition_mode_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Partition Mode Mismatch", + priority=EventPriority.ERROR, + data={ + "actual_partition_data": bad_memory_partition_mode_gpus, + "expected_memory_partition_mode": expected_memory_partition_mode, + "expected_compute_partition_mode": expected_compute_partition_mode, + }, + ) + + def check_pldm_version( + self, + amdsmi_fw_data: list[Fw] | None, + expected_pldm_version: str | None, + ): + """Check the PLDM version for all GPUs. If the PLDM version is not as expected, log an error event for which GPUs don't have a match""" + PLDM_STRING = "PLDM_BUNDLE" + if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI firmware data available", + priority=EventPriority.WARNING, + data={"amdsmi_fw_data": amdsmi_fw_data}, + ) + return + mismatched_gpus: list[int] = [] + pldm_missing_gpus: list[int] = [] + for fw_data in amdsmi_fw_data: + gpu = fw_data.gpu + for fw_info in fw_data.fw_list: + if PLDM_STRING == fw_info.fw_id and expected_pldm_version != fw_info.fw_version: + mismatched_gpus.append(gpu) + if PLDM_STRING == fw_info.fw_id: + break + else: + pldm_missing_gpus.append(gpu) + + if mismatched_gpus or pldm_missing_gpus: + self._log_event( + category=EventCategory.FW, + description="PLDM Version Mismatch", + priority=EventPriority.ERROR, + data={ + "mismatched_gpus": mismatched_gpus, + "pldm_missing_gpus": pldm_missing_gpus, + "expected_pldm_version": expected_pldm_version, + }, + ) + + def check_expected_memory_partition_mode( + self, + partition_data: Partition | None, + expected_memory_partition_mode: str | None, + expected_compute_partition_mode: str | None, + ): + if partition_data is None: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI Partition data not available", + priority=EventPriority.WARNING, + ) + return + bad_memory_partition_mode_gpus = [] + for partition_current in partition_data.current_partition: + if ( + expected_memory_partition_mode is not None + and partition_current.memory != expected_memory_partition_mode + ) or ( + expected_compute_partition_mode is not None + and partition_current.accelerator_type != expected_compute_partition_mode + ): + bad_memory_partition_mode_gpus.append( + { + "gpu_id": partition_current.gpu_id, + "compute_partition_mode": partition_current.accelerator_type, + "memory_partition_mode": partition_current.memory, + } + ) + if bad_memory_partition_mode_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Partition Mode Mismatch", + priority=EventPriority.ERROR, + data={ + "actual_partition_data": bad_memory_partition_mode_gpus, + "expected_memory_partition_mode": expected_memory_partition_mode, + "expected_compute_partition_mode": expected_compute_partition_mode, + }, + ) + + def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult: + + if args.expected_gpu_processes: + self.expected_gpu_processes(amdsmi_data.process, expected_gpu_processes) + if expected_memory_partition_mode or expected_compute_partition_mode: + self.check_expected_memory_partition_mode( + amdsmi_data.partition, + expected_memory_partition_mode, + expected_compute_partition_mode, + ) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 04b96751..2a592771 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -27,29 +27,23 @@ from pydantic import BaseModel, ValidationError -try: - import amdsmi # noqa: F401 - from amdsmi import ( - AmdSmiException, - AmdSmiInitFlags, - amdsmi_get_fw_info, - amdsmi_get_gpu_compute_partition, - amdsmi_get_gpu_compute_process_info, - amdsmi_get_gpu_device_bdf, - amdsmi_get_gpu_device_uuid, - amdsmi_get_gpu_kfd_info, - amdsmi_get_gpu_memory_partition, - amdsmi_get_gpu_process_list, - amdsmi_get_lib_version, - amdsmi_get_processor_handles, - amdsmi_get_rocm_version, - amdsmi_init, - amdsmi_shut_down, - ) - - _AMDSMI_IMPORT_ERROR = None -except Exception as _e: - _AMDSMI_IMPORT_ERROR = _e +_AMDSMI_SYMBOLS = ( + "AmdSmiException", + "AmdSmiInitFlags", + "amdsmi_get_fw_info", + "amdsmi_get_gpu_compute_partition", + "amdsmi_get_gpu_compute_process_info", + "amdsmi_get_gpu_device_bdf", + "amdsmi_get_gpu_device_uuid", + "amdsmi_get_gpu_kfd_info", + "amdsmi_get_gpu_memory_partition", + "amdsmi_get_gpu_process_list", + "amdsmi_get_lib_version", + "amdsmi_get_processor_handles", + "amdsmi_get_rocm_version", + "amdsmi_init", + "amdsmi_shut_down", +) from nodescraper.base.inbandcollectortask import InBandDataCollector from nodescraper.connection.inband.inband import CommandArtifact @@ -58,6 +52,7 @@ from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiListItem, + AmdSmiStatic, AmdSmiVersion, Fw, Partition, @@ -77,6 +72,33 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel + def _amdsmi_is_bound() -> bool: + """Check if symbol has already been added into globals""" + return all(name in globals() for name in ("amdsmi_init", "AmdSmiInitFlags")) + + def _bind_amdsmi_or_log(collector) -> bool: + """ + Try to import amdsmi and bind the symbols used by this module into globals(). + On failure, log an event and return False (caller should set NOT_RAN and exit). + """ + if _amdsmi_is_bound(): + return True + try: + mod = importlib.import_module("amdsmi") + g = globals() + for name in _AMDSMI_SYMBOLS: + g[name] = getattr(mod, name) + return True + except Exception as e: + collector._log_event( + category=EventCategory.APPLICATION, + description="Failed to import amdsmi Python bindings", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + return False + def _get_handles(self): """Get processor handles.""" try: @@ -148,6 +170,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: partition = self.get_partition() firmware = self.get_firmware() gpu_list = self.get_gpu_list() + amdsmi_static = self.get_static() except Exception as e: self._log_event( category=EventCategory.APPLICATION, @@ -163,6 +186,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: process_data_model = self.build_amdsmi_sub_data(Processes, processes) firmware_model = self.build_amdsmi_sub_data(Fw, firmware) gpu_list_model = self.build_amdsmi_sub_data(AmdSmiListItem, gpu_list) + amdsmi_static_model = self.build_amdsmi_sub_data(AmdSmiStatic, amdsmi_static) try: amd_smi_data = AmdSmiDataModel( version=version, @@ -459,25 +483,172 @@ def _smi_try(self, fn, *a, default=None, **kw): ) return default + def get_static(self) -> list[dict] | None: + devices = self._get_handles() + if not devices: + return [] + + _pcie_fn = globals().get("amdsmi_get_pcie_info", None) + + out: list[dict] = [] + + for idx, h in enumerate(devices): + board = self._smi_try(amdsmi_get_gpu_board_info, h, default={}) or {} + asic = self._smi_try(amdsmi_get_gpu_asic_info, h, default={}) or {} + bdf = self._smi_try(amdsmi_get_gpu_device_bdf, h, default="") or "" + _ = self._smi_try(amdsmi_get_gpu_device_uuid, h, default="") # uuid not used here + kfd = self._smi_try(amdsmi_get_gpu_kfd_info, h, default={}) or {} + + # ----------------------- + # Bus / PCIe + # ----------------------- + pcie = {} + if callable(_pcie_fn): + p = self._smi_try(_pcie_fn, h, default={}) or {} + if isinstance(p, dict): + max_w = p.get("max_link_width") + max_s = p.get("max_link_speed") + pcie_ver = p.get("pcie_version") or p.get("pcie_interface_version") + pcie = { + "bdf": bdf, + "max_pcie_width": ( + f"{max_w} x" if max_w not in (None, "", "N/A") else None + ), + "max_pcie_speed": ( + f"{max_s} GT/s" if max_s not in (None, "", "N/A") else None + ), + "pcie_interface_version": str(pcie_ver or ""), + "slot_type": str(p.get("slot_type", "")), + } + if not pcie: + pcie = { + "bdf": bdf, + "max_pcie_width": None, + "max_pcie_speed": None, + "pcie_interface_version": "", + "slot_type": "", + } + + # ----------------------- + # ASIC + # ----------------------- + asic_mapped = { + "market_name": str(asic.get("market_name") or asic.get("asic_name") or ""), + "vendor_id": str(asic.get("vendor_id", "")), + "vendor_name": str(asic.get("vendor_name", "")), + "subvendor_id": str(asic.get("subvendor_id", "")), + "device_id": str(asic.get("device_id", "")), + "subsystem_id": str(asic.get("subsystem_id", "")), + "rev_id": str(asic.get("rev_id", "")), + "asic_serial": str(asic.get("asic_serial", "")), + "oam_id": int(asic.get("oam_id", 0) or 0), + "num_compute_units": int(asic.get("num_compute_units", 0) or 0), + "target_graphics_version": str(asic.get("target_graphics_version", "")), + } + + # ----------------------- + # Board + # ----------------------- + board_mapped = { + "model_number": str( + board.get("model_number", "") or board.get("amdsmi_model_number", "") + ), + "product_serial": str(board.get("product_serial", "")), + "fru_id": str(board.get("fru_id", "")), + "product_name": str(board.get("product_name", "")), + "manufacturer_name": str(board.get("manufacturer_name", "")), + } + + # ----------------------- + # VBIOS + # ----------------------- + vbios = None + vb = {} + for k in ("vbios_name", "vbios_build_date", "vbios_part_number", "vbios_version"): + if k in board: + vb[k] = board[k] + if vb: + vbios = { + "name": str(vb.get("vbios_name", "")), + "build_date": str(vb.get("vbios_build_date", "")), + "part_number": str(vb.get("vbios_part_number", "")), + "version": str(vb.get("vbios_version", "")), + } + + # ----------------------- + # NUMA (from KFD) + # ----------------------- + if isinstance(kfd, dict): + try: + numa_node = int(kfd.get("node_id", 0) or 0) + except Exception: + numa_node = 0 + try: + affinity = int(kfd.get("cpu_affinity", 0) or 0) + except Exception: + affinity = 0 + else: + numa_node, affinity = 0, 0 + numa = {"node": numa_node, "affinity": affinity} + + # ----------------------- + # VRAM + # ----------------------- + vram_type = str(asic.get("vram_type", "") or "unknown") + vram_vendor = asic.get("vram_vendor") + vram_bits = asic.get("vram_bit_width") + vram_size_b = None + if asic.get("vram_size_bytes") is not None: + vram_size_b = int(asic["vram_size_bytes"]) + elif asic.get("vram_size_mb") is not None: + try: + vram_size_b = int(asic["vram_size_mb"]) * 1024 * 1024 + except Exception: + vram_size_b = None + + vram = { + "type": vram_type, + "vendor": None if vram_vendor in (None, "", "N/A") else str(vram_vendor), + "size": (f"{vram_size_b} B" if isinstance(vram_size_b, int) else None), + "bit_width": (f"{vram_bits} bit" if isinstance(vram_bits, (int, float)) else None), + "max_bandwidth": None, + } + + out.append( + { + "gpu": idx, + "asic": asic_mapped, + "bus": pcie, + "vbios": vbios, + "limit": None, # not available via API + "driver": None, + "board": board_mapped, + "ras": None, + "soc_pstate": soc_pstate, + "xgmi_plpd": xgmi_plpd, + "process_isolation": process_isolation, + "numa": numa, + "vram": vram, + "cache_info": cache_info, + "partition": part, + "clock": clock, + } + ) + + return out + def collect_data( self, args=None, ) -> tuple[TaskResult, AmdSmiDataModel | None]: - if _AMDSMI_IMPORT_ERROR is not None: - self._log_event( - category=EventCategory.APPLICATION, - description="Failed to import amdsmi Python bindings", - data={"exception": get_exception_traceback(_AMDSMI_IMPORT_ERROR)}, - priority=EventPriority.ERROR, - console_log=True, - ) + if not _bind_amdsmi_or_log(self): self.result.status = ExecutionStatus.NOT_RAN return self.result, None try: amdsmi_init(AmdSmiInitFlags.INIT_AMD_GPUS) - amd_smi_data = self._get_amdsmi_data() # fails ras not found + amd_smi_data = self._get_amdsmi_data() if amd_smi_data is None: return self.result, None diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 18b0b980..b744fa71 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -68,11 +68,62 @@ def __init__(self, **data): class ValueUnit(BaseModel): - """A model for a value with a unit.""" + """A model for a value with a unit. - value: int | str | float + Accepts: + - dict: {"value": 123, "unit": "W"} + - number: 123 -> unit="" + - string with number+unit: "123 W" -> {"value": 123, "unit": "W"} + - "N/A" / "NA" / "" / None -> None + """ + + value: int | float | str unit: str = "" + @model_validator(mode="before") + @classmethod + def _coerce(cls, v): + # treat N/A as None + def na(x) -> bool: + return x is None or (isinstance(x, str) and x.strip().upper() in {"N/A", "NA", ""}) + + if na(v): + return None + + # Dict form: normalize value and possibly extract unit + if isinstance(v, dict): + val = v.get("value") + unit = v.get("unit", "") + if na(val): + return None + if isinstance(val, str): + m = _NUM_UNIT_RE.match(val.strip()) + if m and not unit: + num, u = m.groups() + unit = u or unit or "" + val = float(num) if "." in num else int(num) + return {"value": val, "unit": unit} + + # numbers + if isinstance(v, (int, float)): + return {"value": v, "unit": ""} + + if isinstance(v, str): + s = v.strip() + m = _NUM_UNIT_RE.match(s) + if m: + num, unit = m.groups() + val = float(num) if "." in num else int(num) + return {"value": val, "unit": unit or ""} + return {"value": s, "unit": ""} + + return v + + @field_validator("unit") + @classmethod + def _clean_unit(cls, u): + return "" if u is None else str(u).strip() + class EccState(Enum): ENABLED = "ENABLED" @@ -244,6 +295,184 @@ class Partition(BaseModel): partition_resources: list[dict] = Field(default_factory=list) +### STATIC DATA ### +class StaticAsic(BaseModel): + market_name: str + vendor_id: str + vendor_name: str + subvendor_id: str + device_id: str + subsystem_id: str + rev_id: str + asic_serial: str + oam_id: int + num_compute_units: int + target_graphics_version: str + + +class StaticBus(AmdSmiBaseModel): + bdf: str + max_pcie_width: ValueUnit + max_pcie_speed: ValueUnit + pcie_interface_version: str + slot_type: str + + +class StaticVbios(BaseModel): + name: str + build_date: str + part_number: str + version: str + + +class StaticLimit(AmdSmiBaseModel): + max_power: ValueUnit | None + min_power: ValueUnit | None + socket_power: ValueUnit | None + slowdown_edge_temperature: ValueUnit | None + slowdown_hotspot_temperature: ValueUnit | None + slowdown_vram_temperature: ValueUnit | None + shutdown_edge_temperature: ValueUnit | None + shutdown_hotspot_temperature: ValueUnit | None + shutdown_vram_temperature: ValueUnit | None + na_validator = field_validator( + "max_power", + "min_power", + "socket_power", + "slowdown_edge_temperature", + "slowdown_hotspot_temperature", + "slowdown_vram_temperature", + "shutdown_edge_temperature", + "shutdown_hotspot_temperature", + "shutdown_vram_temperature", + mode="before", + )(na_to_none) + + +class StaticDriver(BaseModel): + name: str + version: str + + +class StaticBoard(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + + amdsmi_model_number: str = Field( + alias="model_number" + ) # Model number is a reserved keyword for pydantic + product_serial: str + fru_id: str + product_name: str + manufacturer_name: str + + +class StaticRas(BaseModel): + eeprom_version: str + parity_schema: EccState + single_bit_schema: EccState + double_bit_schema: EccState + poison_schema: EccState + ecc_block_state: dict[str, EccState] + + +class StaticPartition(BaseModel): + # The name for compute_partition has changed we will support both for now + + compute_partition: str = Field( + validation_alias=AliasChoices("compute_partition", "accelerator_partition") + ) + memory_partition: str + partition_id: int + + +class StaticPolicy(BaseModel): + policy_id: int + policy_description: str + + +class StaticSocPstate(BaseModel): + num_supported: int + current_id: int + policies: List[StaticPolicy] + + +class StaticXgmiPlpd(BaseModel): + num_supported: int + current_id: int + plpds: List[StaticPolicy] + + +class StaticNuma(BaseModel): + node: int + affinity: int + + +class StaticVram(AmdSmiBaseModel): + type: str + vendor: str | None + size: ValueUnit | None + bit_width: ValueUnit | None + max_bandwidth: ValueUnit | None = None + na_validator = field_validator("vendor", "size", "bit_width", "max_bandwidth", mode="before")( + na_to_none + ) + + +class StaticCacheInfoItem(AmdSmiBaseModel): + cache: ValueUnit + cache_properties: List[str] + cache_size: ValueUnit | None + cache_level: ValueUnit + max_num_cu_shared: ValueUnit + num_cache_instance: ValueUnit + na_validator = field_validator("cache_size", mode="before")(na_to_none) + + +class StaticFrequencyLevels(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + + Level_0: str = Field(..., alias="Level 0") + Level_1: str | None = Field(default=None, alias="Level 1") + Level_2: str | None = Field(default=None, alias="Level 2") + + +class StaticClockData(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + ) + frequency_levels: StaticFrequencyLevels + + current_level: int | None = Field(..., alias="current level") + na_validator = field_validator("current_level", mode="before")(na_to_none) + + +class AmdSmiStatic(BaseModel): + gpu: int + asic: StaticAsic + bus: StaticBus + vbios: StaticVbios | None + limit: StaticLimit | None + driver: StaticDriver + board: StaticBoard + ras: StaticRas + soc_pstate: StaticSocPstate | None + xgmi_plpd: StaticXgmiPlpd | None + process_isolation: str + numa: StaticNuma + vram: StaticVram + cache_info: List[StaticCacheInfoItem] + partition: StaticPartition | None = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ + clock: dict[str, StaticClockData | None] | None = None + na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) + na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( + na_to_none + ) + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. @@ -265,6 +494,7 @@ class AmdSmiDataModel(DataModel): partition: Partition | None = None process: list[Processes] | None = Field(default_factory=list) firmware: list[Fw] | None = Field(default_factory=list) + static: list[AmdSmiStatic] | None = Field(default_factory=list) def get_list(self, gpu: int) -> AmdSmiListItem | None: """Get the gpu list item for the given gpu id.""" @@ -292,3 +522,12 @@ def get_firmware(self, gpu: int) -> Fw | None: if item.gpu == gpu: return item return None + + def get_static(self, gpu: int) -> AmdSmiStatic | None: + """Get the static data for the given gpu id.""" + if self.static is None: + return None + for item in self.static: + if item.gpu == gpu: + return item + return None diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py new file mode 100644 index 00000000..622e3f59 --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -0,0 +1,50 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from pydantic import Field, field_validator + +from nodescraper.models import AnalyzerArgs +from nodescraper.plugins.inband.os.osdata import OsDataModel + + +class AmdSmiAnalyzerArgs(AnalyzerArgs): + + + @classmethod + def build_from_model(cls, datamodel: OsDataModel) -> "AmdSmiAnalyzerArgs": + """build analyzer args from data model + + Args: + datamodel (AmdSmiDataModel): data model for plugin + + Returns: + AmdSmiAnalyzerArgs: instance of analyzer args class + """ + return cls(expected_gpu_processes= + expected_max_power + expected_driver_version + expected_memory_partition_mode + expected_compute_partition_mode + expected_pldm_version diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py new file mode 100644 index 00000000..7bdbf82d --- /dev/null +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -0,0 +1,218 @@ +import types + +import pytest +from pydantic import BaseModel + +import nodescraper.plugins.inband.amdsmi.amdsmi_collector as mod +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector + + +@pytest.fixture +def collector(system_info, conn_mock): + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + c._events = [] + + def _log_event(**kwargs): + c._events.append(kwargs) + + c._log_event = _log_event + c.result = types.SimpleNamespace(status=None) + c.logger = types.SimpleNamespace( + log=lambda *a, **k: None, + warning=lambda *a, **k: None, + info=lambda *a, **k: None, + error=lambda *a, **k: None, + ) + + return c + + +class FakeAmdSmiException(Exception): + """Stand-in for amdsmi.AmdSmiException.""" + + +def set_handles(monkeypatch, handles): + monkeypatch.setattr(mod, "amdsmi_get_processor_handles", lambda: handles) + + +def test_get_handles_success(monkeypatch, collector): + handles = ["h0", "h1"] + set_handles(monkeypatch, handles) + assert collector._get_handles() == handles + assert collector._events == [] + + +def test_get_amdsmi_version(monkeypatch, collector): + monkeypatch.setattr(mod, "amdsmi_get_lib_version", lambda: "25.3.0") + monkeypatch.setattr(mod, "amdsmi_get_rocm_version", lambda: "6.4.0") + v = collector._get_amdsmi_version() + assert v is not None + assert v.version == "25.3.0" + assert v.rocm_version == "6.4.0" + + +def test_get_gpu_list_with_compute_partition(monkeypatch, collector): + handles = ["h0", "h1"] + set_handles(monkeypatch, handles) + monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) + + calls = { + "bdf": {"h0": "0000:01:00.0", "h1": "0001:01:00.0"}, + "uuid": {"h0": "U0", "h1": "U1"}, + "kfd": {"h0": {"kfd_id": "7", "node_id": 3}, "h1": {}}, + "cp": {"h0": {"partition_id": "2"}, "h1": {"partition_id": 0}}, + "mp": {"h0": {}, "h1": {}}, + } + + monkeypatch.setattr(mod, "amdsmi_get_gpu_device_bdf", lambda h: calls["bdf"][h]) + monkeypatch.setattr(mod, "amdsmi_get_gpu_device_uuid", lambda h: calls["uuid"][h]) + monkeypatch.setattr(mod, "amdsmi_get_gpu_kfd_info", lambda h: calls["kfd"][h]) + monkeypatch.setattr(mod, "amdsmi_get_gpu_compute_partition", lambda h: calls["cp"][h]) + monkeypatch.setattr(mod, "amdsmi_get_gpu_memory_partition", lambda h: calls["mp"][h]) + + out = collector.get_gpu_list() + assert out == [ + { + "gpu": 0, + "bdf": "0000:01:00.0", + "uuid": "U0", + "kfd_id": 7, + "node_id": 3, + "partition_id": 2, + }, + { + "gpu": 1, + "bdf": "0001:01:00.0", + "uuid": "U1", + "kfd_id": 0, + "node_id": 0, + "partition_id": 0, + }, + ] + + +def test_get_gpu_list_fallback_to_memory_partition(monkeypatch, collector): + handles = ["h0"] + set_handles(monkeypatch, handles) + monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) + + monkeypatch.setattr(mod, "amdsmi_get_gpu_device_bdf", lambda h: "0000:01:00.0") + monkeypatch.setattr(mod, "amdsmi_get_gpu_device_uuid", lambda h: "U0") + monkeypatch.setattr(mod, "amdsmi_get_gpu_kfd_info", lambda h: {"kfd_id": 1, "node_id": "9"}) + + def raise_cp(h): + raise FakeAmdSmiException(2) + + monkeypatch.setattr(mod, "amdsmi_get_gpu_compute_partition", raise_cp) + monkeypatch.setattr( + mod, "amdsmi_get_gpu_memory_partition", lambda h: {"current_partition_id": "4"} + ) + + out = collector.get_gpu_list() + assert out[0]["partition_id"] == 4 + + +def test_get_process_mixed(monkeypatch, collector): + handles = ["h0"] + set_handles(monkeypatch, handles) + monkeypatch.setattr(mod, "amdsmi_get_gpu_process_list", lambda h: [111, 222]) + + def get_info(h, pid): + if pid == 111: + return {"name": "proc111", "vram_mem": 42, "gtt_mem": 1, "cpu_mem": 2} + raise FakeAmdSmiException(2) + + monkeypatch.setattr(mod, "amdsmi_get_gpu_compute_process_info", get_info) + monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) + + out = collector.get_process() + assert out and out[0]["gpu"] == 0 + plist = out[0]["process_list"] + assert plist[0]["process_info"]["name"] == "proc111" + assert plist[1]["process_info"] == "222" + + +def test_get_partition(monkeypatch, collector): + handles = ["h0", "h1"] + set_handles(monkeypatch, handles) + monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) + + monkeypatch.setattr( + mod, "amdsmi_get_gpu_compute_partition", lambda h: {"memory": "X", "partition_id": 1} + ) + monkeypatch.setattr( + mod, + "amdsmi_get_gpu_memory_partition", + lambda h: {"current_partition_id": 1, "memory_partition_caps": [1, 2]}, + ) + + out = collector.get_partition() + assert "current_partition" in out and len(out["current_partition"]) == 2 + assert "memory_partition" in out and len(out["memory_partition"]) == 2 + + +def test_get_firmware_various_shapes(monkeypatch, collector): + handles = ["h0", "h1", "h2"] + set_handles(monkeypatch, handles) + monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) + + fw_map = { + "h0": [{"fw_id": "SMU", "fw_version": "1.2.3"}, {"fw_name": "VBIOS", "version": "abc"}], + "h1": {"fw_list": [{"name": "PMFW", "ver": "9.9"}]}, + "h2": {"SMU": "4.5.6", "XGMI": "7.8.9"}, + } + monkeypatch.setattr(mod, "amdsmi_get_fw_info", lambda h: fw_map[h]) + + out = collector.get_firmware() + assert out and len(out) == 3 + assert out[0]["fw_list"][0] == {"fw_id": "SMU", "fw_version": "1.2.3"} + assert out[0]["fw_list"][1] == {"fw_id": "VBIOS", "fw_version": "abc"} + assert out[1]["fw_list"][0]["fw_id"] in ("PMFW", "name", "") + ids = {e["fw_id"] for e in out[2]["fw_list"]} + assert {"SMU", "XGMI"}.issubset(ids) + + +def test_smi_try_not_supported(monkeypatch, collector): + monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) + + def fn(): + raise FakeAmdSmiException(2) + + ret = collector._smi_try(fn, default="X") + assert ret == "X" + assert any("not supported" in e["description"] for e in collector._events) + + +def test_collect_data(monkeypatch, collector): + init_called = [] + shut_called = [] + + monkeypatch.setattr(mod, "amdsmi_init", lambda *a, **k: init_called.append(True)) + monkeypatch.setattr(mod, "amdsmi_shut_down", lambda *a, **k: shut_called.append(True)) + monkeypatch.setattr(AmdSmiCollector, "_get_amdsmi_data", lambda self: {"ok": True}) + + res, data = collector.collect_data() + assert data == {"ok": True} + assert init_called and shut_called + + +def test_build_amdsmi_sub_data(collector): + class M(BaseModel): + a: int + + out = collector.build_amdsmi_sub_data(M, [{"a": 1}, {"a": 2}]) + assert [m.a for m in out] == [1, 2] + + out2 = collector.build_amdsmi_sub_data(M, {"a": 5}) + assert out2.a == 5 + + out3 = collector.build_amdsmi_sub_data(M, ["not-a-dict"]) + assert out3 is None + assert any( + "Invalid data type for amd-smi sub data" in e["description"] for e in collector._events + ) From 359a36d068ef7113ff28d35374456b0f8dd1a6f9 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Sep 2025 11:33:28 -0500 Subject: [PATCH 14/38] updates --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 99 +++-------------- .../plugins/inband/amdsmi/amdsmi_collector.py | 101 ++++++++---------- .../plugins/inband/amdsmi/amdsmi_plugin.py | 2 +- .../plugins/inband/amdsmi/amdsmidata.py | 5 + .../plugins/inband/amdsmi/analyzer_args.py | 28 ++--- 5 files changed, 72 insertions(+), 163 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 617d0f01..a8aba0a0 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -29,7 +29,8 @@ from nodescraper.interfaces import DataAnalyzer from nodescraper.models import TaskResult -from .amdsmidata import AmdSmiDataModel +from .amdsmidata import AmdSmiDataModel, Fw, Partition, Processes +from .analyzer_args import AmdSmiAnalyzerArgs class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): @@ -37,6 +38,9 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel + L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3 # Thresholds defined in https://ontrack-internal.amd.com/browse/DCGPUSDV-1204, must be greated than this value to generate a error event + L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1 # Thresholds defined in https://ontrack-internal.amd.com/browse/SWLORC-10120, Must be greater than this value to generate a warning event + def expected_gpu_processes( self, processes_data: list[Processes] | None, max_num_processes: int ): @@ -75,88 +79,6 @@ def expected_gpu_processes( console_log=True, ) - def check_expected_memory_partition_mode( - self, - partition_data: Partition | None, - expected_memory_partition_mode: str | None, - expected_compute_partition_mode: str | None, - ): - if partition_data is None: - self._log_event( - category=EventCategory.PLATFORM, - description="No AMD SMI Partition data not available", - priority=EventPriority.WARNING, - ) - return - bad_memory_partition_mode_gpus = [] - for partition_current in partition_data.current_partition: - if ( - expected_memory_partition_mode is not None - and partition_current.memory != expected_memory_partition_mode - ) or ( - expected_compute_partition_mode is not None - and partition_current.accelerator_type != expected_compute_partition_mode - ): - bad_memory_partition_mode_gpus.append( - { - "gpu_id": partition_current.gpu_id, - "compute_partition_mode": partition_current.accelerator_type, - "memory_partition_mode": partition_current.memory, - } - ) - if bad_memory_partition_mode_gpus: - self._log_event( - category=EventCategory.PLATFORM, - description="Partition Mode Mismatch", - priority=EventPriority.ERROR, - data={ - "actual_partition_data": bad_memory_partition_mode_gpus, - "expected_memory_partition_mode": expected_memory_partition_mode, - "expected_compute_partition_mode": expected_compute_partition_mode, - }, - ) - - def check_expected_memory_partition_mode( - self, - partition_data: Partition | None, - expected_memory_partition_mode: str | None, - expected_compute_partition_mode: str | None, - ): - if partition_data is None: - self._log_event( - category=EventCategory.PLATFORM, - description="No AMD SMI Partition data not available", - priority=EventPriority.WARNING, - ) - return - bad_memory_partition_mode_gpus = [] - for partition_current in partition_data.current_partition: - if ( - expected_memory_partition_mode is not None - and partition_current.memory != expected_memory_partition_mode - ) or ( - expected_compute_partition_mode is not None - and partition_current.accelerator_type != expected_compute_partition_mode - ): - bad_memory_partition_mode_gpus.append( - { - "gpu_id": partition_current.gpu_id, - "compute_partition_mode": partition_current.accelerator_type, - "memory_partition_mode": partition_current.memory, - } - ) - if bad_memory_partition_mode_gpus: - self._log_event( - category=EventCategory.PLATFORM, - description="Partition Mode Mismatch", - priority=EventPriority.ERROR, - data={ - "actual_partition_data": bad_memory_partition_mode_gpus, - "expected_memory_partition_mode": expected_memory_partition_mode, - "expected_compute_partition_mode": expected_compute_partition_mode, - }, - ) - def check_pldm_version( self, amdsmi_fw_data: list[Fw] | None, @@ -239,11 +161,14 @@ def check_expected_memory_partition_mode( def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult: + if args is None: + args = AmdSmiAnalyzerArgs() + if args.expected_gpu_processes: - self.expected_gpu_processes(amdsmi_data.process, expected_gpu_processes) - if expected_memory_partition_mode or expected_compute_partition_mode: + self.expected_gpu_processes(amdsmi_data.process, args.expected_gpu_processes) + if args.expected_memory_partition_mode or args.expected_compute_partition_mode: self.check_expected_memory_partition_mode( amdsmi_data.partition, - expected_memory_partition_mode, - expected_compute_partition_mode, + args.expected_memory_partition_mode, + args.expected_compute_partition_mode, ) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 2a592771..114db515 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -23,28 +23,11 @@ # SOFTWARE. # ############################################################################### +import importlib from typing import TypeVar from pydantic import BaseModel, ValidationError -_AMDSMI_SYMBOLS = ( - "AmdSmiException", - "AmdSmiInitFlags", - "amdsmi_get_fw_info", - "amdsmi_get_gpu_compute_partition", - "amdsmi_get_gpu_compute_process_info", - "amdsmi_get_gpu_device_bdf", - "amdsmi_get_gpu_device_uuid", - "amdsmi_get_gpu_kfd_info", - "amdsmi_get_gpu_memory_partition", - "amdsmi_get_gpu_process_list", - "amdsmi_get_lib_version", - "amdsmi_get_processor_handles", - "amdsmi_get_rocm_version", - "amdsmi_init", - "amdsmi_shut_down", -) - from nodescraper.base.inbandcollectortask import InBandDataCollector from nodescraper.connection.inband.inband import CommandArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily @@ -72,25 +55,18 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel - def _amdsmi_is_bound() -> bool: - """Check if symbol has already been added into globals""" + def _amdsmi_is_bound(self) -> bool: return all(name in globals() for name in ("amdsmi_init", "AmdSmiInitFlags")) - def _bind_amdsmi_or_log(collector) -> bool: - """ - Try to import amdsmi and bind the symbols used by this module into globals(). - On failure, log an event and return False (caller should set NOT_RAN and exit). - """ - if _amdsmi_is_bound(): + def _bind_amdsmi_or_log(self) -> bool: + """Import amdsmi and store the module on self. Return True if ok.""" + if getattr(self, "_amdsmi", None) is not None: return True try: - mod = importlib.import_module("amdsmi") - g = globals() - for name in _AMDSMI_SYMBOLS: - g[name] = getattr(mod, name) + self._amdsmi = importlib.import_module("amdsmi") return True except Exception as e: - collector._log_event( + self._log_event( category=EventCategory.APPLICATION, description="Failed to import amdsmi Python bindings", data={"exception": get_exception_traceback(e)}, @@ -100,10 +76,9 @@ def _bind_amdsmi_or_log(collector) -> bool: return False def _get_handles(self): - """Get processor handles.""" try: - return amdsmi_get_processor_handles() - except amdsmi.AmdSmiException as e: + return self._amdsmi.amdsmi_get_processor_handles() + except self._amdsmi.AmdSmiException as e: self._log_event( category=EventCategory.APPLICATION, description="amdsmi_get_processor_handles failed", @@ -194,6 +169,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: process=process_data_model, partition=partition_data_model, firmware=firmware_model, + static=amdsmi_static_model, ) except ValidationError as e: self.logger.warning("Validation err: %s", e) @@ -210,9 +186,9 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: def _get_amdsmi_version(self) -> AmdSmiVersion | None: """Get lib/rocm versions.""" try: - lib_ver = amdsmi_get_lib_version() or "" - rocm_ver = amdsmi_get_rocm_version() or "" - except AmdSmiException as e: + lib_ver = self._amdsmi.amdsmi_get_lib_version() or "" + rocm_ver = self._amdsmi.amdsmi_get_rocm_version() or "" + except self._amdsmi.AmdSmiException as e: self._log_event( category=EventCategory.APPLICATION, description="Failed to read AMD SMI versions", @@ -260,16 +236,18 @@ def _to_int(x, default=0): return default for idx, h in enumerate(devices): - bdf = self._smi_try(amdsmi_get_gpu_device_bdf, h, default="") or "" - uuid = self._smi_try(amdsmi_get_gpu_device_uuid, h, default="") or "" - kfd = self._smi_try(amdsmi_get_gpu_kfd_info, h, default={}) or {} + bdf = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" + uuid = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" + kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} partition_id = 0 - cp = self._smi_try(amdsmi_get_gpu_compute_partition, h, default={}) or {} + cp = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} if isinstance(cp, dict) and cp.get("partition_id") is not None: partition_id = _to_int(cp.get("partition_id"), 0) else: - mp = self._smi_try(amdsmi_get_gpu_memory_partition, h, default={}) or {} + mp = ( + self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} + ) if isinstance(mp, dict) and mp.get("current_partition_id") is not None: partition_id = _to_int(mp.get("current_partition_id"), 0) @@ -292,10 +270,12 @@ def get_process(self) -> list[dict] | None: out: list[dict] = [] for idx, h in enumerate(devices): try: - pids = amdsmi_get_gpu_process_list(h) or [] + pids = self._amdsmi.amdsmi_get_gpu_process_list(h) or [] plist = [] for pid in pids: - pinfo = self._smi_try(amdsmi_get_gpu_compute_process_info, h, pid, default=None) + pinfo = self._smi_try( + self._amdsmi.amdsmi_get_gpu_compute_process_info, h, pid, default=None + ) if not isinstance(pinfo, dict): plist.append({"process_info": str(pid)}) continue @@ -335,8 +315,8 @@ def get_partition(self) -> dict | None: memparts: list[dict] = [] resources: list[dict] = [] for idx, h in enumerate(devices): - c = self._smi_try(amdsmi_get_gpu_compute_partition, h, default={}) or {} - m = self._smi_try(amdsmi_get_gpu_memory_partition, h, default={}) or {} + c = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} + m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} c_dict = c if isinstance(c, dict) else {} m_dict = m if isinstance(m, dict) else {} current.append( @@ -366,7 +346,7 @@ def get_firmware(self) -> list[dict] | None: out: list[dict] = [] for idx, h in enumerate(devices): - raw = self._smi_try(amdsmi_get_fw_info, h, default=None) + raw = self._smi_try(self._amdsmi.amdsmi_get_fw_info, h, default=None) if raw is None: continue @@ -420,7 +400,7 @@ def _smi_try(self, fn, *a, default=None, **kw): """ try: return fn(*a, **kw) - except AmdSmiException as e: + except self._amdsmi.AmdSmiException as e: self.logger.warning(e) code = getattr(e, "ret_code", None) if code is None: @@ -493,11 +473,20 @@ def get_static(self) -> list[dict] | None: out: list[dict] = [] for idx, h in enumerate(devices): - board = self._smi_try(amdsmi_get_gpu_board_info, h, default={}) or {} - asic = self._smi_try(amdsmi_get_gpu_asic_info, h, default={}) or {} - bdf = self._smi_try(amdsmi_get_gpu_device_bdf, h, default="") or "" - _ = self._smi_try(amdsmi_get_gpu_device_uuid, h, default="") # uuid not used here - kfd = self._smi_try(amdsmi_get_gpu_kfd_info, h, default={}) or {} + board = self._smi_try(self._amdsmi.amdsmi_get_gpu_board_info, h, default={}) or {} + asic = self._smi_try(self._amdsmi.amdsmi_get_gpu_asic_info, h, default={}) or {} + bdf = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" + _ = self._smi_try( + self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="" + ) # uuid not used here + kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} + + cache_info: list[dict] = [] + part = None + soc_pstate = None + xgmi_plpd = None + clock = None + process_isolation = "" # ----------------------- # Bus / PCIe @@ -642,12 +631,12 @@ def collect_data( args=None, ) -> tuple[TaskResult, AmdSmiDataModel | None]: - if not _bind_amdsmi_or_log(self): + if not self._bind_amdsmi_or_log(): self.result.status = ExecutionStatus.NOT_RAN return self.result, None try: - amdsmi_init(AmdSmiInitFlags.INIT_AMD_GPUS) + self._amdsmi.amdsmi_init(self._amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS) amd_smi_data = self._get_amdsmi_data() if amd_smi_data is None: @@ -666,6 +655,6 @@ def collect_data( return self.result, None finally: try: - amdsmi_shut_down() + self._amdsmi.amdsmi_shut_down() except Exception: pass diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py index 77e5c735..fa652822 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py @@ -29,7 +29,7 @@ from .amdsmidata import AmdSmiDataModel -class AmdsmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, None]): +class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, None]): """Plugin for collection and analysis of amdsmi data""" DATA_MODEL = AmdSmiDataModel diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index b744fa71..1a47b37c 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1,16 +1,21 @@ +import re from enum import Enum from typing import List from pydantic import ( + AliasChoices, BaseModel, ConfigDict, Field, field_validator, + model_validator, ) from nodescraper.models.datamodel import DataModel from nodescraper.utils import find_annotation_in_container +_NUM_UNIT_RE = re.compile(r"^\s*([-+]?\d+(?:\.\d+)?)(?:\s*([A-Za-z%/][A-Za-z0-9%/._-]*))?\s*$") + def na_to_none(values: int | str): if values == "N/A": diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 622e3f59..7e6dbf3f 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -23,28 +23,18 @@ # SOFTWARE. # ############################################################################### -from pydantic import Field, field_validator +from typing import Optional from nodescraper.models import AnalyzerArgs -from nodescraper.plugins.inband.os.osdata import OsDataModel class AmdSmiAnalyzerArgs(AnalyzerArgs): - - @classmethod - def build_from_model(cls, datamodel: OsDataModel) -> "AmdSmiAnalyzerArgs": - """build analyzer args from data model - - Args: - datamodel (AmdSmiDataModel): data model for plugin - - Returns: - AmdSmiAnalyzerArgs: instance of analyzer args class - """ - return cls(expected_gpu_processes= - expected_max_power - expected_driver_version - expected_memory_partition_mode - expected_compute_partition_mode - expected_pldm_version + check_static_data: bool = False + expected_gpu_processes: Optional[int] = None + expected_max_power: Optional[int] = None + expected_driver_version: Optional[str] = None + expected_memory_partition_mode: Optional[str] = None + expected_compute_partition_mode: Optional[str] = None + expected_pldm_version: Optional[str] = None + xgmi_speed_override: Optional[float] = None From d1e73a0f871057fd4504f1950edaf1bf4df260b6 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Tue, 16 Sep 2025 14:39:17 -0500 Subject: [PATCH 15/38] cleanup --- .../plugins/inband/amdsmi/amdsmi_collector.py | 235 +++++++++--------- .../plugins/inband/amdsmi/amdsmidata.py | 37 ++- 2 files changed, 144 insertions(+), 128 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 114db515..00c24a39 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -55,9 +55,6 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel - def _amdsmi_is_bound(self) -> bool: - return all(name in globals() for name in ("amdsmi_init", "AmdSmiInitFlags")) - def _bind_amdsmi_or_log(self) -> bool: """Import amdsmi and store the module on self. Return True if ok.""" if getattr(self, "_amdsmi", None) is not None: @@ -68,7 +65,7 @@ def _bind_amdsmi_or_log(self) -> bool: except Exception as e: self._log_event( category=EventCategory.APPLICATION, - description="Failed to import amdsmi Python bindings", + description="Failed to import amdsmi package, please ensure amdsmi is installed and Python bindings are available", data={"exception": get_exception_traceback(e)}, priority=EventPriority.ERROR, console_log=True, @@ -88,48 +85,6 @@ def _get_handles(self): ) return [] - def build_amdsmi_sub_data( - self, amd_smi_data_model: type[T], json_data: list[dict] | dict | None - ) -> list[T] | T | None: - try: - if json_data is None: - self._log_event( - category=EventCategory.APPLICATION, - description="No data returned from amd-smi sub command", - priority=EventPriority.ERROR, - ) - return None - validated_data = [] - if isinstance(json_data, list): - for data in json_data: - if not isinstance(data, dict): - self._log_event( - category=EventCategory.APPLICATION, - description="Invalid data type for amd-smi sub data", - data={ - "data_type": type(data).__name__, - "model_name": amd_smi_data_model.__name__, - }, - priority=EventPriority.WARNING, - ) - return None - validated_data.append(amd_smi_data_model(**data)) - elif isinstance(json_data, dict): - return amd_smi_data_model(**json_data) - else: - raise ValidationError( - f"Invalid data type for amd-smi sub data: {type(json_data).__name__}", - model=amd_smi_data_model, - ) - return validated_data - except ValidationError as e: - self._log_event( - category=EventCategory.APPLICATION, - description=f"Failed to build amd-smi model {amd_smi_data_model.__name__}", - data=get_exception_traceback(e), - priority=EventPriority.WARNING, - ) - return None def _get_amdsmi_data(self) -> AmdSmiDataModel | None: """Returns amd-smi tool data formatted as a AmdSmiDataModel object @@ -145,7 +100,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: partition = self.get_partition() firmware = self.get_firmware() gpu_list = self.get_gpu_list() - amdsmi_static = self.get_static() + statics = self.get_static() except Exception as e: self._log_event( category=EventCategory.APPLICATION, @@ -157,32 +112,25 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: self.result.status = ExecutionStatus.EXECUTION_FAILURE return None - partition_data_model = self.build_amdsmi_sub_data(Partition, partition) - process_data_model = self.build_amdsmi_sub_data(Processes, processes) - firmware_model = self.build_amdsmi_sub_data(Fw, firmware) - gpu_list_model = self.build_amdsmi_sub_data(AmdSmiListItem, gpu_list) - amdsmi_static_model = self.build_amdsmi_sub_data(AmdSmiStatic, amdsmi_static) try: - amd_smi_data = AmdSmiDataModel( + return AmdSmiDataModel( version=version, - gpu_list=gpu_list_model, - process=process_data_model, - partition=partition_data_model, - firmware=firmware_model, - static=amdsmi_static_model, + gpu_list=gpu_list, + process=processes, + partition=partition, + firmware=firmware, + static=statics, ) except ValidationError as e: self.logger.warning("Validation err: %s", e) self._log_event( category=EventCategory.APPLICATION, - description="Failed to build AmdSmiDataModel model", + description="Failed to build AmdSmiDataModel", data=get_exception_details(e), priority=EventPriority.ERROR, ) return None - return amd_smi_data - def _get_amdsmi_version(self) -> AmdSmiVersion | None: """Get lib/rocm versions.""" try: @@ -251,23 +199,30 @@ def _to_int(x, default=0): if isinstance(mp, dict) and mp.get("current_partition_id") is not None: partition_id = _to_int(mp.get("current_partition_id"), 0) - out.append( - { - "gpu": idx, - "bdf": bdf, - "uuid": uuid, - "kfd_id": _to_int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, - "node_id": _to_int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, - "partition_id": partition_id, - } - ) + try: + out.append( + AmdSmiListItem( + gpu=idx, + bdf=bdf, + uuid=uuid, + kfd_id=_to_int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, + node_id=_to_int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, + partition_id=partition_id, + ) + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiListItem", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) return out - def get_process(self) -> list[dict] | None: - """Get data as a list of dict from cmd: amdsmi process""" + def get_process(self) -> list[Processes] | None: devices = self._get_handles() - out: list[dict] = [] + out: list[Processes] = [] for idx, h in enumerate(devices): try: pids = self._amdsmi.amdsmi_get_gpu_process_list(h) or [] @@ -279,7 +234,6 @@ def get_process(self) -> list[dict] | None: if not isinstance(pinfo, dict): plist.append({"process_info": str(pid)}) continue - plist.append( { "process_info": { @@ -298,12 +252,20 @@ def get_process(self) -> list[dict] | None: } } ) - out.append({"gpu": idx, "process_list": plist}) - except AmdSmiException as e: + try: + out.append(Processes(gpu=idx, process_list=plist)) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build Processes", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) + except self._amdsmi.AmdSmiException as e: self._log_event( category=EventCategory.APPLICATION, description="Process collection failed", - data={"exception": get_exception_traceback(e)}, + data={"exception": get_exception_traceback(e), "gpu_index": idx}, priority=EventPriority.WARNING, ) return out @@ -335,11 +297,20 @@ def get_partition(self) -> dict | None: "current_partition_id": m_dict.get("current_partition_id"), } ) - return { - "current_partition": current, - "memory_partition": memparts, - "partition_resources": resources, - } + try: + return Partition( + current_partition=current, + memory_partition=memparts, + partition_resources=resources, + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build Partition", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return None def get_firmware(self) -> list[dict] | None: devices = self._get_handles() @@ -390,7 +361,15 @@ def get_firmware(self) -> list[dict] | None: priority=EventPriority.INFO, ) - out.append({"gpu": idx, "fw_list": normalized}) + try: + out.append(Fw(gpu=idx, fw_list=normalized)) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build Fw", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) return out @@ -468,6 +447,11 @@ def get_static(self) -> list[dict] | None: if not devices: return [] + def _nz(val: object, default: str = "unknown") -> str: + """Normalize possibly-empty/NA strings to a non-empty default.""" + s = str(val).strip() if val is not None else "" + return s if s and s.upper() != "N/A" else default + _pcie_fn = globals().get("amdsmi_get_pcie_info", None) out: list[dict] = [] @@ -476,15 +460,12 @@ def get_static(self) -> list[dict] | None: board = self._smi_try(self._amdsmi.amdsmi_get_gpu_board_info, h, default={}) or {} asic = self._smi_try(self._amdsmi.amdsmi_get_gpu_asic_info, h, default={}) or {} bdf = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" - _ = self._smi_try( - self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="" - ) # uuid not used here + _ = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} cache_info: list[dict] = [] - part = None soc_pstate = None - xgmi_plpd = None + xgmi_plpd = None # TODO clock = None process_isolation = "" @@ -492,6 +473,7 @@ def get_static(self) -> list[dict] | None: # Bus / PCIe # ----------------------- pcie = {} + if callable(_pcie_fn): p = self._smi_try(_pcie_fn, h, default={}) or {} if isinstance(p, dict): @@ -501,21 +483,38 @@ def get_static(self) -> list[dict] | None: pcie = { "bdf": bdf, "max_pcie_width": ( - f"{max_w} x" if max_w not in (None, "", "N/A") else None + { + "value": ( + int(max_w) + if isinstance(max_w, (int, float, str)) and str(max_w).isdigit() + else 0 + ), + "unit": "x", + } + if max_w not in (None, "", "N/A") + else None ), "max_pcie_speed": ( - f"{max_s} GT/s" if max_s not in (None, "", "N/A") else None + { + "value": ( + float(max_s) if isinstance(max_s, (int, float, str)) else 0 + ), + "unit": "GT/s", + } + if max_s not in (None, "", "N/A") + else None ), - "pcie_interface_version": str(pcie_ver or ""), - "slot_type": str(p.get("slot_type", "")), + "pcie_interface_version": _nz(pcie_ver), + "slot_type": _nz(p.get("slot_type")), } + if not pcie: pcie = { "bdf": bdf, "max_pcie_width": None, "max_pcie_speed": None, - "pcie_interface_version": "", - "slot_type": "", + "pcie_interface_version": "unknown", + "slot_type": "unknown", } # ----------------------- @@ -603,26 +602,32 @@ def get_static(self) -> list[dict] | None: "max_bandwidth": None, } - out.append( - { - "gpu": idx, - "asic": asic_mapped, - "bus": pcie, - "vbios": vbios, - "limit": None, # not available via API - "driver": None, - "board": board_mapped, - "ras": None, - "soc_pstate": soc_pstate, - "xgmi_plpd": xgmi_plpd, - "process_isolation": process_isolation, - "numa": numa, - "vram": vram, - "cache_info": cache_info, - "partition": part, - "clock": clock, - } - ) + try: + out.append( + AmdSmiStatic( + gpu=idx, + asic=asic_mapped, + bus=pcie, + vbios=vbios, + limit=None, + board=board_mapped, + soc_pstate=None, # TODO + xgmi_plpd=None, + process_isolation="", # TODO + numa=numa, + vram=vram, + cache_info=[], # TODO + partition=None, + clock=None, # TODO + ) + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiStatic", + data={"exception": get_exception_traceback(e), "gpu_index": idx, "pcie": pcie}, + priority=EventPriority.WARNING, + ) return out diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 1a47b37c..15c6ee5c 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1,6 +1,6 @@ import re from enum import Enum -from typing import List +from typing import Any, List, Mapping from pydantic import ( AliasChoices, @@ -31,12 +31,23 @@ def na_to_none_list(values: list[int | str]) -> List[int | str | None]: return ret_list -def na_to_none_dict(values: dict[str, int | str]) -> dict[str, int | str | None]: - ret_dict: dict[str, int | str | None] = values.copy() - for key in ret_dict: - if ret_dict[key] == "N/A": - ret_dict[key] = None - return ret_dict +def na_to_none_dict(values: object) -> dict[str, Any] | None: + """Normalize mapping-like fields where 'N/A' or empty should become None. + Accepts None; returns None for 'N/A'/'NA'/'' or non-mapping inputs.""" + if values is None: + return None + if isinstance(values, str) and values.strip().upper() in {"N/A", "NA", ""}: + return None + if not isinstance(values, Mapping): # guard: pydantic may pass non-dicts in 'before' mode + return None + + out: dict[str, Any] = {} + for k, v in values.items(): + if isinstance(v, str) and v.strip().upper() in {"N/A", "NA", ""}: + out[k] = None + else: + out[k] = v + return out class AmdSmiBaseModel(BaseModel): @@ -317,10 +328,10 @@ class StaticAsic(BaseModel): class StaticBus(AmdSmiBaseModel): bdf: str - max_pcie_width: ValueUnit - max_pcie_speed: ValueUnit - pcie_interface_version: str - slot_type: str + max_pcie_width: ValueUnit | None = None + max_pcie_speed: ValueUnit | None = None + pcie_interface_version: str = "unknown" + slot_type: str = "unknown" class StaticVbios(BaseModel): @@ -461,9 +472,9 @@ class AmdSmiStatic(BaseModel): bus: StaticBus vbios: StaticVbios | None limit: StaticLimit | None - driver: StaticDriver + # driver: StaticDriver board: StaticBoard - ras: StaticRas + # ras: StaticRas soc_pstate: StaticSocPstate | None xgmi_plpd: StaticXgmiPlpd | None process_isolation: str From 7094979d9fc537e04b3856fd413140503c1307f9 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Sep 2025 17:59:17 -0500 Subject: [PATCH 16/38] mypy --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 6 +- .../plugins/inband/amdsmi/amdsmi_collector.py | 368 +++++++++--------- .../plugins/inband/amdsmi/amdsmidata.py | 2 +- 3 files changed, 183 insertions(+), 193 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index a8aba0a0..dd61e19d 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -165,10 +165,12 @@ def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult: args = AmdSmiAnalyzerArgs() if args.expected_gpu_processes: - self.expected_gpu_processes(amdsmi_data.process, args.expected_gpu_processes) + self.expected_gpu_processes(data.process, args.expected_gpu_processes) if args.expected_memory_partition_mode or args.expected_compute_partition_mode: self.check_expected_memory_partition_mode( - amdsmi_data.partition, + data.partition, args.expected_memory_partition_mode, args.expected_compute_partition_mode, ) + + return self.result diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 00c24a39..6136d863 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -24,9 +24,9 @@ # ############################################################################### import importlib -from typing import TypeVar +from typing import cast -from pydantic import BaseModel, ValidationError +from pydantic import ValidationError from nodescraper.base.inbandcollectortask import InBandDataCollector from nodescraper.connection.inband.inband import CommandArtifact @@ -38,13 +38,23 @@ AmdSmiStatic, AmdSmiVersion, Fw, + FwListItem, Partition, + PartitionCurrent, + PartitionMemory, Processes, + ProcessInfo, + ProcessListItem, + StaticAsic, + StaticBoard, + StaticBus, + StaticNuma, + StaticVbios, + StaticVram, + ValueUnit, ) from nodescraper.utils import get_exception_details, get_exception_traceback -T = TypeVar("T", bound=BaseModel) - class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): """class for collection of inband tool amd-smi data.""" @@ -85,7 +95,6 @@ def _get_handles(self): ) return [] - def _get_amdsmi_data(self) -> AmdSmiDataModel | None: """Returns amd-smi tool data formatted as a AmdSmiDataModel object @@ -172,10 +181,9 @@ def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: return cmd_ret.stdout or "" - def get_gpu_list(self) -> list[dict] | None: - + def get_gpu_list(self) -> list[AmdSmiListItem] | None: devices = self._get_handles() - out: list[dict] = [] + out: list[AmdSmiListItem] = [] def _to_int(x, default=0): try: @@ -226,31 +234,44 @@ def get_process(self) -> list[Processes] | None: for idx, h in enumerate(devices): try: pids = self._amdsmi.amdsmi_get_gpu_process_list(h) or [] - plist = [] + plist: list[ProcessListItem] = [] + for pid in pids: pinfo = self._smi_try( self._amdsmi.amdsmi_get_gpu_compute_process_info, h, pid, default=None ) if not isinstance(pinfo, dict): - plist.append({"process_info": str(pid)}) + plist.append(ProcessListItem(process_info=str(pid))) continue + plist.append( - { - "process_info": { - "name": pinfo.get("name", str(pid)), - "pid": int(pid), - "memory_usage": { - "gtt_mem": {"value": pinfo.get("gtt_mem", 0), "unit": "B"}, - "cpu_mem": {"value": pinfo.get("cpu_mem", 0), "unit": "B"}, - "vram_mem": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, - }, - "mem_usage": {"value": pinfo.get("vram_mem", 0), "unit": "B"}, - "usage": { - "gfx": {"value": pinfo.get("gfx", 0), "unit": "%"}, - "enc": {"value": pinfo.get("enc", 0), "unit": "%"}, + ProcessListItem( + process_info=cast( + ProcessInfo, + { + "name": pinfo.get("name", str(pid)), + "pid": int(pid), + "memory_usage": { + "gtt_mem": ValueUnit( + value=pinfo.get("gtt_mem", 0), unit="B" + ), + "cpu_mem": ValueUnit( + value=pinfo.get("cpu_mem", 0), unit="B" + ), + "vram_mem": ValueUnit( + value=pinfo.get("vram_mem", 0), unit="B" + ), + }, + "mem_usage": ValueUnit( + value=pinfo.get("vram_mem", 0), unit="B" + ), + "usage": { + "gfx": ValueUnit(value=pinfo.get("gfx", 0), unit="%"), + "enc": ValueUnit(value=pinfo.get("enc", 0), unit="%"), + }, }, - } - } + ) + ) ) try: out.append(Processes(gpu=idx, process_list=plist)) @@ -270,33 +291,35 @@ def get_process(self) -> list[Processes] | None: ) return out - def get_partition(self) -> dict | None: - """Collect partition info via AMDSMI; degrade gracefully if unsupported.""" + def get_partition(self) -> Partition | None: devices = self._get_handles() - current: list[dict] = [] - memparts: list[dict] = [] - resources: list[dict] = [] + current: list[PartitionCurrent] = [] + memparts: list[PartitionMemory] = [] + resources: list[dict] = [] # keep as-is if your model allows + for idx, h in enumerate(devices): c = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} c_dict = c if isinstance(c, dict) else {} m_dict = m if isinstance(m, dict) else {} + current.append( - { - "gpu_id": idx, - "memory": c_dict.get("memory"), - "accelerator_type": c_dict.get("accelerator_type"), - "accelerator_profile_index": c_dict.get("accelerator_profile_index"), - "partition_id": c_dict.get("partition_id"), - } + PartitionCurrent( + gpu_id=idx, + memory=c_dict.get("memory"), + accelerator_type=c_dict.get("accelerator_type"), + accelerator_profile_index=c_dict.get("accelerator_profile_index"), + partition_id=c_dict.get("partition_id"), + ) ) memparts.append( - { - "gpu_id": idx, - "memory_partition_caps": m_dict.get("memory_partition_caps"), - "current_partition_id": m_dict.get("current_partition_id"), - } + PartitionMemory( + gpu_id=idx, + memory_partition_caps=m_dict.get("memory_partition_caps"), + current_partition_id=m_dict.get("current_partition_id"), + ) ) + try: return Partition( current_partition=current, @@ -312,9 +335,9 @@ def get_partition(self) -> dict | None: ) return None - def get_firmware(self) -> list[dict] | None: + def get_firmware(self) -> list[Fw] | None: devices = self._get_handles() - out: list[dict] = [] + out: list[Fw] = [] for idx, h in enumerate(devices): raw = self._smi_try(self._amdsmi.amdsmi_get_fw_info, h, default=None) @@ -333,7 +356,7 @@ def get_firmware(self) -> list[dict] | None: else: items = [] - normalized: list[dict] = [] + normalized: list[FwListItem] = [] for e in items: if isinstance(e, dict): fid = ( @@ -346,13 +369,13 @@ def get_firmware(self) -> list[dict] | None: ) ver = e.get("fw_version") or e.get("version") or e.get("fw_ver") or e.get("ver") normalized.append( - { - "fw_id": "" if fid is None else str(fid), - "fw_version": "" if ver is None else str(ver), - } + FwListItem( + fw_id="" if fid is None else str(fid), + fw_version="" if ver is None else str(ver), + ) ) elif isinstance(e, (tuple, list)) and len(e) >= 2: - normalized.append({"fw_id": str(e[0]), "fw_version": str(e[1])}) + normalized.append(FwListItem(fw_id=str(e[0]), fw_version=str(e[1]))) else: self._log_event( category=EventCategory.APPLICATION, @@ -398,7 +421,7 @@ def _smi_try(self, fn, *a, default=None, **kw): } name = CODE2NAME.get(code, "unknown") - if name == "AMDSMI_STATUS_NOT_SUPPORTED" or name == "AMDSMI_STATUS_NOT_FOUND": + if name in ("AMDSMI_STATUS_NOT_SUPPORTED", "AMDSMI_STATUS_NOT_FOUND"): self._log_event( category=EventCategory.APPLICATION, description=f"{fn.__name__} not supported on this device/mode (status={name}, code={code})", @@ -408,11 +431,11 @@ def _smi_try(self, fn, *a, default=None, **kw): if name == "AMDSMI_STATUS_PERMISSION": self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} permission denied (need access to /dev/kfd & render nodes, or root for RAS). status={name}, code={code}", + description=f"{fn.__name__} permission denied (need access to /dev/kfd & render nodes, or root for RAS). status={name}, code={code})", priority=EventPriority.WARNING, ) return default - # Generic case + self._log_event( category=EventCategory.APPLICATION, description=f"{fn.__name__} failed (status={name}, code={code})", @@ -420,29 +443,8 @@ def _smi_try(self, fn, *a, default=None, **kw): priority=EventPriority.WARNING, ) return default - if name == "AMDSMI_STATUS_PERMISSION": - self._log_event( - category=EventCategory.APPLICATION, - description=f"{fn.__name__} permission denied (need access to /dev/kfd and render nodes). status={name}, code={code}", - priority=EventPriority.WARNING, - ) - return default - self._log_event( - category=EventCategory.APPLICATION, - description=f"{fn.__name__} failed (status={name or 'unknown'}, code={code})", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.WARNING, - ) - return default - self._log_event( - category=EventCategory.APPLICATION, - description=f"{fn.__name__} failed", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.WARNING, - ) - return default - def get_static(self) -> list[dict] | None: + def get_static(self) -> list[AmdSmiStatic] | None: devices = self._get_handles() if not devices: return [] @@ -452,120 +454,105 @@ def _nz(val: object, default: str = "unknown") -> str: s = str(val).strip() if val is not None else "" return s if s and s.upper() != "N/A" else default - _pcie_fn = globals().get("amdsmi_get_pcie_info", None) + def _vu(val: object, unit: str) -> ValueUnit | None: + """Build ValueUnit from mixed numeric/string input, else None.""" + if val in (None, "", "N/A"): + return None + try: + if isinstance(val, str): + v = float(val) if any(ch in val for ch in ".eE") else int(val) + elif isinstance(val, float): + v = val + else: + v = int(val) + except Exception: + return None + return ValueUnit(value=v, unit=unit) - out: list[dict] = [] + pcie_fn = getattr(self._amdsmi, "amdsmi_get_pcie_info", None) + + out: list[AmdSmiStatic] = [] for idx, h in enumerate(devices): board = self._smi_try(self._amdsmi.amdsmi_get_gpu_board_info, h, default={}) or {} asic = self._smi_try(self._amdsmi.amdsmi_get_gpu_asic_info, h, default={}) or {} bdf = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" - _ = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") + _ = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") # TODO kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} - cache_info: list[dict] = [] - soc_pstate = None - xgmi_plpd = None # TODO - clock = None - process_isolation = "" - - # ----------------------- # Bus / PCIe - # ----------------------- - pcie = {} - - if callable(_pcie_fn): - p = self._smi_try(_pcie_fn, h, default={}) or {} + if callable(pcie_fn): + p = self._smi_try(pcie_fn, h, default={}) or {} if isinstance(p, dict): max_w = p.get("max_link_width") max_s = p.get("max_link_speed") pcie_ver = p.get("pcie_version") or p.get("pcie_interface_version") - pcie = { - "bdf": bdf, - "max_pcie_width": ( - { - "value": ( - int(max_w) - if isinstance(max_w, (int, float, str)) and str(max_w).isdigit() - else 0 - ), - "unit": "x", - } - if max_w not in (None, "", "N/A") - else None - ), - "max_pcie_speed": ( - { - "value": ( - float(max_s) if isinstance(max_s, (int, float, str)) else 0 - ), - "unit": "GT/s", - } - if max_s not in (None, "", "N/A") - else None - ), - "pcie_interface_version": _nz(pcie_ver), - "slot_type": _nz(p.get("slot_type")), - } - - if not pcie: - pcie = { - "bdf": bdf, - "max_pcie_width": None, - "max_pcie_speed": None, - "pcie_interface_version": "unknown", - "slot_type": "unknown", - } - - # ----------------------- + bus = StaticBus( + bdf=bdf, + max_pcie_width=_vu(max_w, "x"), + max_pcie_speed=_vu(max_s, "GT/s"), + pcie_interface_version=_nz(pcie_ver), + slot_type=_nz(p.get("slot_type")), + ) + else: + bus = StaticBus( + bdf=bdf, + max_pcie_width=None, + max_pcie_speed=None, + pcie_interface_version="unknown", + slot_type="unknown", + ) + else: + bus = StaticBus( + bdf=bdf, + max_pcie_width=None, + max_pcie_speed=None, + pcie_interface_version="unknown", + slot_type="unknown", + ) + # ASIC - # ----------------------- - asic_mapped = { - "market_name": str(asic.get("market_name") or asic.get("asic_name") or ""), - "vendor_id": str(asic.get("vendor_id", "")), - "vendor_name": str(asic.get("vendor_name", "")), - "subvendor_id": str(asic.get("subvendor_id", "")), - "device_id": str(asic.get("device_id", "")), - "subsystem_id": str(asic.get("subsystem_id", "")), - "rev_id": str(asic.get("rev_id", "")), - "asic_serial": str(asic.get("asic_serial", "")), - "oam_id": int(asic.get("oam_id", 0) or 0), - "num_compute_units": int(asic.get("num_compute_units", 0) or 0), - "target_graphics_version": str(asic.get("target_graphics_version", "")), - } + asic_model = StaticAsic( + market_name=_nz(asic.get("market_name") or asic.get("asic_name"), default=""), + vendor_id=str(asic.get("vendor_id", "")), + vendor_name=str(asic.get("vendor_name", "")), + subvendor_id=str(asic.get("subvendor_id", "")), + device_id=str(asic.get("device_id", "")), + subsystem_id=str(asic.get("subsystem_id", "")), + rev_id=str(asic.get("rev_id", "")), + asic_serial=str(asic.get("asic_serial", "")), + oam_id=int(asic.get("oam_id", 0) or 0), + num_compute_units=int(asic.get("num_compute_units", 0) or 0), + target_graphics_version=str(asic.get("target_graphics_version", "")), + ) - # ----------------------- # Board - # ----------------------- - board_mapped = { - "model_number": str( + board_model = StaticBoard( + model_number=str( board.get("model_number", "") or board.get("amdsmi_model_number", "") ), - "product_serial": str(board.get("product_serial", "")), - "fru_id": str(board.get("fru_id", "")), - "product_name": str(board.get("product_name", "")), - "manufacturer_name": str(board.get("manufacturer_name", "")), - } + product_serial=str(board.get("product_serial", "")), + fru_id=str(board.get("fru_id", "")), + product_name=str(board.get("product_name", "")), + manufacturer_name=str(board.get("manufacturer_name", "")), + ) - # ----------------------- # VBIOS - # ----------------------- - vbios = None - vb = {} - for k in ("vbios_name", "vbios_build_date", "vbios_part_number", "vbios_version"): - if k in board: - vb[k] = board[k] + vb = { + k: board[k] + for k in ("vbios_name", "vbios_build_date", "vbios_part_number", "vbios_version") + if k in board + } + vbios_model: StaticVbios | None = None if vb: - vbios = { - "name": str(vb.get("vbios_name", "")), - "build_date": str(vb.get("vbios_build_date", "")), - "part_number": str(vb.get("vbios_part_number", "")), - "version": str(vb.get("vbios_version", "")), - } - - # ----------------------- - # NUMA (from KFD) - # ----------------------- + vbios_model = StaticVbios( + name=str(vb.get("vbios_name", "")), + build_date=str(vb.get("vbios_build_date", "")), + part_number=str(vb.get("vbios_part_number", "")), + version=str(vb.get("vbios_version", "")), + ) + + # NUMA (via KFD) if isinstance(kfd, dict): try: numa_node = int(kfd.get("node_id", 0) or 0) @@ -577,45 +564,46 @@ def _nz(val: object, default: str = "unknown") -> str: affinity = 0 else: numa_node, affinity = 0, 0 - numa = {"node": numa_node, "affinity": affinity} + numa_model = StaticNuma(node=numa_node, affinity=affinity) - # ----------------------- # VRAM - # ----------------------- vram_type = str(asic.get("vram_type", "") or "unknown") vram_vendor = asic.get("vram_vendor") vram_bits = asic.get("vram_bit_width") - vram_size_b = None + vram_size_b: int | None = None if asic.get("vram_size_bytes") is not None: - vram_size_b = int(asic["vram_size_bytes"]) + try: + vram_size_b = int(asic["vram_size_bytes"]) + except Exception: + vram_size_b = None elif asic.get("vram_size_mb") is not None: try: vram_size_b = int(asic["vram_size_mb"]) * 1024 * 1024 except Exception: vram_size_b = None - vram = { - "type": vram_type, - "vendor": None if vram_vendor in (None, "", "N/A") else str(vram_vendor), - "size": (f"{vram_size_b} B" if isinstance(vram_size_b, int) else None), - "bit_width": (f"{vram_bits} bit" if isinstance(vram_bits, (int, float)) else None), - "max_bandwidth": None, - } + vram_model = StaticVram( + type=vram_type, + vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor), + size=_vu(vram_size_b, "B"), + bit_width=_vu(vram_bits, "bit"), + max_bandwidth=None, + ) try: out.append( AmdSmiStatic( gpu=idx, - asic=asic_mapped, - bus=pcie, - vbios=vbios, - limit=None, - board=board_mapped, + asic=asic_model, + bus=bus, + vbios=vbios_model, + limit=None, # not available via API + board=board_model, soc_pstate=None, # TODO - xgmi_plpd=None, - process_isolation="", # TODO - numa=numa, - vram=vram, + xgmi_plpd=None, # TODO + process_isolation="", + numa=numa_model, + vram=vram_model, cache_info=[], # TODO partition=None, clock=None, # TODO @@ -625,7 +613,7 @@ def _nz(val: object, default: str = "unknown") -> str: self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiStatic", - data={"exception": get_exception_traceback(e), "gpu_index": idx, "pcie": pcie}, + data={"exception": get_exception_traceback(e), "gpu_index": idx}, priority=EventPriority.WARNING, ) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 15c6ee5c..a44e5574 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -23,7 +23,7 @@ def na_to_none(values: int | str): return values -def na_to_none_list(values: list[int | str]) -> List[int | str | None]: +def na_to_none_list(values: List[int | str | None]) -> List[int | str | None]: ret_list: List[int | str | None] = values.copy() for i in range(len(ret_list)): if ret_list[i] == "N/A": From 08ed3f05cd6966c9ec331204817bdbac814923d1 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 17 Sep 2025 15:47:01 -0500 Subject: [PATCH 17/38] filled in data for AmdSmiStatic, clock is left --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 5 +- .../plugins/inband/amdsmi/amdsmi_collector.py | 191 +++++++++++++++++- .../plugins/inband/amdsmi/amdsmidata.py | 2 +- 3 files changed, 190 insertions(+), 8 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index dd61e19d..1f263cb0 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -38,8 +38,8 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel - L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3 # Thresholds defined in https://ontrack-internal.amd.com/browse/DCGPUSDV-1204, must be greated than this value to generate a error event - L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1 # Thresholds defined in https://ontrack-internal.amd.com/browse/SWLORC-10120, Must be greater than this value to generate a warning event + L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3 + L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1 def expected_gpu_processes( self, processes_data: list[Processes] | None, max_num_processes: int @@ -65,7 +65,6 @@ def expected_gpu_processes( process_count = len(process.process_list) # Number of processes for GPU if process_count > max_num_processes: - # Log an error event if the number of processes is greater than the expected number log event gpu_exceeds_num_processes[process.gpu] = process_count if gpu_exceeds_num_processes: diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 6136d863..9f36db01 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -48,9 +48,14 @@ StaticAsic, StaticBoard, StaticBus, + StaticCacheInfoItem, + StaticDriver, StaticNuma, + StaticPolicy, + StaticSocPstate, StaticVbios, StaticVram, + StaticXgmiPlpd, ValueUnit, ) from nodescraper.utils import get_exception_details, get_exception_traceback @@ -537,6 +542,16 @@ def _vu(val: object, unit: str) -> ValueUnit | None: manufacturer_name=str(board.get("manufacturer_name", "")), ) + # Driver + driver_model = None + drv_fn = getattr(self._amdsmi, "amdsmi_get_gpu_driver_info", None) + if callable(drv_fn): + drv = self._smi_try(drv_fn, h, default={}) or {} + driver_model = StaticDriver( + name=_nz(drv.get("driver_name"), default="unknown"), + version=_nz(drv.get("driver_version"), default="unknown"), + ) + # VBIOS vb = { k: board[k] @@ -590,6 +605,10 @@ def _vu(val: object, unit: str) -> ValueUnit | None: max_bandwidth=None, ) + soc_pstate_model = self._get_soc_pstate(h) + xgmi_plpd_model = self._get_xgmi_plpd(h) + cache_info_model = self._get_cache_info(h) + try: out.append( AmdSmiStatic( @@ -598,15 +617,16 @@ def _vu(val: object, unit: str) -> ValueUnit | None: bus=bus, vbios=vbios_model, limit=None, # not available via API + driver=driver_model, board=board_model, - soc_pstate=None, # TODO - xgmi_plpd=None, # TODO + soc_pstate=soc_pstate_model, + xgmi_plpd=xgmi_plpd_model, process_isolation="", numa=numa_model, vram=vram_model, - cache_info=[], # TODO + cache_info=cache_info_model, partition=None, - clock=None, # TODO + clock=None, # TODO amdsmi_get_clk_freq?? ) ) except ValidationError as e: @@ -619,6 +639,169 @@ def _vu(val: object, unit: str) -> ValueUnit | None: return out + def _get_soc_pstate(self, h) -> StaticSocPstate | None: + data = self._smi_try(self._amdsmi.amdsmi_get_soc_pstate, h, default=None) + if not isinstance(data, dict): + return None + + try: + num_supported = int(data.get("num_supported", 0) or 0) + except Exception: + num_supported = 0 + try: + current_id = int(data.get("current_id", 0) or 0) + except Exception: + current_id = 0 + + policies_raw = data.get("policies") or [] + policies: list[StaticPolicy] = [] + if isinstance(policies_raw, list): + for p in policies_raw: + if not isinstance(p, dict): + continue + pid = p.get("policy_id", 0) + desc = p.get("policy_description", "") + try: + policies.append( + StaticPolicy( + policy_id=int(pid) if pid not in (None, "") else 0, + policy_description=str(desc), + ) + ) + except ValidationError: + continue + + if not num_supported and not current_id and not policies: + return None + + try: + return StaticSocPstate( + num_supported=num_supported, + current_id=current_id, + policies=policies, + ) + except ValidationError: + return None + + def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: + data = self._smi_try(self._amdsmi.amdsmi_get_xgmi_plpd, h, default=None) + if not isinstance(data, dict): + return None + + try: + num_supported = int(data.get("num_supported", 0) or 0) + except Exception: + num_supported = 0 + try: + current_id = int(data.get("current_id", 0) or 0) + except Exception: + current_id = 0 + + plpds_raw = data.get("plpds") or [] + plpds: list[StaticPolicy] = [] + if isinstance(plpds_raw, list): + for p in plpds_raw: + if not isinstance(p, dict): + continue + pid = p.get("policy_id", 0) + desc = p.get("policy_description", "") + try: + plpds.append( + StaticPolicy( + policy_id=int(pid) if pid not in (None, "") else 0, + policy_description=str(desc), + ) + ) + except ValidationError: + continue + + if not num_supported and not current_id and not plpds: + return None + + try: + return StaticXgmiPlpd( + num_supported=num_supported, + current_id=current_id, + plpds=plpds, + ) + except ValidationError: + return None + + def _get_cache_info(self, h) -> list[StaticCacheInfoItem]: + """Map amdsmi_get_gpu_cache_info -> List[StaticCacheInfoItem].""" + raw = self._smi_try(self._amdsmi.amdsmi_get_gpu_cache_info, h, default=None) + if raw is None: + return [] + + items = raw if isinstance(raw, list) else [raw] + + def _to_num(v) -> float | int | None: + if isinstance(v, (int, float)): + return v + if isinstance(v, str): + s = v.strip() + try: + return int(s) + except Exception: + try: + return float(s) + except Exception: + return None + return None + + def _vu_req(v) -> ValueUnit: + n = _to_num(v) + return ValueUnit(value=0 if n is None else n, unit="") + + def _vu_opt(v) -> ValueUnit | None: + n = _to_num(v) + return None if n is None else ValueUnit(value=n, unit="") + + def _as_list_str(v) -> list[str]: + if isinstance(v, list): + return [str(x) for x in v] + if isinstance(v, str): + parts = [p.strip() for p in v.replace(";", ",").split(",")] + return [p for p in parts if p] + return [] + + out: list[StaticCacheInfoItem] = [] + for e in items: + if not isinstance(e, dict): + continue + + cache_level = _vu_req(e.get("cache_level")) + max_num_cu_shared = _vu_req(e.get("max_num_cu_shared")) + num_cache_instance = _vu_req(e.get("num_cache_instance")) + cache_size = _vu_opt(e.get("cache_size")) + cache_props = _as_list_str(e.get("cache_properties")) + + # AMDSMI doesn’t give a name , "Lable_" as the label??? + cache_label_val = f"Lable_{int(cache_level.value) if isinstance(cache_level.value, (int, float)) else cache_level.value}" + cache_label = ValueUnit(value=cache_label_val, unit="") + + try: + out.append( + StaticCacheInfoItem( + cache=cache_label, + cache_properties=cache_props, + cache_size=cache_size, + cache_level=cache_level, + max_num_cu_shared=max_num_cu_shared, + num_cache_instance=num_cache_instance, + ) + ) + except ValidationError as ve: + self._log_event( + category=EventCategory.APPLICATION, + description="Bad cache info entry from AMDSMI; skipping", + data={"entry": repr(e), "exception": get_exception_traceback(ve)}, + priority=EventPriority.WARNING, + ) + continue + + return out + def collect_data( self, args=None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index a44e5574..4fb49142 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -472,7 +472,7 @@ class AmdSmiStatic(BaseModel): bus: StaticBus vbios: StaticVbios | None limit: StaticLimit | None - # driver: StaticDriver + driver: StaticDriver | None board: StaticBoard # ras: StaticRas soc_pstate: StaticSocPstate | None From a8437e46d0d89cbdbdade14fbcd552a181482857 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Thu, 18 Sep 2025 11:18:16 -0500 Subject: [PATCH 18/38] added clock and fixed try for API that doesnt exist in this version --- .../plugins/inband/amdsmi/amdsmi_collector.py | 78 +++++++++++++++++-- .../plugins/inband/amdsmi/amdsmidata.py | 10 +-- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 9f36db01..50d23d53 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -49,7 +49,9 @@ StaticBoard, StaticBus, StaticCacheInfoItem, + StaticClockData, StaticDriver, + StaticFrequencyLevels, StaticNuma, StaticPolicy, StaticSocPstate, @@ -111,7 +113,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: try: version = self._get_amdsmi_version() processes = self.get_process() - partition = self.get_partition() + partition = self._get_partition() firmware = self.get_firmware() gpu_list = self.get_gpu_list() statics = self.get_static() @@ -296,7 +298,7 @@ def get_process(self) -> list[Processes] | None: ) return out - def get_partition(self) -> Partition | None: + def _get_partition(self) -> Partition | None: devices = self._get_handles() current: list[PartitionCurrent] = [] memparts: list[PartitionMemory] = [] @@ -608,6 +610,7 @@ def _vu(val: object, unit: str) -> ValueUnit | None: soc_pstate_model = self._get_soc_pstate(h) xgmi_plpd_model = self._get_xgmi_plpd(h) cache_info_model = self._get_cache_info(h) + clock_model = self._get_clock(h) try: out.append( @@ -626,10 +629,11 @@ def _vu(val: object, unit: str) -> ValueUnit | None: vram=vram_model, cache_info=cache_info_model, partition=None, - clock=None, # TODO amdsmi_get_clk_freq?? + clock=clock_model, ) ) except ValidationError as e: + self.logger.error(e) self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiStatic", @@ -640,7 +644,16 @@ def _vu(val: object, unit: str) -> ValueUnit | None: return out def _get_soc_pstate(self, h) -> StaticSocPstate | None: - data = self._smi_try(self._amdsmi.amdsmi_get_soc_pstate, h, default=None) + fn = getattr(self._amdsmi, "amdsmi_get_soc_pstate", None) + if not callable(fn): + self._log_event( + category=EventCategory.APPLICATION, + description="amdsmi_get_soc_pstate not exposed by amdsmi build", + priority=EventPriority.INFO, + ) + return None + + data = self._smi_try(fn, h, default=None) if not isinstance(data, dict): return None @@ -684,7 +697,16 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: return None def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: - data = self._smi_try(self._amdsmi.amdsmi_get_xgmi_plpd, h, default=None) + fn = getattr(self._amdsmi, "amdsmi_get_xgmi_plpd", None) + if not callable(fn): + self._log_event( + category=EventCategory.APPLICATION, + description="XGMI PLPD not exposed by this amdsmi build", + priority=EventPriority.INFO, + ) + return None + + data = self._smi_try(fn, h, default=None) if not isinstance(data, dict): return None @@ -802,6 +824,52 @@ def _as_list_str(v) -> list[str]: return out + + def _get_clock(self, h) -> StaticClockData | None: + """ + """ + fn = getattr(self._amdsmi, "amdsmi_get_clk_freq", None) + clk_type = getattr(self._amdsmi, "AmdSmiClkType", None) + if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): + return None + + data = self._smi_try(fn, h, clk_type.SYS, default=None) + if not isinstance(data, dict): + return None + + freqs_raw = data.get("frequency") + if not isinstance(freqs_raw, list): + return None + + freqs_mhz: list[int] = [] + for v in freqs_raw: + if isinstance(v, (int, float)): + freqs_mhz.append(int(round(float(v) / 1_000_000.0))) + + if not freqs_mhz: + return None + + def _fmt(n: int | None) -> str | None: + return None if n is None else f"{n} MHz" + + level0: str = _fmt(freqs_mhz[0]) or "0 MHz" + level1: str | None = _fmt(freqs_mhz[1]) if len(freqs_mhz) > 1 else None + level2: str | None = _fmt(freqs_mhz[2]) if len(freqs_mhz) > 2 else None + + cur_raw = data.get("current") + try: + current: int | None = None if cur_raw in (None, "", "N/A") else int(cur_raw) + except Exception: + current = None + + try: + levels = StaticFrequencyLevels(Level_0=level0, Level_1=level1, Level_2=level2) + return StaticClockData(frequency=levels, current=current) + except ValidationError: + return None + + + def collect_data( self, args=None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 4fb49142..ae206b95 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -460,10 +460,10 @@ class StaticClockData(BaseModel): model_config = ConfigDict( populate_by_name=True, ) - frequency_levels: StaticFrequencyLevels + frequency: StaticFrequencyLevels - current_level: int | None = Field(..., alias="current level") - na_validator = field_validator("current_level", mode="before")(na_to_none) + current: int | None = Field(..., alias="current") + na_validator = field_validator("current", mode="before")(na_to_none) class AmdSmiStatic(BaseModel): @@ -481,8 +481,8 @@ class AmdSmiStatic(BaseModel): numa: StaticNuma vram: StaticVram cache_info: List[StaticCacheInfoItem] - partition: StaticPartition | None = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ - clock: dict[str, StaticClockData | None] | None = None + partition: StaticPartition | None = None + clock: StaticClockData | None = None na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( na_to_none From c3be35465a7975ea31aafd7b0cfee023f3c78a52 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Thu, 18 Sep 2025 15:15:25 -0500 Subject: [PATCH 19/38] updated partition, and other calls that look slightly differnt --- .../plugins/inband/amdsmi/amdsmi_collector.py | 265 +++++++++--------- .../plugins/inband/amdsmi/amdsmidata.py | 9 +- 2 files changed, 146 insertions(+), 128 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 50d23d53..90b35617 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -24,12 +24,10 @@ # ############################################################################### import importlib -from typing import cast from pydantic import ValidationError from nodescraper.base.inbandcollectortask import InBandDataCollector -from nodescraper.connection.inband.inband import CommandArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult from nodescraper.plugins.inband.amdsmi.amdsmidata import ( @@ -66,8 +64,6 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): """class for collection of inband tool amd-smi data.""" - AMD_SMI_EXE = "amd-smi" - SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} DATA_MODEL = AmdSmiDataModel @@ -113,7 +109,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: try: version = self._get_amdsmi_version() processes = self.get_process() - partition = self._get_partition() + partition = self.get_partition() firmware = self.get_firmware() gpu_list = self.get_gpu_list() statics = self.get_static() @@ -168,26 +164,6 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None: rocm_version=rocm_ver, ) - def _run_amd_smi(self, cmd: str, sudo: bool = False) -> str | None: - """Run amd-smi command""" - cmd_ret: CommandArtifact = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}", sudo=sudo) - - if cmd_ret.exit_code != 0: - self._log_event( - category=EventCategory.APPLICATION, - description="Error running amd-smi command", - data={ - "command": cmd, - "exit_code": cmd_ret.exit_code, - "stderr": cmd_ret.stderr, - }, - priority=EventPriority.ERROR, - console_log=True, - ) - return None - - return cmd_ret.stdout or "" - def get_gpu_list(self) -> list[AmdSmiListItem] | None: devices = self._get_handles() out: list[AmdSmiListItem] = [] @@ -238,48 +214,68 @@ def _to_int(x, default=0): def get_process(self) -> list[Processes] | None: devices = self._get_handles() out: list[Processes] = [] + for idx, h in enumerate(devices): try: - pids = self._amdsmi.amdsmi_get_gpu_process_list(h) or [] + raw_list = ( + self._smi_try(self._amdsmi.amdsmi_get_gpu_process_list, h, default=[]) or [] + ) plist: list[ProcessListItem] = [] - for pid in pids: - pinfo = self._smi_try( - self._amdsmi.amdsmi_get_gpu_compute_process_info, h, pid, default=None - ) - if not isinstance(pinfo, dict): - plist.append(ProcessListItem(process_info=str(pid))) + for entry in raw_list: + if not isinstance(entry, dict): + plist.append(ProcessListItem(process_info=str(entry))) continue - plist.append( - ProcessListItem( - process_info=cast( - ProcessInfo, - { - "name": pinfo.get("name", str(pid)), - "pid": int(pid), - "memory_usage": { - "gtt_mem": ValueUnit( - value=pinfo.get("gtt_mem", 0), unit="B" - ), - "cpu_mem": ValueUnit( - value=pinfo.get("cpu_mem", 0), unit="B" - ), - "vram_mem": ValueUnit( - value=pinfo.get("vram_mem", 0), unit="B" - ), - }, - "mem_usage": ValueUnit( - value=pinfo.get("vram_mem", 0), unit="B" - ), - "usage": { - "gfx": ValueUnit(value=pinfo.get("gfx", 0), unit="%"), - "enc": ValueUnit(value=pinfo.get("enc", 0), unit="%"), - }, - }, + name = entry.get("name", "N/A") + pid_val = entry.get("pid", 0) + try: + pid = int(pid_val) if pid_val not in (None, "") else 0 + except Exception: + pid = 0 + + mem_vu = self._vu(entry.get("mem"), "B") + mu = entry.get("memory_usage") or {} + mem_usage = { + "gtt_mem": self._vu(mu.get("gtt_mem"), "B"), + "cpu_mem": self._vu(mu.get("cpu_mem"), "B"), + "vram_mem": self._vu(mu.get("vram_mem"), "B"), + } + + eu = entry.get("engine_usage") or {} + usage = { + "gfx": self._vu(eu.get("gfx"), "ns"), + "enc": self._vu(eu.get("enc"), "ns"), + } + + cu_occ = self._vu(entry.get("cu_occupancy"), "") + + try: + plist.append( + ProcessListItem( + process_info=ProcessInfo( + name=str(name), + pid=pid, + mem=mem_vu, + memory_usage=mem_usage, + usage=usage, + cu_occupancy=cu_occ, + ) ) ) - ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build ProcessListItem; skipping entry", + data={ + "exception": get_exception_traceback(e), + "gpu_index": idx, + "entry": repr(entry), + }, + priority=EventPriority.WARNING, + ) + continue + try: out.append(Processes(gpu=idx, process_list=plist)) except ValidationError as e: @@ -296,36 +292,71 @@ def get_process(self) -> list[Processes] | None: data={"exception": get_exception_traceback(e), "gpu_index": idx}, priority=EventPriority.WARNING, ) + return out - def _get_partition(self) -> Partition | None: + def get_partition(self) -> Partition | None: devices = self._get_handles() current: list[PartitionCurrent] = [] memparts: list[PartitionMemory] = [] - resources: list[dict] = [] # keep as-is if your model allows + resources: list[dict] = [] for idx, h in enumerate(devices): + # compute c = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} - m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} c_dict = c if isinstance(c, dict) else {} + + # memory + m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} m_dict = m if isinstance(m, dict) else {} - current.append( - PartitionCurrent( - gpu_id=idx, - memory=c_dict.get("memory"), - accelerator_type=c_dict.get("accelerator_type"), - accelerator_profile_index=c_dict.get("accelerator_profile_index"), - partition_id=c_dict.get("partition_id"), + prof_list: list[dict] = ( + [] + ) # amdsmi_get_gpu_accelerator_partition_profile -> currently not supported + + try: + current.append( + PartitionCurrent( + gpu_id=idx, + memory=c_dict.get("memory"), + accelerator_type=c_dict.get("accelerator_type"), + accelerator_profile_index=c_dict.get("accelerator_profile_index"), + partition_id=c_dict.get("partition_id"), + ) ) - ) - memparts.append( - PartitionMemory( - gpu_id=idx, - memory_partition_caps=m_dict.get("memory_partition_caps"), - current_partition_id=m_dict.get("current_partition_id"), + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build PartitionCurrent", + data={ + "exception": get_exception_traceback(e), + "gpu_index": idx, + "data": c_dict, + }, + priority=EventPriority.WARNING, ) - ) + + try: + memparts.append( + PartitionMemory( + gpu_id=idx, + memory_partition_caps=m_dict.get("memory_partition_caps"), + current_partition_id=m_dict.get("current_partition_id"), + ) + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build PartitionMemory", + data={ + "exception": get_exception_traceback(e), + "gpu_index": idx, + "data": m_dict, + }, + priority=EventPriority.WARNING, + ) + + resources.append({"gpu_id": idx, "profiles": []}) try: return Partition( @@ -461,21 +492,6 @@ def _nz(val: object, default: str = "unknown") -> str: s = str(val).strip() if val is not None else "" return s if s and s.upper() != "N/A" else default - def _vu(val: object, unit: str) -> ValueUnit | None: - """Build ValueUnit from mixed numeric/string input, else None.""" - if val in (None, "", "N/A"): - return None - try: - if isinstance(val, str): - v = float(val) if any(ch in val for ch in ".eE") else int(val) - elif isinstance(val, float): - v = val - else: - v = int(val) - except Exception: - return None - return ValueUnit(value=v, unit=unit) - pcie_fn = getattr(self._amdsmi, "amdsmi_get_pcie_info", None) out: list[AmdSmiStatic] = [] @@ -496,8 +512,8 @@ def _vu(val: object, unit: str) -> ValueUnit | None: pcie_ver = p.get("pcie_version") or p.get("pcie_interface_version") bus = StaticBus( bdf=bdf, - max_pcie_width=_vu(max_w, "x"), - max_pcie_speed=_vu(max_s, "GT/s"), + max_pcie_width=self._vu(max_w, "x"), + max_pcie_speed=self._vu(max_s, "GT/s"), pcie_interface_version=_nz(pcie_ver), slot_type=_nz(p.get("slot_type")), ) @@ -602,8 +618,8 @@ def _vu(val: object, unit: str) -> ValueUnit | None: vram_model = StaticVram( type=vram_type, vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor), - size=_vu(vram_size_b, "B"), - bit_width=_vu(vram_bits, "bit"), + size=self._vu(vram_size_b, "B"), + bit_width=self._vu(vram_bits, "bit"), max_bandwidth=None, ) @@ -757,28 +773,6 @@ def _get_cache_info(self, h) -> list[StaticCacheInfoItem]: items = raw if isinstance(raw, list) else [raw] - def _to_num(v) -> float | int | None: - if isinstance(v, (int, float)): - return v - if isinstance(v, str): - s = v.strip() - try: - return int(s) - except Exception: - try: - return float(s) - except Exception: - return None - return None - - def _vu_req(v) -> ValueUnit: - n = _to_num(v) - return ValueUnit(value=0 if n is None else n, unit="") - - def _vu_opt(v) -> ValueUnit | None: - n = _to_num(v) - return None if n is None else ValueUnit(value=n, unit="") - def _as_list_str(v) -> list[str]: if isinstance(v, list): return [str(x) for x in v] @@ -792,10 +786,10 @@ def _as_list_str(v) -> list[str]: if not isinstance(e, dict): continue - cache_level = _vu_req(e.get("cache_level")) - max_num_cu_shared = _vu_req(e.get("max_num_cu_shared")) - num_cache_instance = _vu_req(e.get("num_cache_instance")) - cache_size = _vu_opt(e.get("cache_size")) + cache_level = self._vu(e.get("cache_level"), "", required=True) + max_num_cu_shared = self._vu(e.get("max_num_cu_shared"), "", required=True) + num_cache_instance = self._vu(e.get("num_cache_instance"), "", required=True) + cache_size = self._vu(e.get("cache_size"), "", required=False) cache_props = _as_list_str(e.get("cache_properties")) # AMDSMI doesn’t give a name , "Lable_" as the label??? @@ -824,10 +818,8 @@ def _as_list_str(v) -> list[str]: return out - def _get_clock(self, h) -> StaticClockData | None: - """ - """ + """ """ fn = getattr(self._amdsmi, "amdsmi_get_clk_freq", None) clk_type = getattr(self._amdsmi, "AmdSmiClkType", None) if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): @@ -868,8 +860,6 @@ def _fmt(n: int | None) -> str | None: except ValidationError: return None - - def collect_data( self, args=None, @@ -902,3 +892,26 @@ def collect_data( self._amdsmi.amdsmi_shut_down() except Exception: pass + + def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | None: + """ + Build ValueUnit from mixed numeric/string input. + Returns: + None for None/''/'N/A' unless required=True, in which case ValueUnit(0, unit). + """ + if v in (None, "", "N/A"): + return ValueUnit(value=0, unit=unit) if required else None + try: + if isinstance(v, str): + s = v.strip() + try: + n = int(s) + except Exception: + n = float(s) + elif isinstance(v, (int, float)): + n = v + else: + n = int(v) + except Exception: + return ValueUnit(value=0, unit=unit) if required else None + return ValueUnit(value=n, unit=unit) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index ae206b95..dcfcb0b7 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -195,14 +195,17 @@ class CoherentTable(Enum): SELF = "SELF" +# Process class ProcessMemoryUsage(BaseModel): gtt_mem: ValueUnit | None cpu_mem: ValueUnit | None vram_mem: ValueUnit | None + na_validator = field_validator("gtt_mem", "cpu_mem", "vram_mem", mode="before")(na_to_none) class ProcessUsage(BaseModel): + # AMDSMI reports engine usage in nanoseconds gfx: ValueUnit | None enc: ValueUnit | None na_validator = field_validator("gfx", "enc", mode="before")(na_to_none) @@ -211,10 +214,12 @@ class ProcessUsage(BaseModel): class ProcessInfo(BaseModel): name: str pid: int + + mem: ValueUnit | None = None memory_usage: ProcessMemoryUsage - mem_usage: ValueUnit | None usage: ProcessUsage - na_validator = field_validator("mem_usage", mode="before")(na_to_none) + cu_occupancy: ValueUnit | None = None + na_validator = field_validator("mem", "cu_occupancy", mode="before")(na_to_none) class ProcessListItem(BaseModel): From 7faf0f389a77d213a4e55e2e26f03fbf6d50bdb2 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Fri, 19 Sep 2025 13:20:00 -0500 Subject: [PATCH 20/38] fixed partition(compute,gpu), static needs work --- .../plugins/inband/amdsmi/amdsmi_collector.py | 136 ++++++++---------- .../plugins/inband/amdsmi/amdsmidata.py | 44 +++--- 2 files changed, 82 insertions(+), 98 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 90b35617..3f5f22ff 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -38,7 +38,7 @@ Fw, FwListItem, Partition, - PartitionCurrent, + PartitionCompute, PartitionMemory, Processes, ProcessInfo, @@ -179,16 +179,9 @@ def _to_int(x, default=0): uuid = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} + # partition is will be supported in amdsmi_get_gpu_accelerator_partition_profile. + # Currently returns hardcoded empty values partition_id = 0 - cp = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} - if isinstance(cp, dict) and cp.get("partition_id") is not None: - partition_id = _to_int(cp.get("partition_id"), 0) - else: - mp = ( - self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} - ) - if isinstance(mp, dict) and mp.get("current_partition_id") is not None: - partition_id = _to_int(mp.get("current_partition_id"), 0) try: out.append( @@ -297,73 +290,62 @@ def get_process(self) -> list[Processes] | None: def get_partition(self) -> Partition | None: devices = self._get_handles() - current: list[PartitionCurrent] = [] memparts: list[PartitionMemory] = [] - resources: list[dict] = [] + computeparts: list[PartitionCompute] = [] for idx, h in enumerate(devices): # compute - c = self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} - c_dict = c if isinstance(c, dict) else {} + compute_partition = ( + self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} + ) # memory - m = self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} - m_dict = m if isinstance(m, dict) else {} + memory_partition = ( + self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} + ) - prof_list: list[dict] = ( - [] - ) # amdsmi_get_gpu_accelerator_partition_profile -> currently not supported + # accelerator partion currently hardcoded to compty values in API try: - current.append( - PartitionCurrent( + memparts.append( + PartitionMemory( gpu_id=idx, - memory=c_dict.get("memory"), - accelerator_type=c_dict.get("accelerator_type"), - accelerator_profile_index=c_dict.get("accelerator_profile_index"), - partition_id=c_dict.get("partition_id"), + partition_type=memory_partition, ) ) except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, - description="Failed to build PartitionCurrent", + description="Failed to build PartitionMemory", data={ "exception": get_exception_traceback(e), "gpu_index": idx, - "data": c_dict, + "data": memory_partition, }, priority=EventPriority.WARNING, ) try: - memparts.append( - PartitionMemory( + computeparts.append( + PartitionCompute( gpu_id=idx, - memory_partition_caps=m_dict.get("memory_partition_caps"), - current_partition_id=m_dict.get("current_partition_id"), + partition_type=compute_partition, ) ) except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, - description="Failed to build PartitionMemory", + description="Failed to build PartitionCompute", data={ "exception": get_exception_traceback(e), "gpu_index": idx, - "data": m_dict, + "data": compute_partition, }, priority=EventPriority.WARNING, ) - resources.append({"gpu_id": idx, "profiles": []}) - try: - return Partition( - current_partition=current, - memory_partition=memparts, - partition_resources=resources, - ) + return Partition(memory_partition=memparts, compute_partition=computeparts) except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, @@ -382,38 +364,19 @@ def get_firmware(self) -> list[Fw] | None: if raw is None: continue - if isinstance(raw, list): - items = raw - elif isinstance(raw, dict): - if isinstance(raw.get("fw_list"), list): - items = raw["fw_list"] - elif raw and all(not isinstance(v, (dict, list, tuple)) for v in raw.values()): - items = [{"fw_id": k, "fw_version": v} for k, v in raw.items()] - else: - items = [raw] - else: - items = [] + items = raw["fw_list"] normalized: list[FwListItem] = [] for e in items: if isinstance(e, dict): - fid = ( - e.get("fw_id") - or e.get("fw_name") - or e.get("name") - or e.get("block") - or e.get("type") - or e.get("id") - ) - ver = e.get("fw_version") or e.get("version") or e.get("fw_ver") or e.get("ver") + fid = e.get("fw_name") + ver = e.get("fw_version") normalized.append( FwListItem( - fw_id="" if fid is None else str(fid), + fw_name="" if fid is None else str(fid), fw_version="" if ver is None else str(ver), ) ) - elif isinstance(e, (tuple, list)) and len(e) >= 2: - normalized.append(FwListItem(fw_id=str(e[0]), fw_version=str(e[1]))) else: self._log_event( category=EventCategory.APPLICATION, @@ -487,11 +450,6 @@ def get_static(self) -> list[AmdSmiStatic] | None: if not devices: return [] - def _nz(val: object, default: str = "unknown") -> str: - """Normalize possibly-empty/NA strings to a non-empty default.""" - s = str(val).strip() if val is not None else "" - return s if s and s.upper() != "N/A" else default - pcie_fn = getattr(self._amdsmi, "amdsmi_get_pcie_info", None) out: list[AmdSmiStatic] = [] @@ -507,15 +465,15 @@ def _nz(val: object, default: str = "unknown") -> str: if callable(pcie_fn): p = self._smi_try(pcie_fn, h, default={}) or {} if isinstance(p, dict): - max_w = p.get("max_link_width") - max_s = p.get("max_link_speed") - pcie_ver = p.get("pcie_version") or p.get("pcie_interface_version") + max_w = p.get("max_pcie_width") + max_s = p.get("max_pcie_speed") + pcie_ver = p.get("pcie_interface_version") bus = StaticBus( bdf=bdf, max_pcie_width=self._vu(max_w, "x"), max_pcie_speed=self._vu(max_s, "GT/s"), - pcie_interface_version=_nz(pcie_ver), - slot_type=_nz(p.get("slot_type")), + pcie_interface_version=self._nz(pcie_ver), + slot_type=self._nz(p.get("slot_type"), slot_type=True), ) else: bus = StaticBus( @@ -523,7 +481,7 @@ def _nz(val: object, default: str = "unknown") -> str: max_pcie_width=None, max_pcie_speed=None, pcie_interface_version="unknown", - slot_type="unknown", + slot_type="Unknown", ) else: bus = StaticBus( @@ -531,12 +489,12 @@ def _nz(val: object, default: str = "unknown") -> str: max_pcie_width=None, max_pcie_speed=None, pcie_interface_version="unknown", - slot_type="unknown", + slot_type="Unknown", ) # ASIC asic_model = StaticAsic( - market_name=_nz(asic.get("market_name") or asic.get("asic_name"), default=""), + market_name=self._nz(asic.get("market_name") or asic.get("asic_name"), default=""), vendor_id=str(asic.get("vendor_id", "")), vendor_name=str(asic.get("vendor_name", "")), subvendor_id=str(asic.get("subvendor_id", "")), @@ -566,8 +524,8 @@ def _nz(val: object, default: str = "unknown") -> str: if callable(drv_fn): drv = self._smi_try(drv_fn, h, default={}) or {} driver_model = StaticDriver( - name=_nz(drv.get("driver_name"), default="unknown"), - version=_nz(drv.get("driver_version"), default="unknown"), + name=self._nz(drv.get("driver_name"), default="unknown"), + version=self._nz(drv.get("driver_version"), default="unknown"), ) # VBIOS @@ -915,3 +873,25 @@ def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | No except Exception: return ValueUnit(value=0, unit=unit) if required else None return ValueUnit(value=n, unit=unit) + + def _nz(self, val: object, default: str = "unknown", *, slot_type: bool = False) -> str: + """ + Normalize strings: + - Generic: return trimmed value unless empty/'N/A', else `default`. + - slot_type=True: map to one of {'OAM','PCIE','CEM','Unknown'}. + """ + s = str(val).strip() if val is not None else "" + if not s or s.upper() == "N/A": + return "Unknown" if slot_type else default + + if slot_type: + u = s.upper().replace(" ", "").replace("-", "") + if u == "OAM": + return "OAM" + if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"): + return "PCIE" + if u == "CEM": + return "CEM" + return "Unknown" + + return s diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index dcfcb0b7..be1f11c9 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -233,8 +233,8 @@ class Processes(BaseModel): # FW class FwListItem(BaseModel): - fw_id: str fw_version: str + fw_name: str class Fw(BaseModel): @@ -273,8 +273,8 @@ def _stringify(cls, v): return str(v) -class PartitionCurrent(BaseModel): - """Contains the Current Partition data for the GPUs""" +class PartitionAccelerator(BaseModel): + """Contains the tition data for the GPUs""" gpu_id: int memory: str | None = None @@ -287,33 +287,37 @@ class PartitionMemory(BaseModel): """Memory Partition data""" gpu_id: int - memory_partition_caps: str | None = None - current_partition_id: str | None = None + partition_type: str | None = None -class PartitionProfiles(AmdSmiBaseModel): - """Partition Profiles data""" +class PartitionCompute(BaseModel): + """Compute Partition data""" gpu_id: int - profile_index: str | None = None - memory_partition_caps: str | None = None - accelerator_type: str | None = None - partition_id: str | None = None - num_partitions: str | None = None - num_resources: str | None = None - resource_index: str | None = None - resource_type: str | None = None - resource_instances: str | None = None - resources_shared: str | None = None + partition_type: str | None = None + + +# class PartitionProfiles(AmdSmiBaseModel): +# """Partition Profiles data""" +# +# gpu_id: int +# profile_index: str | None = None +# memory_partition_caps: str | None = None +# accelerator_type: str | None = None +# partition_id: str | None = None +# num_partitions: str | None = None +# num_resources: str | None = None +# resource_index: str | None = None +# resource_type: str | None = None +# resource_instances: str | None = None +# resources_shared: str | None = None class Partition(BaseModel): """Contains the partition info for amd-smi""" - current_partition: list[PartitionCurrent] = Field(default_factory=list) memory_partition: list[PartitionMemory] = Field(default_factory=list) - partition_profiles: list[dict] = Field(default_factory=list) - partition_resources: list[dict] = Field(default_factory=list) + compute_partition: list[PartitionCompute] = Field(default_factory=list) ### STATIC DATA ### From f4a40645f00b4bfc6160d9344033847ad84d570f Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Mon, 22 Sep 2025 11:15:47 -0500 Subject: [PATCH 21/38] fixed measuring units, mypy --- .../plugins/inband/amdsmi/amdsmi_collector.py | 336 ++++++++++-------- 1 file changed, 190 insertions(+), 146 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 3f5f22ff..54c74d7f 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -24,6 +24,7 @@ # ############################################################################### import importlib +from typing import Any, Optional, Union, cast from pydantic import ValidationError @@ -43,6 +44,8 @@ Processes, ProcessInfo, ProcessListItem, + ProcessMemoryUsage, + ProcessUsage, StaticAsic, StaticBoard, StaticBus, @@ -68,8 +71,66 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel + _amdsmi: Any | None = None # dynamic import + + def _amdsmi_mod(self) -> Any: + assert self._amdsmi is not None, "amdsmi module not bound" + return self._amdsmi + + def _to_number(self, v: object) -> Union[int, float] | None: + if v in (None, "", "N/A"): + return None + try: + if isinstance(v, (int, float)): + return v + if isinstance(v, str): + s = v.strip() + try: + return int(s) + except Exception: + return float(s) + return float(str(v)) + except Exception: + return None + + def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | None: + """ + Build ValueUnit from mixed numeric/string input. + None/''/'N/A' -> None unless required=True (then 0{unit}) + """ + n = self._to_number(v) + if n is None: + return ValueUnit(value=0, unit=unit) if required else None + return ValueUnit(value=n, unit=unit) + + def _vu_req(self, v: object, unit: str) -> ValueUnit: + vu = self._vu(v, unit, required=True) + assert vu is not None + return vu + + def _nz(self, val: object, default: str = "unknown", *, slot_type: bool = False) -> str: + """ + Normalize strings: + - Generic: return trimmed value unless empty/'N/A', else `default`. + - slot_type=True: map to one of {'OAM','PCIE','CEM','Unknown'}. + """ + s = str(val).strip() if val is not None else "" + if not s or s.upper() == "N/A": + return "Unknown" if slot_type else default + + if slot_type: + u = s.upper().replace(" ", "").replace("-", "") + if u == "OAM": + return "OAM" + if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"): + return "PCIE" + if u == "CEM": + return "CEM" + return "Unknown" + + return s + def _bind_amdsmi_or_log(self) -> bool: - """Import amdsmi and store the module on self. Return True if ok.""" if getattr(self, "_amdsmi", None) is not None: return True try: @@ -86,9 +147,10 @@ def _bind_amdsmi_or_log(self) -> bool: return False def _get_handles(self): + amdsmi = self._amdsmi_mod() try: - return self._amdsmi.amdsmi_get_processor_handles() - except self._amdsmi.AmdSmiException as e: + return amdsmi.amdsmi_get_processor_handles() + except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] self._log_event( category=EventCategory.APPLICATION, description="amdsmi_get_processor_handles failed", @@ -99,13 +161,6 @@ def _get_handles(self): return [] def _get_amdsmi_data(self) -> AmdSmiDataModel | None: - """Returns amd-smi tool data formatted as a AmdSmiDataModel object - - Returns None if tool is not installed or if drivers are not loaded - - Returns: - Union[AmdSmiDataModel, None]: AmdSmiDataModel object or None on failure - """ try: version = self._get_amdsmi_version() processes = self.get_process() @@ -144,11 +199,11 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: return None def _get_amdsmi_version(self) -> AmdSmiVersion | None: - """Get lib/rocm versions.""" + amdsmi = self._amdsmi_mod() try: - lib_ver = self._amdsmi.amdsmi_get_lib_version() or "" - rocm_ver = self._amdsmi.amdsmi_get_rocm_version() or "" - except self._amdsmi.AmdSmiException as e: + lib_ver = amdsmi.amdsmi_get_lib_version() or "" + rocm_ver = amdsmi.amdsmi_get_rocm_version() or "" + except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] self._log_event( category=EventCategory.APPLICATION, description="Failed to read AMD SMI versions", @@ -165,6 +220,7 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None: ) def get_gpu_list(self) -> list[AmdSmiListItem] | None: + amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[AmdSmiListItem] = [] @@ -175,13 +231,11 @@ def _to_int(x, default=0): return default for idx, h in enumerate(devices): - bdf = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" - uuid = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" - kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} + bdf = self._smi_try(amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" + uuid = self._smi_try(amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" + kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} - # partition is will be supported in amdsmi_get_gpu_accelerator_partition_profile. - # Currently returns hardcoded empty values - partition_id = 0 + partition_id = 0 # no profile id available yet try: out.append( @@ -205,14 +259,13 @@ def _to_int(x, default=0): return out def get_process(self) -> list[Processes] | None: + amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[Processes] = [] for idx, h in enumerate(devices): try: - raw_list = ( - self._smi_try(self._amdsmi.amdsmi_get_gpu_process_list, h, default=[]) or [] - ) + raw_list = self._smi_try(amdsmi.amdsmi_get_gpu_process_list, h, default=[]) or [] plist: list[ProcessListItem] = [] for entry in raw_list: @@ -228,18 +281,19 @@ def get_process(self) -> list[Processes] | None: pid = 0 mem_vu = self._vu(entry.get("mem"), "B") + mu = entry.get("memory_usage") or {} - mem_usage = { - "gtt_mem": self._vu(mu.get("gtt_mem"), "B"), - "cpu_mem": self._vu(mu.get("cpu_mem"), "B"), - "vram_mem": self._vu(mu.get("vram_mem"), "B"), - } + mem_usage = ProcessMemoryUsage( + gtt_mem=self._vu(mu.get("gtt_mem"), "B"), + cpu_mem=self._vu(mu.get("cpu_mem"), "B"), + vram_mem=self._vu(mu.get("vram_mem"), "B"), + ) eu = entry.get("engine_usage") or {} - usage = { - "gfx": self._vu(eu.get("gfx"), "ns"), - "enc": self._vu(eu.get("enc"), "ns"), - } + usage = ProcessUsage( + gfx=self._vu(eu.get("gfx"), "ns"), + enc=self._vu(eu.get("enc"), "ns"), + ) cu_occ = self._vu(entry.get("cu_occupancy"), "") @@ -278,7 +332,7 @@ def get_process(self) -> list[Processes] | None: data={"exception": get_exception_traceback(e), "gpu_index": idx}, priority=EventPriority.WARNING, ) - except self._amdsmi.AmdSmiException as e: + except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] self._log_event( category=EventCategory.APPLICATION, description="Process collection failed", @@ -289,28 +343,31 @@ def get_process(self) -> list[Processes] | None: return out def get_partition(self) -> Partition | None: + amdsmi = self._amdsmi_mod() devices = self._get_handles() memparts: list[PartitionMemory] = [] computeparts: list[PartitionCompute] = [] for idx, h in enumerate(devices): - # compute compute_partition = ( - self._smi_try(self._amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} + self._smi_try(amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} ) - - # memory memory_partition = ( - self._smi_try(self._amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} + self._smi_try(amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} ) - # accelerator partion currently hardcoded to compty values in API + mem_pt: Optional[str] = None + if isinstance(memory_partition, dict): + mem_pt = cast(Optional[str], memory_partition.get("partition_type")) + comp_pt: Optional[str] = None + if isinstance(compute_partition, dict): + comp_pt = cast(Optional[str], compute_partition.get("partition_type")) try: memparts.append( PartitionMemory( gpu_id=idx, - partition_type=memory_partition, + partition_type=mem_pt, ) ) except ValidationError as e: @@ -329,7 +386,7 @@ def get_partition(self) -> Partition | None: computeparts.append( PartitionCompute( gpu_id=idx, - partition_type=compute_partition, + partition_type=comp_pt, ) ) except ValidationError as e: @@ -356,12 +413,17 @@ def get_partition(self) -> Partition | None: return None def get_firmware(self) -> list[Fw] | None: + amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[Fw] = [] for idx, h in enumerate(devices): - raw = self._smi_try(self._amdsmi.amdsmi_get_fw_info, h, default=None) - if raw is None: + raw = self._smi_try(amdsmi.amdsmi_get_fw_info, h, default=None) + if ( + not isinstance(raw, dict) + or "fw_list" not in raw + or not isinstance(raw["fw_list"], list) + ): continue items = raw["fw_list"] @@ -381,7 +443,7 @@ def get_firmware(self) -> list[Fw] | None: self._log_event( category=EventCategory.APPLICATION, description="Unrecognized firmware entry shape", - data={"entry_repr": repr(e)}, + data={"entry_shape": repr(e)}, priority=EventPriority.INFO, ) @@ -398,12 +460,10 @@ def get_firmware(self) -> list[Fw] | None: return out def _smi_try(self, fn, *a, default=None, **kw): - """Call an AMDSMI function and normalize common library errors. - Extracts numeric ret_code from exceptions that don't expose a .status enum. - """ + amdsmi = self._amdsmi_mod() try: return fn(*a, **kw) - except self._amdsmi.AmdSmiException as e: + except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] self.logger.warning(e) code = getattr(e, "ret_code", None) if code is None: @@ -446,51 +506,54 @@ def _smi_try(self, fn, *a, default=None, **kw): return default def get_static(self) -> list[AmdSmiStatic] | None: + amdsmi = self._amdsmi_mod() devices = self._get_handles() if not devices: return [] - pcie_fn = getattr(self._amdsmi, "amdsmi_get_pcie_info", None) + pcie_fn = getattr(amdsmi, "amdsmi_get_pcie_info", None) out: list[AmdSmiStatic] = [] for idx, h in enumerate(devices): - board = self._smi_try(self._amdsmi.amdsmi_get_gpu_board_info, h, default={}) or {} - asic = self._smi_try(self._amdsmi.amdsmi_get_gpu_asic_info, h, default={}) or {} - bdf = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" - _ = self._smi_try(self._amdsmi.amdsmi_get_gpu_device_uuid, h, default="") # TODO - kfd = self._smi_try(self._amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} + board = self._smi_try(amdsmi.amdsmi_get_gpu_board_info, h, default={}) or {} + asic = self._smi_try(amdsmi.amdsmi_get_gpu_asic_info, h, default={}) or {} + bdf = self._smi_try(amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" + kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} # Bus / PCIe + bus = StaticBus( + bdf=bdf, + max_pcie_width=None, + max_pcie_speed=None, + pcie_interface_version="unknown", + slot_type="Unknown", + ) + if callable(pcie_fn): p = self._smi_try(pcie_fn, h, default={}) or {} - if isinstance(p, dict): - max_w = p.get("max_pcie_width") - max_s = p.get("max_pcie_speed") - pcie_ver = p.get("pcie_interface_version") + d = p.get("pcie_static", p) if isinstance(p, dict) else {} + + if isinstance(d, dict): + max_w = d.get("max_pcie_width") + max_s = d.get("max_pcie_speed") + pcie_ver = d.get("pcie_interface_version") + + # MT/s -> GT/s + ms_val = self._to_number(max_s) + gtps = ( + (cast(float, ms_val) / 1000.0) + if (isinstance(ms_val, (int, float)) and ms_val >= 1000) + else ms_val + ) + bus = StaticBus( bdf=bdf, max_pcie_width=self._vu(max_w, "x"), - max_pcie_speed=self._vu(max_s, "GT/s"), + max_pcie_speed=self._vu(gtps, "GT/s"), pcie_interface_version=self._nz(pcie_ver), - slot_type=self._nz(p.get("slot_type"), slot_type=True), + slot_type=self._nz(d.get("slot_type"), slot_type=True), ) - else: - bus = StaticBus( - bdf=bdf, - max_pcie_width=None, - max_pcie_speed=None, - pcie_interface_version="unknown", - slot_type="Unknown", - ) - else: - bus = StaticBus( - bdf=bdf, - max_pcie_width=None, - max_pcie_speed=None, - pcie_interface_version="unknown", - slot_type="Unknown", - ) # ASIC asic_model = StaticAsic( @@ -520,7 +583,7 @@ def get_static(self) -> list[AmdSmiStatic] | None: # Driver driver_model = None - drv_fn = getattr(self._amdsmi, "amdsmi_get_gpu_driver_info", None) + drv_fn = getattr(amdsmi, "amdsmi_get_gpu_driver_info", None) if callable(drv_fn): drv = self._smi_try(drv_fn, h, default={}) or {} driver_model = StaticDriver( @@ -593,7 +656,7 @@ def get_static(self) -> list[AmdSmiStatic] | None: asic=asic_model, bus=bus, vbios=vbios_model, - limit=None, # not available via API + limit=None, driver=driver_model, board=board_model, soc_pstate=soc_pstate_model, @@ -618,7 +681,8 @@ def get_static(self) -> list[AmdSmiStatic] | None: return out def _get_soc_pstate(self, h) -> StaticSocPstate | None: - fn = getattr(self._amdsmi, "amdsmi_get_soc_pstate", None) + amdsmi = self._amdsmi_mod() + fn = getattr(amdsmi, "amdsmi_get_soc_pstate", None) if not callable(fn): self._log_event( category=EventCategory.APPLICATION, @@ -671,7 +735,8 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: return None def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: - fn = getattr(self._amdsmi, "amdsmi_get_xgmi_plpd", None) + amdsmi = self._amdsmi_mod() + fn = getattr(amdsmi, "amdsmi_get_xgmi_plpd", None) if not callable(fn): self._log_event( category=EventCategory.APPLICATION, @@ -724,12 +789,12 @@ def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: return None def _get_cache_info(self, h) -> list[StaticCacheInfoItem]: - """Map amdsmi_get_gpu_cache_info -> List[StaticCacheInfoItem].""" - raw = self._smi_try(self._amdsmi.amdsmi_get_gpu_cache_info, h, default=None) - if raw is None: + amdsmi = self._amdsmi_mod() + raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, h, default=None) + if not isinstance(raw, dict) or not isinstance(raw.get("cache"), list): return [] - items = raw if isinstance(raw, list) else [raw] + items = raw["cache"] def _as_list_str(v) -> list[str]: if isinstance(v, list): @@ -744,14 +809,16 @@ def _as_list_str(v) -> list[str]: if not isinstance(e, dict): continue - cache_level = self._vu(e.get("cache_level"), "", required=True) - max_num_cu_shared = self._vu(e.get("max_num_cu_shared"), "", required=True) - num_cache_instance = self._vu(e.get("num_cache_instance"), "", required=True) + cache_level = self._vu_req(e.get("cache_level"), "") + max_num_cu_shared = self._vu_req(e.get("max_num_cu_shared"), "") + num_cache_instance = self._vu_req(e.get("num_cache_instance"), "") cache_size = self._vu(e.get("cache_size"), "", required=False) cache_props = _as_list_str(e.get("cache_properties")) - # AMDSMI doesn’t give a name , "Lable_" as the label??? - cache_label_val = f"Lable_{int(cache_level.value) if isinstance(cache_level.value, (int, float)) else cache_level.value}" + lvl_val = cache_level.value + cache_label_val = ( + f"Lable_{int(lvl_val) if isinstance(lvl_val, (int, float)) else lvl_val}" + ) cache_label = ValueUnit(value=cache_label_val, unit="") try: @@ -777,9 +844,9 @@ def _as_list_str(v) -> list[str]: return out def _get_clock(self, h) -> StaticClockData | None: - """ """ - fn = getattr(self._amdsmi, "amdsmi_get_clk_freq", None) - clk_type = getattr(self._amdsmi, "AmdSmiClkType", None) + amdsmi = self._amdsmi_mod() + fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) + clk_type = getattr(amdsmi, "AmdSmiClkType", None) if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): return None @@ -788,13 +855,25 @@ def _get_clock(self, h) -> StaticClockData | None: return None freqs_raw = data.get("frequency") - if not isinstance(freqs_raw, list): + if not isinstance(freqs_raw, list) or not freqs_raw: return None + def _to_mhz(v: object) -> int | None: + x = self._to_number(v) + if x is None: + return None + xf = float(x) + if xf >= 1e7: + return int(round(xf / 1_000_000.0)) + if xf >= 1e4: + return int(round(xf / 1_000.0)) + return int(round(xf)) + freqs_mhz: list[int] = [] for v in freqs_raw: - if isinstance(v, (int, float)): - freqs_mhz.append(int(round(float(v) / 1_000_000.0))) + mhz = _to_mhz(v) + if mhz is not None: + freqs_mhz.append(mhz) if not freqs_mhz: return None @@ -807,13 +886,22 @@ def _fmt(n: int | None) -> str | None: level2: str | None = _fmt(freqs_mhz[2]) if len(freqs_mhz) > 2 else None cur_raw = data.get("current") - try: - current: int | None = None if cur_raw in (None, "", "N/A") else int(cur_raw) - except Exception: + current: int | None + if isinstance(cur_raw, (int, float)): + current = int(cur_raw) + elif isinstance(cur_raw, str) and cur_raw.strip() and cur_raw.upper() != "N/A": + try: + current = int(cur_raw.strip()) + except Exception: + current = None + else: current = None try: - levels = StaticFrequencyLevels(Level_0=level0, Level_1=level1, Level_2=level2) + levels = StaticFrequencyLevels.model_validate( + {"Level 0": level0, "Level 1": level1, "Level 2": level2} + ) + return StaticClockData(frequency=levels, current=current) except ValidationError: return None @@ -827,8 +915,9 @@ def collect_data( self.result.status = ExecutionStatus.NOT_RAN return self.result, None + amdsmi = self._amdsmi_mod() try: - self._amdsmi.amdsmi_init(self._amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS) + amdsmi.amdsmi_init(amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS) # type: ignore[attr-defined] amd_smi_data = self._get_amdsmi_data() if amd_smi_data is None: @@ -847,51 +936,6 @@ def collect_data( return self.result, None finally: try: - self._amdsmi.amdsmi_shut_down() + amdsmi.amdsmi_shut_down() except Exception: pass - - def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | None: - """ - Build ValueUnit from mixed numeric/string input. - Returns: - None for None/''/'N/A' unless required=True, in which case ValueUnit(0, unit). - """ - if v in (None, "", "N/A"): - return ValueUnit(value=0, unit=unit) if required else None - try: - if isinstance(v, str): - s = v.strip() - try: - n = int(s) - except Exception: - n = float(s) - elif isinstance(v, (int, float)): - n = v - else: - n = int(v) - except Exception: - return ValueUnit(value=0, unit=unit) if required else None - return ValueUnit(value=n, unit=unit) - - def _nz(self, val: object, default: str = "unknown", *, slot_type: bool = False) -> str: - """ - Normalize strings: - - Generic: return trimmed value unless empty/'N/A', else `default`. - - slot_type=True: map to one of {'OAM','PCIE','CEM','Unknown'}. - """ - s = str(val).strip() if val is not None else "" - if not s or s.upper() == "N/A": - return "Unknown" if slot_type else default - - if slot_type: - u = s.upper().replace(" ", "").replace("-", "") - if u == "OAM": - return "OAM" - if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"): - return "PCIE" - if u == "CEM": - return "CEM" - return "Unknown" - - return s From 315c7d4394f37123d17c4c8b0cce0d470fe0801f Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Mon, 22 Sep 2025 16:00:46 -0500 Subject: [PATCH 22/38] added more analyzer parts --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 218 +++++++++++++++++- .../plugins/inband/amdsmi/amdsmi_plugin.py | 8 +- .../plugins/inband/amdsmi/analyzer_args.py | 21 +- 3 files changed, 228 insertions(+), 19 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 1f263cb0..d781fce2 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -29,7 +29,7 @@ from nodescraper.interfaces import DataAnalyzer from nodescraper.models import TaskResult -from .amdsmidata import AmdSmiDataModel, Fw, Partition, Processes +from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes from .analyzer_args import AmdSmiAnalyzerArgs @@ -41,6 +41,75 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3 L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1 + def check_expected_max_power( + self, + amdsmi_static_data: list[AmdSmiStatic], + expected_max_power: int, + ): + """Check the max power for all GPUs. If the max power is not as expected, log an error event""" + incorrect_max_power_gpus: dict[int, int | str | float] = {} + for gpu in amdsmi_static_data: + if gpu.limit is None or gpu.limit.max_power is None: + self._log_event( + category=EventCategory.PLATFORM, + description=f"GPU: {gpu.gpu} has no max power limit set", + priority=EventPriority.WARNING, + data={"gpu": gpu.gpu}, + ) + continue + max_power_value = gpu.limit.max_power.value + try: + max_power_float = float(max_power_value) + except ValueError: + self._log_event( + category=EventCategory.PLATFORM, + description=f"GPU: {gpu.gpu} has an invalid max power limit", + priority=EventPriority.ERROR, + data={ + "gpu": gpu.gpu, + "max_power_value": max_power_value, + }, + ) + continue + if max_power_float != expected_max_power: + incorrect_max_power_gpus[gpu.gpu] = max_power_float + if incorrect_max_power_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Max power mismatch", + priority=EventPriority.ERROR, + data={ + "gpus": list(incorrect_max_power_gpus.keys()), + "max_power_values": incorrect_max_power_gpus, + "expected_max_power": expected_max_power, + }, + ) + + def check_expected_driver_version( + self, + amdsmi_static_data: list[AmdSmiStatic], + expected_driver_version: str, + ): + bad_driver_gpus = [] + for gpu in amdsmi_static_data: + if gpu.driver.version != expected_driver_version: + bad_driver_gpus.append(gpu.gpu) + if bad_driver_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="Driver Version Mismatch", + priority=EventPriority.ERROR, + data={ + "gpus": bad_driver_gpus, + "driver_version": { + gpu.gpu: gpu.driver.version + for gpu in amdsmi_static_data + if gpu.gpu in bad_driver_gpus + }, + "expected_driver_version": expected_driver_version, + }, + ) + def expected_gpu_processes( self, processes_data: list[Processes] | None, max_num_processes: int ): @@ -63,7 +132,7 @@ def expected_gpu_processes( # Skip if there are no processes or the process info is a string which indicates no processes continue - process_count = len(process.process_list) # Number of processes for GPU + process_count = len(process.process_list) if process_count > max_num_processes: gpu_exceeds_num_processes[process.gpu] = process_count @@ -78,6 +147,85 @@ def expected_gpu_processes( console_log=True, ) + def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): + """Check the static data for all GPUs. If the static data is not consistent, log an error event""" + consistancy_data: dict[str, set[str] | set[int]] = { + "market_name": {gpu.asic.market_name for gpu in amdsmi_static_data}, + "vendor_id": {gpu.asic.vendor_id for gpu in amdsmi_static_data}, + "vendor_name": {gpu.asic.vendor_name for gpu in amdsmi_static_data}, + "subvendor_id": {gpu.asic.subvendor_id for gpu in amdsmi_static_data}, + "subsystem_id": {gpu.asic.subsystem_id for gpu in amdsmi_static_data}, + "device_id": {gpu.asic.device_id for gpu in amdsmi_static_data}, + "rev_id": {gpu.asic.rev_id for gpu in amdsmi_static_data}, + "num_compute_units": {gpu.asic.num_compute_units for gpu in amdsmi_static_data}, + "target_graphics_version": { + gpu.asic.target_graphics_version for gpu in amdsmi_static_data + }, + } + for key, value in consistancy_data.items(): + if len(value) > 1: + self._log_event( + category=EventCategory.PLATFORM, + description=f"{key} is not consistent across all GPUs", + priority=EventPriority.ERROR, + data={ + "field": key, + "non_consistent_values": value, + }, + ) + + def check_static_data( + self, + amdsmi_static_data: list[AmdSmiStatic], + vendor_id: str | None, + subvendor_id: str | None, + device_id: tuple[str | None, str | None], + subsystem_id: tuple[str | None, str | None], + sku_name: str, + ): + mismatch_gpus: list[tuple[int, str, str]] = [] + expected_data: dict[str, str | None] = { + "vendor_id": vendor_id, + "subvendor_id": subvendor_id, + "vendor_name": "Advanced Micro Devices Inc", + "market_name": sku_name, + } + for gpu_data in amdsmi_static_data: + for key in expected_data: + collected_data: dict[str, str] = { + "vendor_id": gpu_data.asic.vendor_id, + "subvendor_id": gpu_data.asic.subvendor_id, + "vendor_name": gpu_data.asic.vendor_name, + "market_name": sku_name, + } + if expected_data[key] is not None: + if expected_data[key] not in collected_data[key]: + mismatch_gpus.append((gpu_data.gpu, key, collected_data[key])) + break + if device_id[0] is not None and device_id[1] is not None: + if ( + device_id[0].upper() not in gpu_data.asic.device_id.upper() + and device_id[1].upper() not in gpu_data.asic.device_id.upper() + ): + mismatch_gpus.append((gpu_data.gpu, "device_id", gpu_data.asic.device_id)) + if subsystem_id[0] is not None and subsystem_id[1] is not None: + if ( + subsystem_id[0].upper() not in gpu_data.asic.subsystem_id.upper() + and subsystem_id[1].upper() not in gpu_data.asic.subsystem_id.upper() + ): + mismatch_gpus.append((gpu_data.gpu, "subsystem_id", gpu_data.asic.subsystem_id)) + if mismatch_gpus: + self._log_event( + category=EventCategory.PLATFORM, + description="amd-smi static data mismatch", + priority=EventPriority.ERROR, + data={ + "gpus": [data[0] for data in mismatch_gpus], + "key": [data[1] for data in mismatch_gpus], + "collected_data": [data[2] for data in mismatch_gpus], + }, + ) + def check_pldm_version( self, amdsmi_fw_data: list[Fw] | None, @@ -98,9 +246,9 @@ def check_pldm_version( for fw_data in amdsmi_fw_data: gpu = fw_data.gpu for fw_info in fw_data.fw_list: - if PLDM_STRING == fw_info.fw_id and expected_pldm_version != fw_info.fw_version: + if PLDM_STRING == fw_info.fw_name and expected_pldm_version != fw_info.fw_version: mismatched_gpus.append(gpu) - if PLDM_STRING == fw_info.fw_id: + if PLDM_STRING == fw_info.fw_name: break else: pldm_missing_gpus.append(gpu) @@ -131,21 +279,32 @@ def check_expected_memory_partition_mode( ) return bad_memory_partition_mode_gpus = [] - for partition_current in partition_data.current_partition: + for partition_current in partition_data.memory_partition: if ( expected_memory_partition_mode is not None - and partition_current.memory != expected_memory_partition_mode - ) or ( + and partition_current.partition_type != expected_memory_partition_mode + ): + bad_memory_partition_mode_gpus.append( + { + "gpu_id": partition_current.gpu_id, + "memory_partition_mode": partition_current.partition_type, + } + ) + + for partition_current in partition_data.compute_partition: + if ( expected_compute_partition_mode is not None - and partition_current.accelerator_type != expected_compute_partition_mode + and partition_current.partition_type != expected_compute_partition_mode ): bad_memory_partition_mode_gpus.append( { "gpu_id": partition_current.gpu_id, - "compute_partition_mode": partition_current.accelerator_type, - "memory_partition_mode": partition_current.memory, + "compute_partition_mode": partition_current.partition_type, } ) + + # accelerator currently not avaialbe in API + if bad_memory_partition_mode_gpus: self._log_event( category=EventCategory.PLATFORM, @@ -163,13 +322,52 @@ def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult: if args is None: args = AmdSmiAnalyzerArgs() + if args.l0_to_recovery_count_error_threshold is None: + args.l0_to_recovery_count_error_threshold = self.L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD + if args.l0_to_recovery_count_warning_threshold is None: + args.l0_to_recovery_count_warning_threshold = ( + self.L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD + ) + if args.expected_gpu_processes: self.expected_gpu_processes(data.process, args.expected_gpu_processes) + + if data.static is None or len(data.static) == 0: + self._log_event( + category=EventCategory.PLATFORM, + description="No AMD SMI static data available", + priority=EventPriority.WARNING, + data={"amdsmi_static_data": data.static}, + ) + else: + if args.expected_max_power: + self.check_expected_max_power(data.static, args.expected_max_power) + if args.expected_driver_version: + self.check_expected_driver_version(data.static, args.expected_driver_version) if args.expected_memory_partition_mode or args.expected_compute_partition_mode: self.check_expected_memory_partition_mode( data.partition, args.expected_memory_partition_mode, args.expected_compute_partition_mode, ) + self.static_consistancy_check(data.static) + if ( + self.system_info.sku + and args.devid_ep + and args.devid_ep_vf + and args.vendorid_ep + and args.check_static_data + ) or args.check_static_data: + self.check_static_data( + data.static, + args.vendorid_ep, + args.vendorid_ep, + (args.devid_ep, args.devid_ep), + (args.devid_ep, args.devid_ep), + sku_name=args.sku_name, + ) + + if args.expected_pldm_version: + self.check_pldm_version(data.firmware, args.expected_pldm_version) return self.result diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py index fa652822..67eda944 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py @@ -25,13 +25,19 @@ ############################################################################### from nodescraper.base import InBandDataPlugin +from .amdsmi_analyzer import AmdSmiAnalyzer from .amdsmi_collector import AmdSmiCollector from .amdsmidata import AmdSmiDataModel +from .analyzer_args import AmdSmiAnalyzerArgs -class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, None]): +class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, AmdSmiAnalyzerArgs]): """Plugin for collection and analysis of amdsmi data""" DATA_MODEL = AmdSmiDataModel COLLECTOR = AmdSmiCollector + + ANALYZER = AmdSmiAnalyzer + + ANALYZER_ARGS = AmdSmiAnalyzerArgs diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 7e6dbf3f..b0e10343 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -30,11 +30,16 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): - check_static_data: bool = False - expected_gpu_processes: Optional[int] = None - expected_max_power: Optional[int] = None - expected_driver_version: Optional[str] = None - expected_memory_partition_mode: Optional[str] = None - expected_compute_partition_mode: Optional[str] = None - expected_pldm_version: Optional[str] = None - xgmi_speed_override: Optional[float] = None + check_static_data: bool = True + expected_gpu_processes: Optional[int] = 12 + expected_max_power: Optional[int] = 2 + expected_driver_version: Optional[str] = "5" + expected_memory_partition_mode: Optional[str] = "test" + expected_compute_partition_mode: Optional[str] = "test2" + expected_pldm_version: Optional[str] = "test3" + l0_to_recovery_count_error_threshold: Optional[int] = 1 + l0_to_recovery_count_warning_threshold: Optional[int] = 2 + vendorid_ep: Optional["str"] = "vendorid_ep" + vendorid_ep_vf: Optional["str"] = "vendorid_ep_vf" + devid_ep: Optional["str"] = "devid_ep" + sku_name: Optional["str"] = "sku_name" From f652fe3089d76826e9946f858e269aa396f312ed Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Tue, 23 Sep 2025 11:01:35 -0500 Subject: [PATCH 23/38] fixed payload for static data mismatch --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 113 ++++++++++++------ .../plugins/inband/amdsmi/amdsmidata.py | 89 +------------- 2 files changed, 81 insertions(+), 121 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index d781fce2..184fb689 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -23,7 +23,8 @@ # SOFTWARE. # ############################################################################### - +from collections import defaultdict +from typing import Any, Dict, List from nodescraper.enums import EventCategory, EventPriority from nodescraper.interfaces import DataAnalyzer @@ -89,11 +90,18 @@ def check_expected_driver_version( self, amdsmi_static_data: list[AmdSmiStatic], expected_driver_version: str, - ): - bad_driver_gpus = [] + ) -> None: + bad_driver_gpus: list[int] = [] + + versions_by_gpu: dict[int, str | None] = {} for gpu in amdsmi_static_data: - if gpu.driver.version != expected_driver_version: + ver: str | None = None + if gpu.driver is not None: + ver = gpu.driver.version + versions_by_gpu[gpu.gpu] = ver + if ver != expected_driver_version: bad_driver_gpus.append(gpu.gpu) + if bad_driver_gpus: self._log_event( category=EventCategory.PLATFORM, @@ -101,11 +109,7 @@ def check_expected_driver_version( priority=EventPriority.ERROR, data={ "gpus": bad_driver_gpus, - "driver_version": { - gpu.gpu: gpu.driver.version - for gpu in amdsmi_static_data - if gpu.gpu in bad_driver_gpus - }, + "driver_version": {g: versions_by_gpu[g] for g in bad_driver_gpus}, "expected_driver_version": expected_driver_version, }, ) @@ -182,50 +186,91 @@ def check_static_data( device_id: tuple[str | None, str | None], subsystem_id: tuple[str | None, str | None], sku_name: str, - ): - mismatch_gpus: list[tuple[int, str, str]] = [] + ) -> None: + mismatches: list[tuple[int, str, str, str]] = [] expected_data: dict[str, str | None] = { "vendor_id": vendor_id, "subvendor_id": subvendor_id, "vendor_name": "Advanced Micro Devices Inc", "market_name": sku_name, } + for gpu_data in amdsmi_static_data: - for key in expected_data: - collected_data: dict[str, str] = { - "vendor_id": gpu_data.asic.vendor_id, - "subvendor_id": gpu_data.asic.subvendor_id, - "vendor_name": gpu_data.asic.vendor_name, - "market_name": sku_name, - } - if expected_data[key] is not None: - if expected_data[key] not in collected_data[key]: - mismatch_gpus.append((gpu_data.gpu, key, collected_data[key])) - break + collected_data: dict[str, str] = { + "vendor_id": gpu_data.asic.vendor_id, + "subvendor_id": gpu_data.asic.subvendor_id, + "vendor_name": gpu_data.asic.vendor_name, + "market_name": sku_name, + } + + for key, expected in expected_data.items(): + if expected is None: + continue + actual = collected_data[key] + if expected not in actual: + mismatches.append((gpu_data.gpu, key, expected, actual)) + break + if device_id[0] is not None and device_id[1] is not None: + dev_actual = gpu_data.asic.device_id if ( - device_id[0].upper() not in gpu_data.asic.device_id.upper() - and device_id[1].upper() not in gpu_data.asic.device_id.upper() + device_id[0].upper() not in dev_actual.upper() + and device_id[1].upper() not in dev_actual.upper() ): - mismatch_gpus.append((gpu_data.gpu, "device_id", gpu_data.asic.device_id)) + mismatches.append( + (gpu_data.gpu, "device_id", f"{device_id[0]}|{device_id[1]}", dev_actual) + ) + if subsystem_id[0] is not None and subsystem_id[1] is not None: + subsys_actual = gpu_data.asic.subsystem_id if ( - subsystem_id[0].upper() not in gpu_data.asic.subsystem_id.upper() - and subsystem_id[1].upper() not in gpu_data.asic.subsystem_id.upper() + subsystem_id[0].upper() not in subsys_actual.upper() + and subsystem_id[1].upper() not in subsys_actual.upper() ): - mismatch_gpus.append((gpu_data.gpu, "subsystem_id", gpu_data.asic.subsystem_id)) - if mismatch_gpus: + mismatches.append( + ( + gpu_data.gpu, + "subsystem_id", + f"{subsystem_id[0]}|{subsystem_id[1]}", + subsys_actual, + ) + ) + + if mismatches: + payload = self._format_static_mismatch_payload(mismatches) self._log_event( category=EventCategory.PLATFORM, description="amd-smi static data mismatch", priority=EventPriority.ERROR, - data={ - "gpus": [data[0] for data in mismatch_gpus], - "key": [data[1] for data in mismatch_gpus], - "collected_data": [data[2] for data in mismatch_gpus], - }, + data=payload, ) + def _format_static_mismatch_payload( + self, + mismatches: List[tuple[int, str, str, str]], + ) -> Dict[str, Any]: + """ """ + per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list) + field_set: set[str] = set() + + for gpu, field, expected, actual in mismatches: + field_set.add(field) + per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual}) + + per_gpu_list: List[Dict[str, Any]] = [ + {"gpu": gpu, "mismatches": entries} + for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0]) + ] + + return { + "summary": { + "gpus_affected": len(per_gpu), + "fields": sorted(field_set), + "total_mismatches": sum(len(v) for v in per_gpu.values()), + }, + "per_gpu": per_gpu_list, + } + def check_pldm_version( self, amdsmi_fw_data: list[Fw] | None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index be1f11c9..ea4b6bcb 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1,9 +1,7 @@ import re -from enum import Enum from typing import Any, List, Mapping from pydantic import ( - AliasChoices, BaseModel, ConfigDict, Field, @@ -38,7 +36,7 @@ def na_to_none_dict(values: object) -> dict[str, Any] | None: return None if isinstance(values, str) and values.strip().upper() in {"N/A", "NA", ""}: return None - if not isinstance(values, Mapping): # guard: pydantic may pass non-dicts in 'before' mode + if not isinstance(values, Mapping): return None out: dict[str, Any] = {} @@ -141,60 +139,6 @@ def _clean_unit(cls, u): return "" if u is None else str(u).strip() -class EccState(Enum): - ENABLED = "ENABLED" - DISABLED = "DISABLED" - NONE = "NONE" - PARITY = "PARITY" - SING_C = "SING_C" - MULT_UC = "MULT_UC" - POISON = "POISON" - NA = "N/A" - - -### LINK DATA ### - - -class LinkStatusTable(Enum): - UP = "U" - DOWN = "D" - DISABLED = "X" - - -class BiDirectionalTable(Enum): - SELF = "SELF" - TRUE = "T" - - -class DmaTable(Enum): - SELF = "SELF" - TRUE = "T" - - -class AtomicsTable(Enum): - SELF = "SELF" - TRUE = "64,32" - THIRTY_TWO = "32" - SIXTY_FOUR = "64" - - -class LinkTypes(Enum): - XGMI = "XGMI" - PCIE = "PCIE" - SELF = "SELF" - - -class AccessTable(Enum): - ENABLED = "ENABLED" - DISABLED = "DISABLED" - - -class CoherentTable(Enum): - COHERANT = "C" - NON_COHERANT = "NC" - SELF = "SELF" - - # Process class ProcessMemoryUsage(BaseModel): gtt_mem: ValueUnit | None @@ -297,22 +241,6 @@ class PartitionCompute(BaseModel): partition_type: str | None = None -# class PartitionProfiles(AmdSmiBaseModel): -# """Partition Profiles data""" -# -# gpu_id: int -# profile_index: str | None = None -# memory_partition_caps: str | None = None -# accelerator_type: str | None = None -# partition_id: str | None = None -# num_partitions: str | None = None -# num_resources: str | None = None -# resource_index: str | None = None -# resource_type: str | None = None -# resource_instances: str | None = None -# resources_shared: str | None = None - - class Partition(BaseModel): """Contains the partition info for amd-smi""" @@ -393,21 +321,9 @@ class StaticBoard(BaseModel): manufacturer_name: str -class StaticRas(BaseModel): - eeprom_version: str - parity_schema: EccState - single_bit_schema: EccState - double_bit_schema: EccState - poison_schema: EccState - ecc_block_state: dict[str, EccState] - - class StaticPartition(BaseModel): - # The name for compute_partition has changed we will support both for now - compute_partition: str = Field( - validation_alias=AliasChoices("compute_partition", "accelerator_partition") - ) + compute_partition: str memory_partition: str partition_id: int @@ -483,7 +399,6 @@ class AmdSmiStatic(BaseModel): limit: StaticLimit | None driver: StaticDriver | None board: StaticBoard - # ras: StaticRas soc_pstate: StaticSocPstate | None xgmi_plpd: StaticXgmiPlpd | None process_isolation: str From b217c248e3738f47ab7d00200a86961ac6469859 Mon Sep 17 00:00:00 2001 From: Alex Bara Date: Tue, 23 Sep 2025 14:01:57 -0500 Subject: [PATCH 24/38] temporarily removed the pytest + some cleanup --- .../plugins/inband/amdsmi/analyzer_args.py | 26 +-- test/unit/plugin/test_amdsmi_collector.py | 218 ------------------ 2 files changed, 13 insertions(+), 231 deletions(-) delete mode 100644 test/unit/plugin/test_amdsmi_collector.py diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index b0e10343..f7dfa683 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -30,16 +30,16 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): - check_static_data: bool = True - expected_gpu_processes: Optional[int] = 12 - expected_max_power: Optional[int] = 2 - expected_driver_version: Optional[str] = "5" - expected_memory_partition_mode: Optional[str] = "test" - expected_compute_partition_mode: Optional[str] = "test2" - expected_pldm_version: Optional[str] = "test3" - l0_to_recovery_count_error_threshold: Optional[int] = 1 - l0_to_recovery_count_warning_threshold: Optional[int] = 2 - vendorid_ep: Optional["str"] = "vendorid_ep" - vendorid_ep_vf: Optional["str"] = "vendorid_ep_vf" - devid_ep: Optional["str"] = "devid_ep" - sku_name: Optional["str"] = "sku_name" + check_static_data: bool = False + expected_gpu_processes: Optional[int] = None + expected_max_power: Optional[int] = None + expected_driver_version: Optional[str] = None + expected_memory_partition_mode: Optional[str] = None + expected_compute_partition_mode: Optional[str] = None + expected_pldm_version: Optional[str] = None + l0_to_recovery_count_error_threshold: Optional[int] = None + l0_to_recovery_count_warning_threshold: Optional[int] = None + vendorid_ep: Optional["str"] = None + vendorid_ep_vf: Optional["str"] = None + devid_ep: Optional["str"] = None + sku_name: Optional["str"] = None diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py deleted file mode 100644 index 7bdbf82d..00000000 --- a/test/unit/plugin/test_amdsmi_collector.py +++ /dev/null @@ -1,218 +0,0 @@ -import types - -import pytest -from pydantic import BaseModel - -import nodescraper.plugins.inband.amdsmi.amdsmi_collector as mod -from nodescraper.enums.systeminteraction import SystemInteractionLevel -from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector - - -@pytest.fixture -def collector(system_info, conn_mock): - c = AmdSmiCollector( - system_info=system_info, - system_interaction_level=SystemInteractionLevel.PASSIVE, - connection=conn_mock, - ) - c._events = [] - - def _log_event(**kwargs): - c._events.append(kwargs) - - c._log_event = _log_event - c.result = types.SimpleNamespace(status=None) - c.logger = types.SimpleNamespace( - log=lambda *a, **k: None, - warning=lambda *a, **k: None, - info=lambda *a, **k: None, - error=lambda *a, **k: None, - ) - - return c - - -class FakeAmdSmiException(Exception): - """Stand-in for amdsmi.AmdSmiException.""" - - -def set_handles(monkeypatch, handles): - monkeypatch.setattr(mod, "amdsmi_get_processor_handles", lambda: handles) - - -def test_get_handles_success(monkeypatch, collector): - handles = ["h0", "h1"] - set_handles(monkeypatch, handles) - assert collector._get_handles() == handles - assert collector._events == [] - - -def test_get_amdsmi_version(monkeypatch, collector): - monkeypatch.setattr(mod, "amdsmi_get_lib_version", lambda: "25.3.0") - monkeypatch.setattr(mod, "amdsmi_get_rocm_version", lambda: "6.4.0") - v = collector._get_amdsmi_version() - assert v is not None - assert v.version == "25.3.0" - assert v.rocm_version == "6.4.0" - - -def test_get_gpu_list_with_compute_partition(monkeypatch, collector): - handles = ["h0", "h1"] - set_handles(monkeypatch, handles) - monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) - - calls = { - "bdf": {"h0": "0000:01:00.0", "h1": "0001:01:00.0"}, - "uuid": {"h0": "U0", "h1": "U1"}, - "kfd": {"h0": {"kfd_id": "7", "node_id": 3}, "h1": {}}, - "cp": {"h0": {"partition_id": "2"}, "h1": {"partition_id": 0}}, - "mp": {"h0": {}, "h1": {}}, - } - - monkeypatch.setattr(mod, "amdsmi_get_gpu_device_bdf", lambda h: calls["bdf"][h]) - monkeypatch.setattr(mod, "amdsmi_get_gpu_device_uuid", lambda h: calls["uuid"][h]) - monkeypatch.setattr(mod, "amdsmi_get_gpu_kfd_info", lambda h: calls["kfd"][h]) - monkeypatch.setattr(mod, "amdsmi_get_gpu_compute_partition", lambda h: calls["cp"][h]) - monkeypatch.setattr(mod, "amdsmi_get_gpu_memory_partition", lambda h: calls["mp"][h]) - - out = collector.get_gpu_list() - assert out == [ - { - "gpu": 0, - "bdf": "0000:01:00.0", - "uuid": "U0", - "kfd_id": 7, - "node_id": 3, - "partition_id": 2, - }, - { - "gpu": 1, - "bdf": "0001:01:00.0", - "uuid": "U1", - "kfd_id": 0, - "node_id": 0, - "partition_id": 0, - }, - ] - - -def test_get_gpu_list_fallback_to_memory_partition(monkeypatch, collector): - handles = ["h0"] - set_handles(monkeypatch, handles) - monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) - - monkeypatch.setattr(mod, "amdsmi_get_gpu_device_bdf", lambda h: "0000:01:00.0") - monkeypatch.setattr(mod, "amdsmi_get_gpu_device_uuid", lambda h: "U0") - monkeypatch.setattr(mod, "amdsmi_get_gpu_kfd_info", lambda h: {"kfd_id": 1, "node_id": "9"}) - - def raise_cp(h): - raise FakeAmdSmiException(2) - - monkeypatch.setattr(mod, "amdsmi_get_gpu_compute_partition", raise_cp) - monkeypatch.setattr( - mod, "amdsmi_get_gpu_memory_partition", lambda h: {"current_partition_id": "4"} - ) - - out = collector.get_gpu_list() - assert out[0]["partition_id"] == 4 - - -def test_get_process_mixed(monkeypatch, collector): - handles = ["h0"] - set_handles(monkeypatch, handles) - monkeypatch.setattr(mod, "amdsmi_get_gpu_process_list", lambda h: [111, 222]) - - def get_info(h, pid): - if pid == 111: - return {"name": "proc111", "vram_mem": 42, "gtt_mem": 1, "cpu_mem": 2} - raise FakeAmdSmiException(2) - - monkeypatch.setattr(mod, "amdsmi_get_gpu_compute_process_info", get_info) - monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) - - out = collector.get_process() - assert out and out[0]["gpu"] == 0 - plist = out[0]["process_list"] - assert plist[0]["process_info"]["name"] == "proc111" - assert plist[1]["process_info"] == "222" - - -def test_get_partition(monkeypatch, collector): - handles = ["h0", "h1"] - set_handles(monkeypatch, handles) - monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) - - monkeypatch.setattr( - mod, "amdsmi_get_gpu_compute_partition", lambda h: {"memory": "X", "partition_id": 1} - ) - monkeypatch.setattr( - mod, - "amdsmi_get_gpu_memory_partition", - lambda h: {"current_partition_id": 1, "memory_partition_caps": [1, 2]}, - ) - - out = collector.get_partition() - assert "current_partition" in out and len(out["current_partition"]) == 2 - assert "memory_partition" in out and len(out["memory_partition"]) == 2 - - -def test_get_firmware_various_shapes(monkeypatch, collector): - handles = ["h0", "h1", "h2"] - set_handles(monkeypatch, handles) - monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) - - fw_map = { - "h0": [{"fw_id": "SMU", "fw_version": "1.2.3"}, {"fw_name": "VBIOS", "version": "abc"}], - "h1": {"fw_list": [{"name": "PMFW", "ver": "9.9"}]}, - "h2": {"SMU": "4.5.6", "XGMI": "7.8.9"}, - } - monkeypatch.setattr(mod, "amdsmi_get_fw_info", lambda h: fw_map[h]) - - out = collector.get_firmware() - assert out and len(out) == 3 - assert out[0]["fw_list"][0] == {"fw_id": "SMU", "fw_version": "1.2.3"} - assert out[0]["fw_list"][1] == {"fw_id": "VBIOS", "fw_version": "abc"} - assert out[1]["fw_list"][0]["fw_id"] in ("PMFW", "name", "") - ids = {e["fw_id"] for e in out[2]["fw_list"]} - assert {"SMU", "XGMI"}.issubset(ids) - - -def test_smi_try_not_supported(monkeypatch, collector): - monkeypatch.setattr(mod, "AmdSmiException", FakeAmdSmiException) - - def fn(): - raise FakeAmdSmiException(2) - - ret = collector._smi_try(fn, default="X") - assert ret == "X" - assert any("not supported" in e["description"] for e in collector._events) - - -def test_collect_data(monkeypatch, collector): - init_called = [] - shut_called = [] - - monkeypatch.setattr(mod, "amdsmi_init", lambda *a, **k: init_called.append(True)) - monkeypatch.setattr(mod, "amdsmi_shut_down", lambda *a, **k: shut_called.append(True)) - monkeypatch.setattr(AmdSmiCollector, "_get_amdsmi_data", lambda self: {"ok": True}) - - res, data = collector.collect_data() - assert data == {"ok": True} - assert init_called and shut_called - - -def test_build_amdsmi_sub_data(collector): - class M(BaseModel): - a: int - - out = collector.build_amdsmi_sub_data(M, [{"a": 1}, {"a": 2}]) - assert [m.a for m in out] == [1, 2] - - out2 = collector.build_amdsmi_sub_data(M, {"a": 5}) - assert out2.a == 5 - - out3 = collector.build_amdsmi_sub_data(M, ["not-a-dict"]) - assert out3 is None - assert any( - "Invalid data type for amd-smi sub data" in e["description"] for e in collector._events - ) From 1e456c3d7cc5bedb0cbfae7ceb4eeeafbd90ce5b Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 26 Sep 2025 11:23:43 -0500 Subject: [PATCH 25/38] updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 39 +++++++------------ 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 54c74d7f..c7fa1688 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -235,7 +235,13 @@ def _to_int(x, default=0): uuid = self._smi_try(amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} - partition_id = 0 # no profile id available yet + kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} + partition_id = 0 + if isinstance(kfd, dict): + try: + partition_id = int(kfd.get("current_partition_id", 0) or 0) + except Exception: + partition_id = 0 try: out.append( @@ -349,26 +355,12 @@ def get_partition(self) -> Partition | None: computeparts: list[PartitionCompute] = [] for idx, h in enumerate(devices): - compute_partition = ( - self._smi_try(amdsmi.amdsmi_get_gpu_compute_partition, h, default={}) or {} - ) - memory_partition = ( - self._smi_try(amdsmi.amdsmi_get_gpu_memory_partition, h, default={}) or {} - ) - - mem_pt: Optional[str] = None - if isinstance(memory_partition, dict): - mem_pt = cast(Optional[str], memory_partition.get("partition_type")) - comp_pt: Optional[str] = None - if isinstance(compute_partition, dict): - comp_pt = cast(Optional[str], compute_partition.get("partition_type")) + mem_pt = self._smi_try(amdsmi.amdsmi_get_gpu_memory_partition, h, default=None) + comp_pt = self._smi_try(amdsmi.amdsmi_get_gpu_compute_partition, h, default=None) try: memparts.append( - PartitionMemory( - gpu_id=idx, - partition_type=mem_pt, - ) + PartitionMemory(gpu_id=idx, partition_type=cast(Optional[str], mem_pt)) ) except ValidationError as e: self._log_event( @@ -377,17 +369,14 @@ def get_partition(self) -> Partition | None: data={ "exception": get_exception_traceback(e), "gpu_index": idx, - "data": memory_partition, + "data": mem_pt, }, priority=EventPriority.WARNING, ) try: computeparts.append( - PartitionCompute( - gpu_id=idx, - partition_type=comp_pt, - ) + PartitionCompute(gpu_id=idx, partition_type=cast(Optional[str], comp_pt)) ) except ValidationError as e: self._log_event( @@ -396,7 +385,7 @@ def get_partition(self) -> Partition | None: data={ "exception": get_exception_traceback(e), "gpu_index": idx, - "data": compute_partition, + "data": comp_pt, }, priority=EventPriority.WARNING, ) @@ -817,7 +806,7 @@ def _as_list_str(v) -> list[str]: lvl_val = cache_level.value cache_label_val = ( - f"Lable_{int(lvl_val) if isinstance(lvl_val, (int, float)) else lvl_val}" + f"Label_{int(lvl_val) if isinstance(lvl_val, (int, float)) else lvl_val}" ) cache_label = ValueUnit(value=cache_label_val, unit="") From 640699419bf392e7db9fedb767fd8a7ee979b41b Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 29 Sep 2025 15:18:15 -0500 Subject: [PATCH 26/38] docstring + mypy --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 92 +++++++++-- .../plugins/inband/amdsmi/amdsmi_collector.py | 144 ++++++++++++++++-- .../plugins/inband/amdsmi/analyzer_args.py | 11 +- 3 files changed, 218 insertions(+), 29 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 184fb689..73d237eb 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -24,7 +24,7 @@ # ############################################################################### from collections import defaultdict -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from nodescraper.enums import EventCategory, EventPriority from nodescraper.interfaces import DataAnalyzer @@ -47,7 +47,12 @@ def check_expected_max_power( amdsmi_static_data: list[AmdSmiStatic], expected_max_power: int, ): - """Check the max power for all GPUs. If the max power is not as expected, log an error event""" + """Check against expected max power + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model + expected_max_power (int): expected max power + """ incorrect_max_power_gpus: dict[int, int | str | float] = {} for gpu in amdsmi_static_data: if gpu.limit is None or gpu.limit.max_power is None: @@ -91,6 +96,12 @@ def check_expected_driver_version( amdsmi_static_data: list[AmdSmiStatic], expected_driver_version: str, ) -> None: + """Check expectecd driver version + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model + expected_driver_version (str): expected driver version + """ bad_driver_gpus: list[int] = [] versions_by_gpu: dict[int, str | None] = {} @@ -117,8 +128,12 @@ def check_expected_driver_version( def expected_gpu_processes( self, processes_data: list[Processes] | None, max_num_processes: int ): - """Check the number of GPU processes running. If the number of processes is greater than the expected - number of processes, log an error event""" + """Check the number of GPU processes running + + Args: + processes_data (list[Processes] | None): list of processes per GPU + max_num_processes (int): max number of expected processes + """ gpu_exceeds_num_processes: dict[int, int] = {} if processes_data is None or len(processes_data) == 0: self._log_event( @@ -133,7 +148,7 @@ def expected_gpu_processes( if len(process.process_list) == 0 or isinstance( process.process_list[0].process_info, str ): - # Skip if there are no processes or the process info is a string which indicates no processes + # Skip if there are no processes continue process_count = len(process.process_list) @@ -152,7 +167,11 @@ def expected_gpu_processes( ) def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): - """Check the static data for all GPUs. If the static data is not consistent, log an error event""" + """Check consistency of expected data + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model + """ consistancy_data: dict[str, set[str] | set[int]] = { "market_name": {gpu.asic.market_name for gpu in amdsmi_static_data}, "vendor_id": {gpu.asic.vendor_id for gpu in amdsmi_static_data}, @@ -185,9 +204,21 @@ def check_static_data( subvendor_id: str | None, device_id: tuple[str | None, str | None], subsystem_id: tuple[str | None, str | None], - sku_name: str, + sku_name: str | None, ) -> None: + """Check expected static data + + Args: + amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data + vendor_id (str | None): expected vendor_id + subvendor_id (str | None): expected subvendor_id + device_id (tuple[str | None, str | None]): expected device_id + subsystem_id (tuple[str | None, str | None]): expected subsystem_id + sku_name (str | None): expected sku_name + """ + mismatches: list[tuple[int, str, str, str]] = [] + expected_data: dict[str, str | None] = { "vendor_id": vendor_id, "subvendor_id": subvendor_id, @@ -200,7 +231,7 @@ def check_static_data( "vendor_id": gpu_data.asic.vendor_id, "subvendor_id": gpu_data.asic.subvendor_id, "vendor_name": gpu_data.asic.vendor_name, - "market_name": sku_name, + "market_name": gpu_data.asic.market_name, } for key, expected in expected_data.items(): @@ -249,7 +280,14 @@ def _format_static_mismatch_payload( self, mismatches: List[tuple[int, str, str, str]], ) -> Dict[str, Any]: - """ """ + """Helper function for pretty printing mismatch in expected data + + Args: + mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU + + Returns: + Dict[str, Any]: dict of mismatched data per GPU + """ per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list) field_set: set[str] = set() @@ -276,7 +314,12 @@ def check_pldm_version( amdsmi_fw_data: list[Fw] | None, expected_pldm_version: str | None, ): - """Check the PLDM version for all GPUs. If the PLDM version is not as expected, log an error event for which GPUs don't have a match""" + """Check expected pldm version + + Args: + amdsmi_fw_data (list[Fw] | None): data model + expected_pldm_version (str | None): expected pldm version + """ PLDM_STRING = "PLDM_BUNDLE" if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0: self._log_event( @@ -316,6 +359,13 @@ def check_expected_memory_partition_mode( expected_memory_partition_mode: str | None, expected_compute_partition_mode: str | None, ): + """Check expected mem partition mode + + Args: + partition_data (Partition | None): data model + expected_memory_partition_mode (str | None): expected mem partition mode + expected_compute_partition_mode (str | None): expected compute partition mode + """ if partition_data is None: self._log_event( category=EventCategory.PLATFORM, @@ -336,15 +386,15 @@ def check_expected_memory_partition_mode( } ) - for partition_current in partition_data.compute_partition: + for compute_current in partition_data.compute_partition: if ( expected_compute_partition_mode is not None - and partition_current.partition_type != expected_compute_partition_mode + and compute_current.partition_type != expected_compute_partition_mode ): bad_memory_partition_mode_gpus.append( { - "gpu_id": partition_current.gpu_id, - "compute_partition_mode": partition_current.partition_type, + "gpu_id": compute_current.gpu_id, + "compute_partition_mode": compute_current.partition_type, } ) @@ -362,7 +412,19 @@ def check_expected_memory_partition_mode( }, ) - def analyze_data(self, data: AmdSmiDataModel, args=None) -> TaskResult: + def analyze_data( + self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None + ) -> TaskResult: + """Analyze the amdsmi data against expected data + + Args: + data (AmdSmiDataModel): the AmdSmi data model + args (_type_, optional): optional AmdSmi analyzer args. Defaults to None. + + Returns: + TaskResult: the result of the analysis indicating weather the AmdSmi data model + matched the expected data + """ if args is None: args = AmdSmiAnalyzerArgs() diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index c7fa1688..13cf77e9 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -65,7 +65,7 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): - """class for collection of inband tool amd-smi data.""" + """Class for collection of inband tool amd-smi data.""" SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} @@ -74,10 +74,23 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): _amdsmi: Any | None = None # dynamic import def _amdsmi_mod(self) -> Any: + """Check for amdsmi installation + + Returns: + Any: local instance of amdsmi module + """ assert self._amdsmi is not None, "amdsmi module not bound" return self._amdsmi def _to_number(self, v: object) -> Union[int, float] | None: + """Helper function to return number from str, float or "N/A" + + Args: + v (object): non number object + + Returns: + Union[int, float] | None: number version of input + """ if v in (None, "", "N/A"): return None try: @@ -94,9 +107,15 @@ def _to_number(self, v: object) -> Union[int, float] | None: return None def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | None: - """ - Build ValueUnit from mixed numeric/string input. - None/''/'N/A' -> None unless required=True (then 0{unit}) + """Build ValueUnit instance from object + + Args: + v (object): object to be turned into ValueUnit + unit (str): unit of measurement + required (bool, optional): bool to force instance creation. Defaults to False. + + Returns: + ValueUnit | None: ValueUnit Instance """ n = self._to_number(v) if n is None: @@ -104,15 +123,28 @@ def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | No return ValueUnit(value=n, unit=unit) def _vu_req(self, v: object, unit: str) -> ValueUnit: + """Helper function to force ValueUnit instance creation + + Args: + v (object): object + unit (str): unit of measurement + + Returns: + ValueUnit: instance of ValueUnit + """ vu = self._vu(v, unit, required=True) assert vu is not None return vu - def _nz(self, val: object, default: str = "unknown", *, slot_type: bool = False) -> str: - """ - Normalize strings: - - Generic: return trimmed value unless empty/'N/A', else `default`. - - slot_type=True: map to one of {'OAM','PCIE','CEM','Unknown'}. + def _nz(self, val: object, default: str = "unknown", slot_type: bool = False) -> str: + """Normalize strings + + Args: + val (object): object + default (str, optional): default option. Defaults to "unknown". + slot_type (bool, optional): map to one of {'OAM','PCIE','CEM','Unknown'}. + Returns: + str: normalized string """ s = str(val).strip() if val is not None else "" if not s or s.upper() == "N/A": @@ -131,6 +163,11 @@ def _nz(self, val: object, default: str = "unknown", *, slot_type: bool = False) return s def _bind_amdsmi_or_log(self) -> bool: + """Bind to local amdsmi lib or log that it is not found + + Returns: + bool: True if module is found, false otherwise + """ if getattr(self, "_amdsmi", None) is not None: return True try: @@ -147,6 +184,11 @@ def _bind_amdsmi_or_log(self) -> bool: return False def _get_handles(self): + """get amdsmi handles + + Returns: + List[c_void_p]: list of processor handles + """ amdsmi = self._amdsmi_mod() try: return amdsmi.amdsmi_get_processor_handles() @@ -161,6 +203,11 @@ def _get_handles(self): return [] def _get_amdsmi_data(self) -> AmdSmiDataModel | None: + """Fill in information for AmdSmi data model + + Returns: + AmdSmiDataModel | None: instance of the AmdSmi data model + """ try: version = self._get_amdsmi_version() processes = self.get_process() @@ -199,6 +246,11 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: return None def _get_amdsmi_version(self) -> AmdSmiVersion | None: + """Check amdsmi library version + + Returns: + AmdSmiVersion | None: version of the library + """ amdsmi = self._amdsmi_mod() try: lib_ver = amdsmi.amdsmi_get_lib_version() or "" @@ -220,6 +272,11 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None: ) def get_gpu_list(self) -> list[AmdSmiListItem] | None: + """Get GPU information from amdsmi lib + + Returns: + list[AmdSmiListItem] | None: list of GPU info items + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[AmdSmiListItem] = [] @@ -265,6 +322,11 @@ def _to_int(x, default=0): return out def get_process(self) -> list[Processes] | None: + """Get process information + + Returns: + list[Processes] | None: list of GPU processes + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[Processes] = [] @@ -349,6 +411,11 @@ def get_process(self) -> list[Processes] | None: return out def get_partition(self) -> Partition | None: + """Check partition information + + Returns: + Partition | None: Partition data if availabe + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() memparts: list[PartitionMemory] = [] @@ -402,6 +469,11 @@ def get_partition(self) -> Partition | None: return None def get_firmware(self) -> list[Fw] | None: + """Get firmware information + + Returns: + list[Fw] | None: List of firmware info per GPU + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[Fw] = [] @@ -449,6 +521,15 @@ def get_firmware(self) -> list[Fw] | None: return out def _smi_try(self, fn, *a, default=None, **kw): + """Helper function to check if amdsmi lib call is availabe + + Args: + fn (function): amdsmi lib function to call + default (_type_, optional): default ret value. Defaults to None. + + Returns: + function call: function call or log error + """ amdsmi = self._amdsmi_mod() try: return fn(*a, **kw) @@ -495,6 +576,11 @@ def _smi_try(self, fn, *a, default=None, **kw): return default def get_static(self) -> list[AmdSmiStatic] | None: + """Get Static info from amdsmi lib + + Returns: + list[AmdSmiStatic] | None: AmdSmiStatic instance or None + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() if not devices: @@ -670,6 +756,14 @@ def get_static(self) -> list[AmdSmiStatic] | None: return out def _get_soc_pstate(self, h) -> StaticSocPstate | None: + """SOC pstate check + + Args: + h (_type_): handle + + Returns: + StaticSocPstate | None: class instance + """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_soc_pstate", None) if not callable(fn): @@ -724,6 +818,14 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: return None def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: + """Check XGMI plpd + + Args: + h (_type_): handle + + Returns: + StaticXgmiPlpd | None: class instance + """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_xgmi_plpd", None) if not callable(fn): @@ -778,6 +880,14 @@ def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: return None def _get_cache_info(self, h) -> list[StaticCacheInfoItem]: + """check cache info + + Args: + h (_type_): handle + + Returns: + list[StaticCacheInfoItem]: class instance + """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, h, default=None) if not isinstance(raw, dict) or not isinstance(raw.get("cache"), list): @@ -833,6 +943,14 @@ def _as_list_str(v) -> list[str]: return out def _get_clock(self, h) -> StaticClockData | None: + """Get clock info + + Args: + h (_type_): handle + + Returns: + StaticClockData | None: class instance + """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) clk_type = getattr(amdsmi, "AmdSmiClkType", None) @@ -899,6 +1017,14 @@ def collect_data( self, args=None, ) -> tuple[TaskResult, AmdSmiDataModel | None]: + """Collect AmdSmi data from system + + Args: + args (_type_, optional): _description_. Defaults to None. + + Returns: + tuple[TaskResult, AmdSmiDataModel | None]: _description_ + """ if not self._bind_amdsmi_or_log(): self.result.status = ExecutionStatus.NOT_RAN diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index f7dfa683..143d6286 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -30,7 +30,7 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): - check_static_data: bool = False + check_static_data: bool = True expected_gpu_processes: Optional[int] = None expected_max_power: Optional[int] = None expected_driver_version: Optional[str] = None @@ -39,7 +39,8 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): expected_pldm_version: Optional[str] = None l0_to_recovery_count_error_threshold: Optional[int] = None l0_to_recovery_count_warning_threshold: Optional[int] = None - vendorid_ep: Optional["str"] = None - vendorid_ep_vf: Optional["str"] = None - devid_ep: Optional["str"] = None - sku_name: Optional["str"] = None + vendorid_ep: Optional[str] = None + vendorid_ep_vf: Optional[str] = None + devid_ep: Optional[str] = None + devid_ep_vf: Optional[str] = None + sku_name: Optional[str] = None From a9b3ed3a812aa85386ab3d1b760d21321fa51d01 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 30 Sep 2025 09:44:21 -0500 Subject: [PATCH 27/38] pytest --- test/unit/plugin/test_amdsmi_collector.py | 315 ++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 test/unit/plugin/test_amdsmi_collector.py diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py new file mode 100644 index 00000000..b046521f --- /dev/null +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -0,0 +1,315 @@ +import importlib +import sys +import types +from typing import Tuple + +import pytest + +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector + + +class _BaseAmdSmiError(Exception): + def __init__(self, ret_code: int, *args): + super().__init__(ret_code, *args) + self.ret_code = ret_code + + +class AmdSmiLibraryError(_BaseAmdSmiError): ... + + +class AmdSmiRetryError(_BaseAmdSmiError): ... + + +class AmdSmiParameterError(_BaseAmdSmiError): ... + + +class AmdSmiTimeoutError(_BaseAmdSmiError): ... + + +def make_fake_amdsmi( + *, + handles: Tuple[object, ...] | None = None, + lib_version="1.2.3", + rocm_version="6.1.0", + pcie_static=True, + raise_on_handles=False, +): + if handles is None: + handles = (object(),) + + m = types.SimpleNamespace() + m.AmdSmiException = _BaseAmdSmiError + m.AmdSmiLibraryException = AmdSmiLibraryError + m.AmdSmiRetryException = AmdSmiRetryError + m.AmdSmiParameterException = AmdSmiParameterError + m.AmdSmiTimeoutException = AmdSmiTimeoutError + + class AmdSmiInitFlags: + INIT_AMD_GPUS = 1 + + m.AmdSmiInitFlags = AmdSmiInitFlags + + class AmdSmiMemoryType: + VRAM = 0 + VIS_VRAM = 1 + GTT = 2 + + m.AmdSmiMemoryType = AmdSmiMemoryType + + def amdsmi_init(_flags): + return None + + def amdsmi_shut_down(): + return None + + m.amdsmi_init = amdsmi_init + m.amdsmi_shut_down = amdsmi_shut_down + + m.amdsmi_get_lib_version = lambda: lib_version + m.amdsmi_get_rocm_version = lambda: rocm_version + + def amdsmi_get_processor_handles(): + if raise_on_handles: + raise AmdSmiLibraryError(5) + return list(handles) + + m.amdsmi_get_processor_handles = amdsmi_get_processor_handles + + m.amdsmi_get_gpu_device_bdf = lambda h: "0000:0b:00.0" + m.amdsmi_get_gpu_device_uuid = lambda h: "GPU-UUID-123" + m.amdsmi_get_gpu_kfd_info = lambda h: { + "kfd_id": 7, + "node_id": 3, + "cpu_affinity": 0xFF, + "current_partition_id": 0, + } + m.amdsmi_get_gpu_board_info = lambda h: { + "vbios_name": "vbiosA", + "vbios_build_date": "2024-01-01", + "vbios_part_number": "PN123", + "vbios_version": "V1", + "model_number": "Board-42", + "product_serial": "SN0001", + "fru_id": "FRU-1", + "product_name": "ExampleBoard", + "manufacturer_name": "ACME", + } + m.amdsmi_get_gpu_asic_info = lambda h: { + "market_name": "SomeGPU", + "vendor_id": "1002", + "vendor_name": "AMD", + "subvendor_id": "1ABC", + "device_id": "0x1234", + "subsystem_id": "0x5678", + "rev_id": "A1", + "asic_serial": "ASERIAL", + "oam_id": 0, + "num_compute_units": 224, + "target_graphics_version": "GFX940", + "vram_type": "HBM3", + "vram_vendor": "Micron", + "vram_bit_width": 4096, + "vram_size_bytes": 64 * 1024 * 1024 * 1024, + } + m.amdsmi_get_gpu_driver_info = lambda h: { + "driver_name": "amdgpu", + "driver_version": "6.1.0", + } + + if pcie_static: + + def amdsmi_get_pcie_info(h): + return { + "pcie_static": { + "max_pcie_width": 16, + "max_pcie_speed": 16000, + "pcie_interface_version": "PCIe 5.0", + "slot_type": "PCIe", + } + } + + m.amdsmi_get_pcie_info = amdsmi_get_pcie_info + + m.amdsmi_get_gpu_cache_info = lambda h: { + "cache": [ + { + "cache_level": 1, + "max_num_cu_shared": 8, + "num_cache_instance": 32, + "cache_size": 256 * 1024, + "cache_properties": "PropertyA, PropertyB; PropertyC", + } + ] + } + + def amdsmi_get_clk_freq(h, clk_type): + return { + "frequency": [500_000_000, 1_500_000_000, 2_000_000_000], + "current": 1, + } + + m.amdsmi_get_clk_freq = amdsmi_get_clk_freq + + m.amdsmi_get_fw_info = lambda h: { + "fw_list": [ + {"fw_name": "SMU", "fw_version": "55.33"}, + {"fw_name": "VBIOS", "fw_version": "V1"}, + ] + } + + m.amdsmi_get_gpu_process_list = lambda h: [ + { + "name": "python", + "pid": 4242, + "mem": 1024, + "engine_usage": {"gfx": 1_000_000, "enc": 0}, + "memory_usage": {"gtt_mem": 0, "cpu_mem": 4096, "vram_mem": 2048}, + "cu_occupancy": 12, + }, + { + "name": "N/A", + "pid": "9999", + "mem": "0", + "engine_usage": {"gfx": "0", "enc": "0"}, + "memory_usage": {"gtt_mem": "0", "cpu_mem": "0", "vram_mem": "0"}, + "cu_occupancy": "0", + }, + ] + + m.amdsmi_get_gpu_memory_partition = lambda h: {"partition_type": "NPS1"} + m.amdsmi_get_gpu_compute_partition = lambda h: {"partition_type": "CPX_DISABLED"} + + return m + + +@pytest.fixture +def install_fake_amdsmi(monkeypatch): + fake = make_fake_amdsmi() + mod = types.ModuleType("amdsmi") + for k, v in fake.__dict__.items(): + setattr(mod, k, v) + monkeypatch.setitem(sys.modules, "amdsmi", mod) + return mod + + +@pytest.fixture +def collector(install_fake_amdsmi, conn_mock, system_info): + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + assert c._bind_amdsmi_or_log() is True + return c + + +def test_collect_data(collector): + result, data = collector.collect_data() + assert data is not None + assert data.version is not None + assert data.version.tool == "amdsmi" + # gpu_list + assert data.gpu_list is not None and len(data.gpu_list) == 1 + assert data.gpu_list[0].bdf == "0000:0b:00.0" + assert data.gpu_list[0].uuid == "GPU-UUID-123" + # processes + assert data.process is not None and len(data.process) == 1 + assert len(data.process[0].process_list) == 2 + # static + assert data.static is not None and len(data.static) == 1 + s = data.static[0] + assert s.bus is not None and s.bus.max_pcie_speed is not None + assert float(s.bus.max_pcie_speed.value) == pytest.approx(16.0) + + +def test_bind_failure(monkeypatch, conn_mock, system_info): + monkeypatch.setattr( + importlib, "import_module", lambda name: (_ for _ in ()).throw(ImportError("nope")) + ) + sys.modules.pop("amdsmi", None) + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + result, data = c.collect_data() + assert data is None + assert result.status.name == "NOT_RAN" + + +def test_handles_exception(monkeypatch, collector): + fake = make_fake_amdsmi(raise_on_handles=True) + mod = types.ModuleType("amdsmi") + for k, v in fake.__dict__.items(): + setattr(mod, k, v) + monkeypatch.setitem(sys.modules, "amdsmi", mod) + collector._amdsmi = mod + + gl = collector.get_gpu_list() + assert gl == [] or gl is None + + gp = collector.get_process() + assert gp == [] or gp is None + + part = collector.get_partition() + assert part is not None + + fw = collector.get_firmware() + assert fw == [] or fw is None + + st = collector.get_static() + assert st == [] or st is None + + +def test_partition(collector, install_fake_amdsmi): + amdsmi = install_fake_amdsmi + amdsmi.amdsmi_get_gpu_memory_partition = lambda h: "NPS2" + amdsmi.amdsmi_get_gpu_compute_partition = lambda h: "CPX_ENABLED" + p = collector.get_partition() + assert p is not None + assert len(p.memory_partition) == 1 and len(p.compute_partition) == 1 + assert p.memory_partition[0].partition_type == "NPS2" + assert p.compute_partition[0].partition_type == "CPX_ENABLED" + + +def test_pcie(collector, install_fake_amdsmi): + if hasattr(install_fake_amdsmi, "amdsmi_get_pcie_info"): + delattr(install_fake_amdsmi, "amdsmi_get_pcie_info") + stat = collector.get_static() + assert stat is not None and len(stat) == 1 + assert stat[0].bus is not None + ms = stat[0].bus.max_pcie_speed + assert ms is None or ms.unit == "GT/s" + + +def test_cache(collector): + stat = collector.get_static() + item = stat[0].cache_info[0] + assert isinstance(item.cache.value, str) and item.cache.value.startswith("Label_") + assert item.cache_properties + assert {"PropertyA", "PropertyB", "PropertyC"}.issubset(set(item.cache_properties)) + + +def test_process_list(collector): + procs = collector.get_process() + assert procs and procs[0].process_list + p0 = procs[0].process_list[0].process_info + assert p0.pid == 4242 + assert p0.mem is not None and p0.mem.unit == "B" + assert p0.usage.gfx is not None and p0.usage.gfx.unit == "ns" + p1 = procs[0].process_list[1].process_info + assert p1.name == "N/A" + assert isinstance(p1.pid, int) + + +def test_smi_try(monkeypatch, install_fake_amdsmi, collector): + def raise_not_supported(*a, **kw): + raise AmdSmiLibraryError(2) # NOT_SUPPORTED + + install_fake_amdsmi.amdsmi_get_gpu_memory_partition = raise_not_supported + + p = collector.get_partition() + assert p is not None + assert len(p.memory_partition) == 1 From 6613dedbeba56903b2bc792783c0d043159a4a2e Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 30 Sep 2025 11:51:34 -0500 Subject: [PATCH 28/38] fixed some tpos --- nodescraper/plugins/inband/amdsmi/amdsmidata.py | 10 +++++----- nodescraper/plugins/inband/amdsmi/analyzer_args.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index ea4b6bcb..36564666 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -62,9 +62,8 @@ class AmdSmiBaseModel(BaseModel): extra="forbid", # Forbid extra fields not defined in the model ) - # During building if a field contains a ValueUnit in its tuple, convert input into a ValueUnit def __init__(self, **data): - # Convert all fields that are supposed to be ValueUnit to ValueUnit if they are int | str | float + # Convert int | str | float -> ValueUnit for field_name, field_type in self.model_fields.items(): annotation = field_type.annotation target_type, container = find_annotation_in_container(annotation, ValueUnit) @@ -72,7 +71,7 @@ def __init__(self, **data): continue if field_name in data and isinstance(data[field_name], (int, str, float)): - # If the field is a primitive type, convert it to ValueUnit dict and let validtor handle it + # If the field is a primitive type, convert it to ValueUnit dict for validator data[field_name] = { "value": data[field_name], "unit": "", @@ -104,7 +103,6 @@ def na(x) -> bool: if na(v): return None - # Dict form: normalize value and possibly extract unit if isinstance(v, dict): val = v.get("value") unit = v.get("unit", "") @@ -218,7 +216,7 @@ def _stringify(cls, v): class PartitionAccelerator(BaseModel): - """Contains the tition data for the GPUs""" + """Accelerator partition data""" gpu_id: int memory: str | None = None @@ -392,6 +390,8 @@ class StaticClockData(BaseModel): class AmdSmiStatic(BaseModel): + """Contains all static data""" + gpu: int asic: StaticAsic bus: StaticBus diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 143d6286..5b88b241 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -30,7 +30,7 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): - check_static_data: bool = True + check_static_data: bool = False expected_gpu_processes: Optional[int] = None expected_max_power: Optional[int] = None expected_driver_version: Optional[str] = None From 0eec9f77d645c2e9898145876c42246fe522bb13 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 11 Nov 2025 13:47:43 -0600 Subject: [PATCH 29/38] addressed reviews --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 68 +++---- .../plugins/inband/amdsmi/amdsmi_collector.py | 171 +++++++++--------- .../plugins/inband/amdsmi/amdsmidata.py | 126 ++++++------- .../plugins/inband/amdsmi/analyzer_args.py | 4 +- nodescraper/utils.py | 8 +- 5 files changed, 186 insertions(+), 191 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 73d237eb..fa575745 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -24,7 +24,7 @@ # ############################################################################### from collections import defaultdict -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from nodescraper.enums import EventCategory, EventPriority from nodescraper.interfaces import DataAnalyzer @@ -39,9 +39,6 @@ class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel - L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD = 3 - L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD = 1 - def check_expected_max_power( self, amdsmi_static_data: list[AmdSmiStatic], @@ -53,7 +50,7 @@ def check_expected_max_power( amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model expected_max_power (int): expected max power """ - incorrect_max_power_gpus: dict[int, int | str | float] = {} + incorrect_max_power_gpus: dict[int, Union[int, str, float]] = {} for gpu in amdsmi_static_data: if gpu.limit is None or gpu.limit.max_power is None: self._log_event( @@ -104,9 +101,9 @@ def check_expected_driver_version( """ bad_driver_gpus: list[int] = [] - versions_by_gpu: dict[int, str | None] = {} + versions_by_gpu: dict[int, Optional[str]] = {} for gpu in amdsmi_static_data: - ver: str | None = None + ver: Optional[str] = None if gpu.driver is not None: ver = gpu.driver.version versions_by_gpu[gpu.gpu] = ver @@ -126,12 +123,12 @@ def check_expected_driver_version( ) def expected_gpu_processes( - self, processes_data: list[Processes] | None, max_num_processes: int + self, processes_data: Optional[list[Processes]], max_num_processes: int ): """Check the number of GPU processes running Args: - processes_data (list[Processes] | None): list of processes per GPU + processes_data (Optional[list[Processes]]): list of processes per GPU max_num_processes (int): max number of expected processes """ gpu_exceeds_num_processes: dict[int, int] = {} @@ -172,7 +169,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): Args: amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model """ - consistancy_data: dict[str, set[str] | set[int]] = { + consistancy_data: dict[str, Union[set[str], set[int]]] = { "market_name": {gpu.asic.market_name for gpu in amdsmi_static_data}, "vendor_id": {gpu.asic.vendor_id for gpu in amdsmi_static_data}, "vendor_name": {gpu.asic.vendor_name for gpu in amdsmi_static_data}, @@ -190,7 +187,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): self._log_event( category=EventCategory.PLATFORM, description=f"{key} is not consistent across all GPUs", - priority=EventPriority.ERROR, + priority=EventPriority.WARNING, data={ "field": key, "non_consistent_values": value, @@ -200,26 +197,26 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): def check_static_data( self, amdsmi_static_data: list[AmdSmiStatic], - vendor_id: str | None, - subvendor_id: str | None, - device_id: tuple[str | None, str | None], - subsystem_id: tuple[str | None, str | None], - sku_name: str | None, + vendor_id: Optional[str], + subvendor_id: Optional[str], + device_id: tuple[Optional[str], Optional[str]], + subsystem_id: tuple[Optional[str], Optional[str]], + sku_name: Optional[str], ) -> None: """Check expected static data Args: amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data - vendor_id (str | None): expected vendor_id - subvendor_id (str | None): expected subvendor_id - device_id (tuple[str | None, str | None]): expected device_id - subsystem_id (tuple[str | None, str | None]): expected subsystem_id - sku_name (str | None): expected sku_name + vendor_id (Optional[str]): expected vendor_id + subvendor_id (Optional[str]): expected subvendor_id + device_id (tuple[Optional[str], Optional[str]]): expected device_id + subsystem_id (tuple[Optional[str], Optional[str]]): expected subsystem_id + sku_name (Optional[str]): expected sku_name """ mismatches: list[tuple[int, str, str, str]] = [] - expected_data: dict[str, str | None] = { + expected_data: Dict[str, Optional[str]] = { "vendor_id": vendor_id, "subvendor_id": subvendor_id, "vendor_name": "Advanced Micro Devices Inc", @@ -311,14 +308,14 @@ def _format_static_mismatch_payload( def check_pldm_version( self, - amdsmi_fw_data: list[Fw] | None, - expected_pldm_version: str | None, + amdsmi_fw_data: Optional[list[Fw]], + expected_pldm_version: Optional[str], ): """Check expected pldm version Args: - amdsmi_fw_data (list[Fw] | None): data model - expected_pldm_version (str | None): expected pldm version + amdsmi_fw_data (Optional[list[Fw]]): data model + expected_pldm_version (Optional[str]): expected pldm version """ PLDM_STRING = "PLDM_BUNDLE" if amdsmi_fw_data is None or len(amdsmi_fw_data) == 0: @@ -355,16 +352,16 @@ def check_pldm_version( def check_expected_memory_partition_mode( self, - partition_data: Partition | None, - expected_memory_partition_mode: str | None, - expected_compute_partition_mode: str | None, + partition_data: Optional[Partition], + expected_memory_partition_mode: Optional[str], + expected_compute_partition_mode: Optional[str], ): """Check expected mem partition mode Args: - partition_data (Partition | None): data model - expected_memory_partition_mode (str | None): expected mem partition mode - expected_compute_partition_mode (str | None): expected compute partition mode + partition_data (Optional[Partition]): data model + expected_memory_partition_mode (Optional[str]): expected mem partition mode + expected_compute_partition_mode (Optional[str]): expected compute partition mode """ if partition_data is None: self._log_event( @@ -429,13 +426,6 @@ def analyze_data( if args is None: args = AmdSmiAnalyzerArgs() - if args.l0_to_recovery_count_error_threshold is None: - args.l0_to_recovery_count_error_threshold = self.L0_TO_RECOVERY_COUNT_ERROR_THRESHOLD - if args.l0_to_recovery_count_warning_threshold is None: - args.l0_to_recovery_count_warning_threshold = ( - self.L0_TO_RECOVERY_COUNT_WARNING_THRESHOLD - ) - if args.expected_gpu_processes: self.expected_gpu_processes(data.process, args.expected_gpu_processes) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 13cf77e9..6ad35398 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -24,7 +24,7 @@ # ############################################################################### import importlib -from typing import Any, Optional, Union, cast +from typing import Any, Callable, Optional, Union, cast from pydantic import ValidationError @@ -71,7 +71,7 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel - _amdsmi: Any | None = None # dynamic import + _amdsmi: Optional[Any] = None # dynamic import def _amdsmi_mod(self) -> Any: """Check for amdsmi installation @@ -82,14 +82,14 @@ def _amdsmi_mod(self) -> Any: assert self._amdsmi is not None, "amdsmi module not bound" return self._amdsmi - def _to_number(self, v: object) -> Union[int, float] | None: + def _to_number(self, v: object) -> Optional[Union[int, float]]: """Helper function to return number from str, float or "N/A" Args: v (object): non number object Returns: - Union[int, float] | None: number version of input + Optional[Union[int, float]]: number version of input """ if v in (None, "", "N/A"): return None @@ -106,7 +106,7 @@ def _to_number(self, v: object) -> Union[int, float] | None: except Exception: return None - def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | None: + def _valueunit(self, v: object, unit: str, *, required: bool = False) -> Optional[ValueUnit]: """Build ValueUnit instance from object Args: @@ -115,14 +115,14 @@ def _vu(self, v: object, unit: str, *, required: bool = False) -> ValueUnit | No required (bool, optional): bool to force instance creation. Defaults to False. Returns: - ValueUnit | None: ValueUnit Instance + Optional[ValueUnit]: ValueUnit Instance """ n = self._to_number(v) if n is None: return ValueUnit(value=0, unit=unit) if required else None return ValueUnit(value=n, unit=unit) - def _vu_req(self, v: object, unit: str) -> ValueUnit: + def _valueunit_req(self, v: object, unit: str) -> ValueUnit: """Helper function to force ValueUnit instance creation Args: @@ -132,17 +132,18 @@ def _vu_req(self, v: object, unit: str) -> ValueUnit: Returns: ValueUnit: instance of ValueUnit """ - vu = self._vu(v, unit, required=True) + vu = self._valueunit(v, unit, required=True) assert vu is not None return vu - def _nz(self, val: object, default: str = "unknown", slot_type: bool = False) -> str: + def _normalize(self, val: object, default: str = "unknown", slot_type: bool = False) -> str: """Normalize strings Args: val (object): object default (str, optional): default option. Defaults to "unknown". - slot_type (bool, optional): map to one of {'OAM','PCIE','CEM','Unknown'}. + slot_type (bool, optional): map to one of {'OAM','PCIE','CEM','Unknown'}. Defaults to False. + Returns: str: normalized string """ @@ -183,11 +184,11 @@ def _bind_amdsmi_or_log(self) -> bool: ) return False - def _get_handles(self): - """get amdsmi handles + def _get_handles(self) -> list[Any]: + """Get amdsmi handles Returns: - List[c_void_p]: list of processor handles + list[Any]: list of processor handles """ amdsmi = self._amdsmi_mod() try: @@ -202,11 +203,11 @@ def _get_handles(self): ) return [] - def _get_amdsmi_data(self) -> AmdSmiDataModel | None: + def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: """Fill in information for AmdSmi data model Returns: - AmdSmiDataModel | None: instance of the AmdSmi data model + Optional[AmdSmiDataModel]: instance of the AmdSmi data model """ try: version = self._get_amdsmi_version() @@ -245,11 +246,11 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: ) return None - def _get_amdsmi_version(self) -> AmdSmiVersion | None: + def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: """Check amdsmi library version Returns: - AmdSmiVersion | None: version of the library + Optional[AmdSmiVersion]: version of the library """ amdsmi = self._amdsmi_mod() try: @@ -271,17 +272,17 @@ def _get_amdsmi_version(self) -> AmdSmiVersion | None: rocm_version=rocm_ver, ) - def get_gpu_list(self) -> list[AmdSmiListItem] | None: + def get_gpu_list(self) -> Optional[list[AmdSmiListItem]]: """Get GPU information from amdsmi lib Returns: - list[AmdSmiListItem] | None: list of GPU info items + Optional[list[AmdSmiListItem]]: list of GPU info items """ amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[AmdSmiListItem] = [] - def _to_int(x, default=0): + def _to_int(x: Any, default: int = 0) -> int: try: return int(x) except Exception: @@ -321,11 +322,11 @@ def _to_int(x, default=0): return out - def get_process(self) -> list[Processes] | None: + def get_process(self) -> Optional[list[Processes]]: """Get process information Returns: - list[Processes] | None: list of GPU processes + Optional[list[Processes]]: list of GPU processes """ amdsmi = self._amdsmi_mod() devices = self._get_handles() @@ -348,22 +349,22 @@ def get_process(self) -> list[Processes] | None: except Exception: pid = 0 - mem_vu = self._vu(entry.get("mem"), "B") + mem_vu = self._valueunit(entry.get("mem"), "B") mu = entry.get("memory_usage") or {} mem_usage = ProcessMemoryUsage( - gtt_mem=self._vu(mu.get("gtt_mem"), "B"), - cpu_mem=self._vu(mu.get("cpu_mem"), "B"), - vram_mem=self._vu(mu.get("vram_mem"), "B"), + gtt_mem=self._valueunit(mu.get("gtt_mem"), "B"), + cpu_mem=self._valueunit(mu.get("cpu_mem"), "B"), + vram_mem=self._valueunit(mu.get("vram_mem"), "B"), ) eu = entry.get("engine_usage") or {} usage = ProcessUsage( - gfx=self._vu(eu.get("gfx"), "ns"), - enc=self._vu(eu.get("enc"), "ns"), + gfx=self._valueunit(eu.get("gfx"), "ns"), + enc=self._valueunit(eu.get("enc"), "ns"), ) - cu_occ = self._vu(entry.get("cu_occupancy"), "") + cu_occ = self._valueunit(entry.get("cu_occupancy"), "") try: plist.append( @@ -410,11 +411,11 @@ def get_process(self) -> list[Processes] | None: return out - def get_partition(self) -> Partition | None: + def get_partition(self) -> Optional[Partition]: """Check partition information Returns: - Partition | None: Partition data if availabe + Optional[Partition]: Partition data if available """ amdsmi = self._amdsmi_mod() devices = self._get_handles() @@ -468,11 +469,11 @@ def get_partition(self) -> Partition | None: ) return None - def get_firmware(self) -> list[Fw] | None: + def get_firmware(self) -> Optional[list[Fw]]: """Get firmware information Returns: - list[Fw] | None: List of firmware info per GPU + Optional[list[Fw]]: List of firmware info per GPU """ amdsmi = self._amdsmi_mod() devices = self._get_handles() @@ -520,15 +521,17 @@ def get_firmware(self) -> list[Fw] | None: return out - def _smi_try(self, fn, *a, default=None, **kw): - """Helper function to check if amdsmi lib call is availabe + def _smi_try(self, fn: Callable[..., Any], *a: Any, default: Any = None, **kw: Any) -> Any: + """Helper function to check if amdsmi lib call is available Args: - fn (function): amdsmi lib function to call - default (_type_, optional): default ret value. Defaults to None. + fn (Callable[..., Any]): amdsmi lib function to call + *a (Any): variable positional arguments to pass to the function + default (Any, optional): default return value. Defaults to None. + **kw (Any): variable keyword arguments to pass to the function Returns: - function call: function call or log error + Any: result of function call or default value on error """ amdsmi = self._amdsmi_mod() try: @@ -550,7 +553,7 @@ def _smi_try(self, fn, *a, default=None, **kw): 6: "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS", 7: "AMDSMI_STATUS_NOT_FOUND", } - name = CODE2NAME.get(code, "unknown") + name = CODE2NAME.get(code, "unknown") if isinstance(code, int) else "unknown" if name in ("AMDSMI_STATUS_NOT_SUPPORTED", "AMDSMI_STATUS_NOT_FOUND"): self._log_event( @@ -575,11 +578,11 @@ def _smi_try(self, fn, *a, default=None, **kw): ) return default - def get_static(self) -> list[AmdSmiStatic] | None: + def get_static(self) -> Optional[list[AmdSmiStatic]]: """Get Static info from amdsmi lib Returns: - list[AmdSmiStatic] | None: AmdSmiStatic instance or None + Optional[list[AmdSmiStatic]]: list of AmdSmiStatic instances or empty list """ amdsmi = self._amdsmi_mod() devices = self._get_handles() @@ -624,15 +627,17 @@ def get_static(self) -> list[AmdSmiStatic] | None: bus = StaticBus( bdf=bdf, - max_pcie_width=self._vu(max_w, "x"), - max_pcie_speed=self._vu(gtps, "GT/s"), - pcie_interface_version=self._nz(pcie_ver), - slot_type=self._nz(d.get("slot_type"), slot_type=True), + max_pcie_width=self._valueunit(max_w, "x"), + max_pcie_speed=self._valueunit(gtps, "GT/s"), + pcie_interface_version=self._normalize(pcie_ver), + slot_type=self._normalize(d.get("slot_type"), slot_type=True), ) # ASIC asic_model = StaticAsic( - market_name=self._nz(asic.get("market_name") or asic.get("asic_name"), default=""), + market_name=self._normalize( + asic.get("market_name") or asic.get("asic_name"), default="" + ), vendor_id=str(asic.get("vendor_id", "")), vendor_name=str(asic.get("vendor_name", "")), subvendor_id=str(asic.get("subvendor_id", "")), @@ -662,8 +667,8 @@ def get_static(self) -> list[AmdSmiStatic] | None: if callable(drv_fn): drv = self._smi_try(drv_fn, h, default={}) or {} driver_model = StaticDriver( - name=self._nz(drv.get("driver_name"), default="unknown"), - version=self._nz(drv.get("driver_version"), default="unknown"), + name=self._normalize(drv.get("driver_name"), default="unknown"), + version=self._normalize(drv.get("driver_version"), default="unknown"), ) # VBIOS @@ -672,7 +677,7 @@ def get_static(self) -> list[AmdSmiStatic] | None: for k in ("vbios_name", "vbios_build_date", "vbios_part_number", "vbios_version") if k in board } - vbios_model: StaticVbios | None = None + vbios_model: Optional[StaticVbios] = None if vb: vbios_model = StaticVbios( name=str(vb.get("vbios_name", "")), @@ -699,7 +704,7 @@ def get_static(self) -> list[AmdSmiStatic] | None: vram_type = str(asic.get("vram_type", "") or "unknown") vram_vendor = asic.get("vram_vendor") vram_bits = asic.get("vram_bit_width") - vram_size_b: int | None = None + vram_size_b: Optional[int] = None if asic.get("vram_size_bytes") is not None: try: vram_size_b = int(asic["vram_size_bytes"]) @@ -714,8 +719,8 @@ def get_static(self) -> list[AmdSmiStatic] | None: vram_model = StaticVram( type=vram_type, vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor), - size=self._vu(vram_size_b, "B"), - bit_width=self._vu(vram_bits, "bit"), + size=self._valueunit(vram_size_b, "B"), + bit_width=self._valueunit(vram_bits, "bit"), max_bandwidth=None, ) @@ -755,14 +760,14 @@ def get_static(self) -> list[AmdSmiStatic] | None: return out - def _get_soc_pstate(self, h) -> StaticSocPstate | None: + def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: """SOC pstate check Args: - h (_type_): handle + handle (Any): GPU device handle Returns: - StaticSocPstate | None: class instance + Optional[StaticSocPstate]: StaticSocPstate instance or None """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_soc_pstate", None) @@ -774,7 +779,7 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: ) return None - data = self._smi_try(fn, h, default=None) + data = self._smi_try(fn, handle, default=None) if not isinstance(data, dict): return None @@ -817,14 +822,14 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: except ValidationError: return None - def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: + def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: """Check XGMI plpd Args: - h (_type_): handle + handle (Any): GPU device handle Returns: - StaticXgmiPlpd | None: class instance + Optional[StaticXgmiPlpd]: StaticXgmiPlpd instance or None """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_xgmi_plpd", None) @@ -836,7 +841,7 @@ def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: ) return None - data = self._smi_try(fn, h, default=None) + data = self._smi_try(fn, handle, default=None) if not isinstance(data, dict): return None @@ -879,23 +884,23 @@ def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: except ValidationError: return None - def _get_cache_info(self, h) -> list[StaticCacheInfoItem]: - """check cache info + def _get_cache_info(self, handle: Any) -> list[StaticCacheInfoItem]: + """Check cache info Args: - h (_type_): handle + handle (Any): GPU device handle Returns: - list[StaticCacheInfoItem]: class instance + list[StaticCacheInfoItem]: list of StaticCacheInfoItem instances """ amdsmi = self._amdsmi_mod() - raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, h, default=None) + raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, handle, default=None) if not isinstance(raw, dict) or not isinstance(raw.get("cache"), list): return [] items = raw["cache"] - def _as_list_str(v) -> list[str]: + def _as_list_str(v: Any) -> list[str]: if isinstance(v, list): return [str(x) for x in v] if isinstance(v, str): @@ -908,10 +913,10 @@ def _as_list_str(v) -> list[str]: if not isinstance(e, dict): continue - cache_level = self._vu_req(e.get("cache_level"), "") - max_num_cu_shared = self._vu_req(e.get("max_num_cu_shared"), "") - num_cache_instance = self._vu_req(e.get("num_cache_instance"), "") - cache_size = self._vu(e.get("cache_size"), "", required=False) + cache_level = self._valueunit_req(e.get("cache_level"), "") + max_num_cu_shared = self._valueunit_req(e.get("max_num_cu_shared"), "") + num_cache_instance = self._valueunit_req(e.get("num_cache_instance"), "") + cache_size = self._valueunit(e.get("cache_size"), "", required=False) cache_props = _as_list_str(e.get("cache_properties")) lvl_val = cache_level.value @@ -942,14 +947,14 @@ def _as_list_str(v) -> list[str]: return out - def _get_clock(self, h) -> StaticClockData | None: + def _get_clock(self, handle: Any) -> Optional[StaticClockData]: """Get clock info Args: - h (_type_): handle + handle (Any): GPU device handle Returns: - StaticClockData | None: class instance + Optional[StaticClockData]: StaticClockData instance or None """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) @@ -957,7 +962,7 @@ def _get_clock(self, h) -> StaticClockData | None: if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): return None - data = self._smi_try(fn, h, clk_type.SYS, default=None) + data = self._smi_try(fn, handle, clk_type.SYS, default=None) if not isinstance(data, dict): return None @@ -965,7 +970,7 @@ def _get_clock(self, h) -> StaticClockData | None: if not isinstance(freqs_raw, list) or not freqs_raw: return None - def _to_mhz(v: object) -> int | None: + def _to_mhz(v: object) -> Optional[int]: x = self._to_number(v) if x is None: return None @@ -985,15 +990,15 @@ def _to_mhz(v: object) -> int | None: if not freqs_mhz: return None - def _fmt(n: int | None) -> str | None: + def _fmt(n: Optional[int]) -> Optional[str]: return None if n is None else f"{n} MHz" level0: str = _fmt(freqs_mhz[0]) or "0 MHz" - level1: str | None = _fmt(freqs_mhz[1]) if len(freqs_mhz) > 1 else None - level2: str | None = _fmt(freqs_mhz[2]) if len(freqs_mhz) > 2 else None + level1: Optional[str] = _fmt(freqs_mhz[1]) if len(freqs_mhz) > 1 else None + level2: Optional[str] = _fmt(freqs_mhz[2]) if len(freqs_mhz) > 2 else None cur_raw = data.get("current") - current: int | None + current: Optional[int] if isinstance(cur_raw, (int, float)): current = int(cur_raw) elif isinstance(cur_raw, str) and cur_raw.strip() and cur_raw.upper() != "N/A": @@ -1015,15 +1020,15 @@ def _fmt(n: int | None) -> str | None: def collect_data( self, - args=None, - ) -> tuple[TaskResult, AmdSmiDataModel | None]: + args: Any = None, + ) -> tuple[TaskResult, Optional[AmdSmiDataModel]]: """Collect AmdSmi data from system Args: - args (_type_, optional): _description_. Defaults to None. + args (Any, optional): optional arguments for data collection. Defaults to None. Returns: - tuple[TaskResult, AmdSmiDataModel | None]: _description_ + tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and collected data model """ if not self._bind_amdsmi_or_log(): diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 36564666..821bb10b 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1,5 +1,5 @@ import re -from typing import Any, List, Mapping +from typing import Any, List, Mapping, Optional, Union from pydantic import ( BaseModel, @@ -15,21 +15,21 @@ _NUM_UNIT_RE = re.compile(r"^\s*([-+]?\d+(?:\.\d+)?)(?:\s*([A-Za-z%/][A-Za-z0-9%/._-]*))?\s*$") -def na_to_none(values: int | str): +def na_to_none(values: Union[int, str]): if values == "N/A": return None return values -def na_to_none_list(values: List[int | str | None]) -> List[int | str | None]: - ret_list: List[int | str | None] = values.copy() +def na_to_none_list(values: List[Union[int, str, None]]) -> List[Union[int, str, None]]: + ret_list: List[Union[int, str, None]] = values.copy() for i in range(len(ret_list)): if ret_list[i] == "N/A": ret_list[i] = None return ret_list -def na_to_none_dict(values: object) -> dict[str, Any] | None: +def na_to_none_dict(values: object) -> Optional[dict[str, Any]]: """Normalize mapping-like fields where 'N/A' or empty should become None. Accepts None; returns None for 'N/A'/'NA'/'' or non-mapping inputs.""" if values is None: @@ -63,7 +63,7 @@ class AmdSmiBaseModel(BaseModel): ) def __init__(self, **data): - # Convert int | str | float -> ValueUnit + # Convert Union[int, str, float] -> ValueUnit for field_name, field_type in self.model_fields.items(): annotation = field_type.annotation target_type, container = find_annotation_in_container(annotation, ValueUnit) @@ -90,7 +90,7 @@ class ValueUnit(BaseModel): - "N/A" / "NA" / "" / None -> None """ - value: int | float | str + value: Union[int, float, str] unit: str = "" @model_validator(mode="before") @@ -139,17 +139,17 @@ def _clean_unit(cls, u): # Process class ProcessMemoryUsage(BaseModel): - gtt_mem: ValueUnit | None - cpu_mem: ValueUnit | None - vram_mem: ValueUnit | None + gtt_mem: Optional[ValueUnit] + cpu_mem: Optional[ValueUnit] + vram_mem: Optional[ValueUnit] na_validator = field_validator("gtt_mem", "cpu_mem", "vram_mem", mode="before")(na_to_none) class ProcessUsage(BaseModel): # AMDSMI reports engine usage in nanoseconds - gfx: ValueUnit | None - enc: ValueUnit | None + gfx: Optional[ValueUnit] + enc: Optional[ValueUnit] na_validator = field_validator("gfx", "enc", mode="before")(na_to_none) @@ -157,15 +157,15 @@ class ProcessInfo(BaseModel): name: str pid: int - mem: ValueUnit | None = None + mem: Optional[ValueUnit] = None memory_usage: ProcessMemoryUsage usage: ProcessUsage - cu_occupancy: ValueUnit | None = None + cu_occupancy: Optional[ValueUnit] = None na_validator = field_validator("mem", "cu_occupancy", mode="before")(na_to_none) class ProcessListItem(BaseModel): - process_info: ProcessInfo | str + process_info: Union[ProcessInfo, str] class Processes(BaseModel): @@ -196,12 +196,12 @@ class AmdSmiListItem(BaseModel): class AmdSmiVersion(BaseModel): """Contains the versioning info for amd-smi""" - tool: str | None = None - version: str | None = None - amdsmi_library_version: str | None = None - rocm_version: str | None = None - amdgpu_version: str | None = None - amd_hsmp_driver_version: str | None = None + tool: Optional[str] = None + version: Optional[str] = None + amdsmi_library_version: Optional[str] = None + rocm_version: Optional[str] = None + amdgpu_version: Optional[str] = None + amd_hsmp_driver_version: Optional[str] = None @field_validator("*", mode="before") @classmethod @@ -219,24 +219,24 @@ class PartitionAccelerator(BaseModel): """Accelerator partition data""" gpu_id: int - memory: str | None = None - accelerator_type: str | None = None - accelerator_profile_index: str | int | None = None - partition_id: int | None = None + memory: Optional[str] = None + accelerator_type: Optional[str] = None + accelerator_profile_index: Optional[Union[str, int]] = None + partition_id: Optional[int] = None class PartitionMemory(BaseModel): """Memory Partition data""" gpu_id: int - partition_type: str | None = None + partition_type: Optional[str] = None class PartitionCompute(BaseModel): """Compute Partition data""" gpu_id: int - partition_type: str | None = None + partition_type: Optional[str] = None class Partition(BaseModel): @@ -263,8 +263,8 @@ class StaticAsic(BaseModel): class StaticBus(AmdSmiBaseModel): bdf: str - max_pcie_width: ValueUnit | None = None - max_pcie_speed: ValueUnit | None = None + max_pcie_width: Optional[ValueUnit] = None + max_pcie_speed: Optional[ValueUnit] = None pcie_interface_version: str = "unknown" slot_type: str = "unknown" @@ -277,15 +277,15 @@ class StaticVbios(BaseModel): class StaticLimit(AmdSmiBaseModel): - max_power: ValueUnit | None - min_power: ValueUnit | None - socket_power: ValueUnit | None - slowdown_edge_temperature: ValueUnit | None - slowdown_hotspot_temperature: ValueUnit | None - slowdown_vram_temperature: ValueUnit | None - shutdown_edge_temperature: ValueUnit | None - shutdown_hotspot_temperature: ValueUnit | None - shutdown_vram_temperature: ValueUnit | None + max_power: Optional[ValueUnit] + min_power: Optional[ValueUnit] + socket_power: Optional[ValueUnit] + slowdown_edge_temperature: Optional[ValueUnit] + slowdown_hotspot_temperature: Optional[ValueUnit] + slowdown_vram_temperature: Optional[ValueUnit] + shutdown_edge_temperature: Optional[ValueUnit] + shutdown_hotspot_temperature: Optional[ValueUnit] + shutdown_vram_temperature: Optional[ValueUnit] na_validator = field_validator( "max_power", "min_power", @@ -350,10 +350,10 @@ class StaticNuma(BaseModel): class StaticVram(AmdSmiBaseModel): type: str - vendor: str | None - size: ValueUnit | None - bit_width: ValueUnit | None - max_bandwidth: ValueUnit | None = None + vendor: Optional[str] + size: Optional[ValueUnit] + bit_width: Optional[ValueUnit] + max_bandwidth: Optional[ValueUnit] = None na_validator = field_validator("vendor", "size", "bit_width", "max_bandwidth", mode="before")( na_to_none ) @@ -362,7 +362,7 @@ class StaticVram(AmdSmiBaseModel): class StaticCacheInfoItem(AmdSmiBaseModel): cache: ValueUnit cache_properties: List[str] - cache_size: ValueUnit | None + cache_size: Optional[ValueUnit] cache_level: ValueUnit max_num_cu_shared: ValueUnit num_cache_instance: ValueUnit @@ -375,8 +375,8 @@ class StaticFrequencyLevels(BaseModel): ) Level_0: str = Field(..., alias="Level 0") - Level_1: str | None = Field(default=None, alias="Level 1") - Level_2: str | None = Field(default=None, alias="Level 2") + Level_1: Optional[str] = Field(default=None, alias="Level 1") + Level_2: Optional[str] = Field(default=None, alias="Level 2") class StaticClockData(BaseModel): @@ -385,7 +385,7 @@ class StaticClockData(BaseModel): ) frequency: StaticFrequencyLevels - current: int | None = Field(..., alias="current") + current: Optional[int] = Field(..., alias="current") na_validator = field_validator("current", mode="before")(na_to_none) @@ -395,18 +395,18 @@ class AmdSmiStatic(BaseModel): gpu: int asic: StaticAsic bus: StaticBus - vbios: StaticVbios | None - limit: StaticLimit | None - driver: StaticDriver | None + vbios: Optional[StaticVbios] + limit: Optional[StaticLimit] + driver: Optional[StaticDriver] board: StaticBoard - soc_pstate: StaticSocPstate | None - xgmi_plpd: StaticXgmiPlpd | None + soc_pstate: Optional[StaticSocPstate] + xgmi_plpd: Optional[StaticXgmiPlpd] process_isolation: str numa: StaticNuma vram: StaticVram cache_info: List[StaticCacheInfoItem] - partition: StaticPartition | None = None - clock: StaticClockData | None = None + partition: Optional[StaticPartition] = None + clock: Optional[StaticClockData] = None na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( na_to_none @@ -429,14 +429,14 @@ class AmdSmiDataModel(DataModel): populate_by_name=True, ) - version: AmdSmiVersion | None = None - gpu_list: list[AmdSmiListItem] | None = Field(default_factory=list) - partition: Partition | None = None - process: list[Processes] | None = Field(default_factory=list) - firmware: list[Fw] | None = Field(default_factory=list) - static: list[AmdSmiStatic] | None = Field(default_factory=list) + version: Optional[AmdSmiVersion] = None + gpu_list: Optional[list[AmdSmiListItem]] = Field(default_factory=list) + partition: Optional[Partition] = None + process: Optional[list[Processes]] = Field(default_factory=list) + firmware: Optional[list[Fw]] = Field(default_factory=list) + static: Optional[list[AmdSmiStatic]] = Field(default_factory=list) - def get_list(self, gpu: int) -> AmdSmiListItem | None: + def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: """Get the gpu list item for the given gpu id.""" if self.gpu_list is None: return None @@ -445,7 +445,7 @@ def get_list(self, gpu: int) -> AmdSmiListItem | None: return item return None - def get_process(self, gpu: int) -> Processes | None: + def get_process(self, gpu: int) -> Optional[Processes]: """Get the process data for the given gpu id.""" if self.process is None: return None @@ -454,7 +454,7 @@ def get_process(self, gpu: int) -> Processes | None: return item return None - def get_firmware(self, gpu: int) -> Fw | None: + def get_firmware(self, gpu: int) -> Optional[Fw]: """Get the firmware data for the given gpu id.""" if self.firmware is None: return None @@ -463,7 +463,7 @@ def get_firmware(self, gpu: int) -> Fw | None: return item return None - def get_static(self, gpu: int) -> AmdSmiStatic | None: + def get_static(self, gpu: int) -> Optional[AmdSmiStatic]: """Get the static data for the given gpu id.""" if self.static is None: return None diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 5b88b241..b8721014 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -37,8 +37,8 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): expected_memory_partition_mode: Optional[str] = None expected_compute_partition_mode: Optional[str] = None expected_pldm_version: Optional[str] = None - l0_to_recovery_count_error_threshold: Optional[int] = None - l0_to_recovery_count_warning_threshold: Optional[int] = None + l0_to_recovery_count_error_threshold: Optional[int] = 3 + l0_to_recovery_count_warning_threshold: Optional[int] = 1 vendorid_ep: Optional[str] = None vendorid_ep_vf: Optional[str] = None devid_ep: Optional[str] = None diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 65c8cee7..ceaccea3 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -27,7 +27,7 @@ import re import traceback from enum import Enum -from typing import Any, TypeVar, get_args, get_origin +from typing import Any, TypeVar, Union, get_args, get_origin T = TypeVar("T") @@ -173,14 +173,14 @@ def bytes_to_human_readable(input_bytes: int) -> str: def find_annotation_in_container( annotation, target_type -) -> tuple[Any, list[Any]] | tuple[None, list[Any]]: +) -> Union[tuple[Any, list[Any]], tuple[None, list[Any]]]: """Recursively search for a target type in an annotation and return the target type and the containers supported container types are generic types, Callable, Tuple, Union, Literal, Final, ClassVar and Annotated. If the target type is not found then None is returned. Examples: find_annotation_in_container(Union[int, str], int) -> int, [Union[int, str]] - find_annotation_in_container(int | dict[str, list[MyClass]], MyClass) -> MyClass, [list,dict,union] + find_annotation_in_container(Union[int, dict[str, list[MyClass]]], MyClass) -> MyClass, [list,dict,union] find_annotation_in_container(Union[int, str], MyClass) -> None, [] Parameters @@ -192,7 +192,7 @@ def find_annotation_in_container( Returns ------- - tuple[Any, list[Any]] | tuple[None, []] + Union[tuple[Any, list[Any]], tuple[None, []]] The target type and the containers if found, otherwise None and an empty list. """ containers: list[Any] = [] From 05b4772fe8a60d3f9abb334ec620d7433edd64ae Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 11 Nov 2025 14:01:16 -0600 Subject: [PATCH 30/38] moved check outside static if else --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index fa575745..c15c5c6e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -441,12 +441,7 @@ def analyze_data( self.check_expected_max_power(data.static, args.expected_max_power) if args.expected_driver_version: self.check_expected_driver_version(data.static, args.expected_driver_version) - if args.expected_memory_partition_mode or args.expected_compute_partition_mode: - self.check_expected_memory_partition_mode( - data.partition, - args.expected_memory_partition_mode, - args.expected_compute_partition_mode, - ) + self.static_consistancy_check(data.static) if ( self.system_info.sku @@ -464,6 +459,13 @@ def analyze_data( sku_name=args.sku_name, ) + if args.expected_memory_partition_mode or args.expected_compute_partition_mode: + self.check_expected_memory_partition_mode( + data.partition, + args.expected_memory_partition_mode, + args.expected_compute_partition_mode, + ) + if args.expected_pldm_version: self.check_pldm_version(data.firmware, args.expected_pldm_version) From 84f24f24fd6839bb505a5f8179698e06abb85de4 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 11 Nov 2025 14:58:46 -0600 Subject: [PATCH 31/38] fixed utest for py3.9 --- test/unit/plugin/test_amdsmi_collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index b046521f..2a34551b 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -1,7 +1,7 @@ import importlib import sys import types -from typing import Tuple +from typing import Optional, Tuple import pytest @@ -29,7 +29,7 @@ class AmdSmiTimeoutError(_BaseAmdSmiError): ... def make_fake_amdsmi( *, - handles: Tuple[object, ...] | None = None, + handles: Optional[Tuple[object, ...]] = None, lib_version="1.2.3", rocm_version="6.1.0", pcie_static=True, From 107fd46c76c4e4d48a5afedd9e3a984c7def20ea Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 13 Nov 2025 10:25:12 -0600 Subject: [PATCH 32/38] removed deprecated API call --- .../plugins/inband/amdsmi/amdsmi_collector.py | 14 ++------------ test/unit/plugin/test_amdsmi_collector.py | 2 -- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 6ad35398..4ddb736a 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -291,15 +291,8 @@ def _to_int(x: Any, default: int = 0) -> int: for idx, h in enumerate(devices): bdf = self._smi_try(amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" uuid = self._smi_try(amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" - kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} - kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} partition_id = 0 - if isinstance(kfd, dict): - try: - partition_id = int(kfd.get("current_partition_id", 0) or 0) - except Exception: - partition_id = 0 try: out.append( @@ -692,12 +685,9 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: numa_node = int(kfd.get("node_id", 0) or 0) except Exception: numa_node = 0 - try: - affinity = int(kfd.get("cpu_affinity", 0) or 0) - except Exception: - affinity = 0 else: - numa_node, affinity = 0, 0 + numa_node = 0 + affinity = 0 numa_model = StaticNuma(node=numa_node, affinity=affinity) # VRAM diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 2a34551b..2256c3e9 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -81,8 +81,6 @@ def amdsmi_get_processor_handles(): m.amdsmi_get_gpu_kfd_info = lambda h: { "kfd_id": 7, "node_id": 3, - "cpu_affinity": 0xFF, - "current_partition_id": 0, } m.amdsmi_get_gpu_board_info = lambda h: { "vbios_name": "vbiosA", From fccc0ebcf3e35561a511293ae7ac8627d8923e75 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 14 Nov 2025 11:58:57 -0600 Subject: [PATCH 33/38] adding version info during run --- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 4ddb736a..c32ca789 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -261,7 +261,7 @@ def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: category=EventCategory.APPLICATION, description="Failed to read AMD SMI versions", data={"exception": get_exception_traceback(e)}, - priority=EventPriority.WARNING, + priority=EventPriority.INFO, ) return None @@ -1028,6 +1028,10 @@ def collect_data( amdsmi = self._amdsmi_mod() try: amdsmi.amdsmi_init(amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS) # type: ignore[attr-defined] + version = self._get_amdsmi_version() + if version is not None: + self.logger.info("amdsmi version: %s", version.version) + self.logger.info("ROCm version: %s", version.rocm_version) amd_smi_data = self._get_amdsmi_data() if amd_smi_data is None: From 5cbe96faca68b7fef526b7fcc5dd00651a4c67fb Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 14 Nov 2025 20:20:02 -0600 Subject: [PATCH 34/38] removed python API calls and using cmd line tool to enable remote runs --- .../plugins/inband/amdsmi/amdsmi_collector.py | 644 ++++++++---------- test/unit/plugin/test_amdsmi_collector.py | 573 +++++++++------- 2 files changed, 610 insertions(+), 607 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index c32ca789..fc20df14 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -23,8 +23,8 @@ # SOFTWARE. # ############################################################################### -import importlib -from typing import Any, Callable, Optional, Union, cast +import json +from typing import Any, Optional, Union from pydantic import ValidationError @@ -67,20 +67,77 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): """Class for collection of inband tool amd-smi data.""" + AMD_SMI_EXE = "amd-smi" + SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX} DATA_MODEL = AmdSmiDataModel - _amdsmi: Optional[Any] = None # dynamic import + CMD_VERSION = "amd-smi version --json" + CMD_LIST = "amd-smi list --json" + CMD_PROCESS = "amd-smi process --json" + CMD_PARTITION = "amd-smi partition --json" + CMD_FIRMWARE = "amd-smi firmware --json" + CMD_STATIC = "amd-smi static -g all --json" + + def _check_amdsmi_installed(self) -> bool: + """Check if amd-smi is installed + + Returns: + bool: True if amd-smi is installed, False otherwise + """ + cmd_ret = self._run_sut_cmd("which amd-smi") + return bool(cmd_ret.exit_code == 0 and "no amd-smi in" not in cmd_ret.stdout) + + def _run_amd_smi(self, cmd: str) -> Optional[str]: + """Run amd-smi command + + Args: + cmd (str): command arguments to pass to amd-smi + + Returns: + Optional[str]: stdout from command or None on error + """ + cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}") + if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi command", + data={ + "command": cmd, + "exit_code": cmd_ret.exit_code, + "stderr": cmd_ret.stderr, + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + return cmd_ret.stdout + + def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: + """Run amd-smi command with json output - def _amdsmi_mod(self) -> Any: - """Check for amdsmi installation + Args: + cmd (str): command arguments to pass to amd-smi Returns: - Any: local instance of amdsmi module + Optional[Union[dict, list[dict]]]: parsed JSON output or None on error """ - assert self._amdsmi is not None, "amdsmi module not bound" - return self._amdsmi + cmd += " --json" + cmd_ret = self._run_amd_smi(cmd) + if cmd_ret: + try: + return json.loads(cmd_ret) + except json.JSONDecodeError as e: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Error parsing command: `{cmd}` json data", + data={"cmd": cmd, "exception": get_exception_traceback(e)}, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + return None def _to_number(self, v: object) -> Optional[Union[int, float]]: """Helper function to return number from str, float or "N/A" @@ -163,46 +220,6 @@ def _normalize(self, val: object, default: str = "unknown", slot_type: bool = Fa return s - def _bind_amdsmi_or_log(self) -> bool: - """Bind to local amdsmi lib or log that it is not found - - Returns: - bool: True if module is found, false otherwise - """ - if getattr(self, "_amdsmi", None) is not None: - return True - try: - self._amdsmi = importlib.import_module("amdsmi") - return True - except Exception as e: - self._log_event( - category=EventCategory.APPLICATION, - description="Failed to import amdsmi package, please ensure amdsmi is installed and Python bindings are available", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.ERROR, - console_log=True, - ) - return False - - def _get_handles(self) -> list[Any]: - """Get amdsmi handles - - Returns: - list[Any]: list of processor handles - """ - amdsmi = self._amdsmi_mod() - try: - return amdsmi.amdsmi_get_processor_handles() - except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] - self._log_event( - category=EventCategory.APPLICATION, - description="amdsmi_get_processor_handles failed", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.ERROR, - console_log=True, - ) - return [] - def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: """Fill in information for AmdSmi data model @@ -247,39 +264,46 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: return None def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: - """Check amdsmi library version + """Get amdsmi version and data Returns: - Optional[AmdSmiVersion]: version of the library + Optional[AmdSmiVersion]: version information or None on error """ - amdsmi = self._amdsmi_mod() + ret = self._run_amd_smi_dict("version") + if not ret or not isinstance(ret, list) or len(ret) == 0: + return None + + version_data = ret[0] if isinstance(ret, list) else ret + if not isinstance(version_data, dict): + return None + try: - lib_ver = amdsmi.amdsmi_get_lib_version() or "" - rocm_ver = amdsmi.amdsmi_get_rocm_version() or "" - except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] + return AmdSmiVersion( + tool="amdsmi", + version=version_data.get("amdsmi_library_version", ""), + amdsmi_library_version=version_data.get("amdsmi_library_version", ""), + rocm_version=version_data.get("rocm_version", ""), + ) + except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, - description="Failed to read AMD SMI versions", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.INFO, + description="Failed to build AmdSmiVersion", + data=get_exception_details(e), + priority=EventPriority.WARNING, ) return None - return AmdSmiVersion( - tool="amdsmi", - version=lib_ver, - amdsmi_library_version=lib_ver, - rocm_version=rocm_ver, - ) - def get_gpu_list(self) -> Optional[list[AmdSmiListItem]]: - """Get GPU information from amdsmi lib + """Get GPU information from amd-smi list command Returns: Optional[list[AmdSmiListItem]]: list of GPU info items """ - amdsmi = self._amdsmi_mod() - devices = self._get_handles() + ret = self._run_amd_smi_dict("list") + if not ret: + return [] + + gpu_data = ret if isinstance(ret, list) else [ret] out: list[AmdSmiListItem] = [] def _to_int(x: Any, default: int = 0) -> int: @@ -288,28 +312,26 @@ def _to_int(x: Any, default: int = 0) -> int: except Exception: return default - for idx, h in enumerate(devices): - bdf = self._smi_try(amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" - uuid = self._smi_try(amdsmi.amdsmi_get_gpu_device_uuid, h, default="") or "" - kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} - partition_id = 0 + for item in gpu_data: + if not isinstance(item, dict): + continue try: out.append( AmdSmiListItem( - gpu=idx, - bdf=bdf, - uuid=uuid, - kfd_id=_to_int(kfd.get("kfd_id", 0)) if isinstance(kfd, dict) else 0, - node_id=_to_int(kfd.get("node_id", 0)) if isinstance(kfd, dict) else 0, - partition_id=partition_id, + gpu=_to_int(item.get("gpu", 0)), + bdf=str(item.get("bdf", "")), + uuid=str(item.get("uuid", "")), + kfd_id=_to_int(item.get("kfd_id", 0)), + node_id=_to_int(item.get("node_id", 0)), + partition_id=_to_int(item.get("partition_id", 0)), ) ) except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiListItem", - data={"exception": get_exception_traceback(e), "gpu_index": idx}, + data={"exception": get_exception_traceback(e), "item": item}, priority=EventPriority.WARNING, ) @@ -321,84 +343,86 @@ def get_process(self) -> Optional[list[Processes]]: Returns: Optional[list[Processes]]: list of GPU processes """ - amdsmi = self._amdsmi_mod() - devices = self._get_handles() + ret = self._run_amd_smi_dict("process") + if not ret: + return [] + + process_data = ret if isinstance(ret, list) else [ret] out: list[Processes] = [] - for idx, h in enumerate(devices): - try: - raw_list = self._smi_try(amdsmi.amdsmi_get_gpu_process_list, h, default=[]) or [] - plist: list[ProcessListItem] = [] - - for entry in raw_list: - if not isinstance(entry, dict): - plist.append(ProcessListItem(process_info=str(entry))) - continue - - name = entry.get("name", "N/A") - pid_val = entry.get("pid", 0) - try: - pid = int(pid_val) if pid_val not in (None, "") else 0 - except Exception: - pid = 0 - - mem_vu = self._valueunit(entry.get("mem"), "B") - - mu = entry.get("memory_usage") or {} - mem_usage = ProcessMemoryUsage( - gtt_mem=self._valueunit(mu.get("gtt_mem"), "B"), - cpu_mem=self._valueunit(mu.get("cpu_mem"), "B"), - vram_mem=self._valueunit(mu.get("vram_mem"), "B"), - ) + for item in process_data: + if not isinstance(item, dict): + continue - eu = entry.get("engine_usage") or {} - usage = ProcessUsage( - gfx=self._valueunit(eu.get("gfx"), "ns"), - enc=self._valueunit(eu.get("enc"), "ns"), - ) + gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0 + process_list_raw = item.get("process_list", []) + if not isinstance(process_list_raw, list): + continue - cu_occ = self._valueunit(entry.get("cu_occupancy"), "") - - try: - plist.append( - ProcessListItem( - process_info=ProcessInfo( - name=str(name), - pid=pid, - mem=mem_vu, - memory_usage=mem_usage, - usage=usage, - cu_occupancy=cu_occ, - ) - ) - ) - except ValidationError as e: - self._log_event( - category=EventCategory.APPLICATION, - description="Failed to build ProcessListItem; skipping entry", - data={ - "exception": get_exception_traceback(e), - "gpu_index": idx, - "entry": repr(entry), - }, - priority=EventPriority.WARNING, - ) - continue + plist: list[ProcessListItem] = [] + + for entry in process_list_raw: + if not isinstance(entry, dict): + plist.append(ProcessListItem(process_info=str(entry))) + continue + + name = entry.get("name", "N/A") + pid_val = entry.get("pid", 0) + try: + pid = int(pid_val) if pid_val not in (None, "") else 0 + except Exception: + pid = 0 + + mem_vu = self._valueunit(entry.get("mem"), "B") + + mu = entry.get("memory_usage") or {} + mem_usage = ProcessMemoryUsage( + gtt_mem=self._valueunit(mu.get("gtt_mem"), "B"), + cpu_mem=self._valueunit(mu.get("cpu_mem"), "B"), + vram_mem=self._valueunit(mu.get("vram_mem"), "B"), + ) + + eu = entry.get("engine_usage") or {} + usage = ProcessUsage( + gfx=self._valueunit(eu.get("gfx"), "ns"), + enc=self._valueunit(eu.get("enc"), "ns"), + ) + + cu_occ = self._valueunit(entry.get("cu_occupancy"), "") try: - out.append(Processes(gpu=idx, process_list=plist)) + plist.append( + ProcessListItem( + process_info=ProcessInfo( + name=str(name), + pid=pid, + mem=mem_vu, + memory_usage=mem_usage, + usage=usage, + cu_occupancy=cu_occ, + ) + ) + ) except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, - description="Failed to build Processes", - data={"exception": get_exception_traceback(e), "gpu_index": idx}, + description="Failed to build ProcessListItem; skipping entry", + data={ + "exception": get_exception_traceback(e), + "gpu_index": gpu_idx, + "entry": repr(entry), + }, priority=EventPriority.WARNING, ) - except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] + continue + + try: + out.append(Processes(gpu=gpu_idx, process_list=plist)) + except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, - description="Process collection failed", - data={"exception": get_exception_traceback(e), "gpu_index": idx}, + description="Failed to build Processes", + data={"exception": get_exception_traceback(e), "gpu_index": gpu_idx}, priority=EventPriority.WARNING, ) @@ -410,18 +434,25 @@ def get_partition(self) -> Optional[Partition]: Returns: Optional[Partition]: Partition data if available """ - amdsmi = self._amdsmi_mod() - devices = self._get_handles() + ret = self._run_amd_smi_dict("partition") + if not ret: + return None + + partition_data = ret if isinstance(ret, list) else [ret] memparts: list[PartitionMemory] = [] computeparts: list[PartitionCompute] = [] - for idx, h in enumerate(devices): - mem_pt = self._smi_try(amdsmi.amdsmi_get_gpu_memory_partition, h, default=None) - comp_pt = self._smi_try(amdsmi.amdsmi_get_gpu_compute_partition, h, default=None) + for item in partition_data: + if not isinstance(item, dict): + continue + + gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0 + mem_pt = item.get("memory_partition") + comp_pt = item.get("compute_partition") try: memparts.append( - PartitionMemory(gpu_id=idx, partition_type=cast(Optional[str], mem_pt)) + PartitionMemory(gpu_id=gpu_idx, partition_type=str(mem_pt) if mem_pt else None) ) except ValidationError as e: self._log_event( @@ -429,7 +460,7 @@ def get_partition(self) -> Optional[Partition]: description="Failed to build PartitionMemory", data={ "exception": get_exception_traceback(e), - "gpu_index": idx, + "gpu_index": gpu_idx, "data": mem_pt, }, priority=EventPriority.WARNING, @@ -437,7 +468,9 @@ def get_partition(self) -> Optional[Partition]: try: computeparts.append( - PartitionCompute(gpu_id=idx, partition_type=cast(Optional[str], comp_pt)) + PartitionCompute( + gpu_id=gpu_idx, partition_type=str(comp_pt) if comp_pt else None + ) ) except ValidationError as e: self._log_event( @@ -445,7 +478,7 @@ def get_partition(self) -> Optional[Partition]: description="Failed to build PartitionCompute", data={ "exception": get_exception_traceback(e), - "gpu_index": idx, + "gpu_index": gpu_idx, "data": comp_pt, }, priority=EventPriority.WARNING, @@ -468,23 +501,25 @@ def get_firmware(self) -> Optional[list[Fw]]: Returns: Optional[list[Fw]]: List of firmware info per GPU """ - amdsmi = self._amdsmi_mod() - devices = self._get_handles() + ret = self._run_amd_smi_dict("firmware") + if not ret: + return [] + + firmware_data = ret if isinstance(ret, list) else [ret] out: list[Fw] = [] - for idx, h in enumerate(devices): - raw = self._smi_try(amdsmi.amdsmi_get_fw_info, h, default=None) - if ( - not isinstance(raw, dict) - or "fw_list" not in raw - or not isinstance(raw["fw_list"], list) - ): + for item in firmware_data: + if not isinstance(item, dict): continue - items = raw["fw_list"] + gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0 + fw_list_raw = item.get("fw_list", []) + + if not isinstance(fw_list_raw, list): + continue normalized: list[FwListItem] = [] - for e in items: + for e in fw_list_raw: if isinstance(e, dict): fid = e.get("fw_name") ver = e.get("fw_version") @@ -503,128 +538,59 @@ def get_firmware(self) -> Optional[list[Fw]]: ) try: - out.append(Fw(gpu=idx, fw_list=normalized)) + out.append(Fw(gpu=gpu_idx, fw_list=normalized)) except ValidationError as e: self._log_event( category=EventCategory.APPLICATION, description="Failed to build Fw", - data={"exception": get_exception_traceback(e), "gpu_index": idx}, + data={"exception": get_exception_traceback(e), "gpu_index": gpu_idx}, priority=EventPriority.WARNING, ) return out - def _smi_try(self, fn: Callable[..., Any], *a: Any, default: Any = None, **kw: Any) -> Any: - """Helper function to check if amdsmi lib call is available - - Args: - fn (Callable[..., Any]): amdsmi lib function to call - *a (Any): variable positional arguments to pass to the function - default (Any, optional): default return value. Defaults to None. - **kw (Any): variable keyword arguments to pass to the function - - Returns: - Any: result of function call or default value on error - """ - amdsmi = self._amdsmi_mod() - try: - return fn(*a, **kw) - except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] - self.logger.warning(e) - code = getattr(e, "ret_code", None) - if code is None: - try: - code = int(e.args[0]) if getattr(e, "args", None) else None - except Exception: - code = None - CODE2NAME = { - 1: "AMDSMI_STATUS_SUCCESS", - 2: "AMDSMI_STATUS_NOT_SUPPORTED", - 3: "AMDSMI_STATUS_PERMISSION", - 4: "AMDSMI_STATUS_OUT_OF_RESOURCES", - 5: "AMDSMI_STATUS_INIT_ERROR", - 6: "AMDSMI_STATUS_INPUT_OUT_OF_BOUNDS", - 7: "AMDSMI_STATUS_NOT_FOUND", - } - name = CODE2NAME.get(code, "unknown") if isinstance(code, int) else "unknown" - - if name in ("AMDSMI_STATUS_NOT_SUPPORTED", "AMDSMI_STATUS_NOT_FOUND"): - self._log_event( - category=EventCategory.APPLICATION, - description=f"{fn.__name__} not supported on this device/mode (status={name}, code={code})", - priority=EventPriority.WARNING, - ) - return default - if name == "AMDSMI_STATUS_PERMISSION": - self._log_event( - category=EventCategory.APPLICATION, - description=f"{fn.__name__} permission denied (need access to /dev/kfd & render nodes, or root for RAS). status={name}, code={code})", - priority=EventPriority.WARNING, - ) - return default - - self._log_event( - category=EventCategory.APPLICATION, - description=f"{fn.__name__} failed (status={name}, code={code})", - data={"exception": get_exception_traceback(e)}, - priority=EventPriority.WARNING, - ) - return default - def get_static(self) -> Optional[list[AmdSmiStatic]]: - """Get Static info from amdsmi lib + """Get Static info from amd-smi static command Returns: Optional[list[AmdSmiStatic]]: list of AmdSmiStatic instances or empty list """ - amdsmi = self._amdsmi_mod() - devices = self._get_handles() - if not devices: + ret = self._run_amd_smi_dict("static -g all") + if not ret: return [] - pcie_fn = getattr(amdsmi, "amdsmi_get_pcie_info", None) + if isinstance(ret, dict) and "gpu_data" in ret: + ret = ret["gpu_data"] + static_data = ret if isinstance(ret, list) else [ret] out: list[AmdSmiStatic] = [] - for idx, h in enumerate(devices): - board = self._smi_try(amdsmi.amdsmi_get_gpu_board_info, h, default={}) or {} - asic = self._smi_try(amdsmi.amdsmi_get_gpu_asic_info, h, default={}) or {} - bdf = self._smi_try(amdsmi.amdsmi_get_gpu_device_bdf, h, default="") or "" - kfd = self._smi_try(amdsmi.amdsmi_get_gpu_kfd_info, h, default={}) or {} + for item in static_data: + if not isinstance(item, dict) or "gpu" not in item: + continue - # Bus / PCIe - bus = StaticBus( - bdf=bdf, - max_pcie_width=None, - max_pcie_speed=None, - pcie_interface_version="unknown", - slot_type="Unknown", - ) + gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0 - if callable(pcie_fn): - p = self._smi_try(pcie_fn, h, default={}) or {} - d = p.get("pcie_static", p) if isinstance(p, dict) else {} - - if isinstance(d, dict): - max_w = d.get("max_pcie_width") - max_s = d.get("max_pcie_speed") - pcie_ver = d.get("pcie_interface_version") - - # MT/s -> GT/s - ms_val = self._to_number(max_s) - gtps = ( - (cast(float, ms_val) / 1000.0) - if (isinstance(ms_val, (int, float)) and ms_val >= 1000) - else ms_val - ) + asic = item.get("asic", {}) or {} + board = item.get("board", {}) or {} + bus = item.get("bus", {}) or {} + vbios = item.get("vbios", {}) or {} + driver = item.get("driver", {}) or {} + numa = item.get("numa", {}) or {} + vram = item.get("vram", {}) or {} + cache = item.get("cache", {}) or {} + clock = item.get("clock", {}) or {} + soc_pstate = item.get("soc_pstate", {}) or {} + xgmi_plpd = item.get("xgmi_plpd", {}) or {} - bus = StaticBus( - bdf=bdf, - max_pcie_width=self._valueunit(max_w, "x"), - max_pcie_speed=self._valueunit(gtps, "GT/s"), - pcie_interface_version=self._normalize(pcie_ver), - slot_type=self._normalize(d.get("slot_type"), slot_type=True), - ) + # Bus / PCIe + bus_model = StaticBus( + bdf=str(bus.get("bdf", "")), + max_pcie_width=self._valueunit(bus.get("max_pcie_width"), "x"), + max_pcie_speed=self._valueunit(bus.get("max_pcie_speed"), "GT/s"), + pcie_interface_version=self._normalize(bus.get("pcie_interface_version")), + slot_type=self._normalize(bus.get("slot_type"), slot_type=True), + ) # ASIC asic_model = StaticAsic( @@ -656,53 +622,35 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: # Driver driver_model = None - drv_fn = getattr(amdsmi, "amdsmi_get_gpu_driver_info", None) - if callable(drv_fn): - drv = self._smi_try(drv_fn, h, default={}) or {} + if driver: driver_model = StaticDriver( - name=self._normalize(drv.get("driver_name"), default="unknown"), - version=self._normalize(drv.get("driver_version"), default="unknown"), + name=self._normalize(driver.get("driver_name"), default="unknown"), + version=self._normalize(driver.get("driver_version"), default="unknown"), ) # VBIOS - vb = { - k: board[k] - for k in ("vbios_name", "vbios_build_date", "vbios_part_number", "vbios_version") - if k in board - } vbios_model: Optional[StaticVbios] = None - if vb: + if vbios: vbios_model = StaticVbios( - name=str(vb.get("vbios_name", "")), - build_date=str(vb.get("vbios_build_date", "")), - part_number=str(vb.get("vbios_part_number", "")), - version=str(vb.get("vbios_version", "")), + name=str(vbios.get("vbios_name", "")), + build_date=str(vbios.get("vbios_build_date", "")), + part_number=str(vbios.get("vbios_part_number", "")), + version=str(vbios.get("vbios_version", "")), ) - # NUMA (via KFD) - if isinstance(kfd, dict): - try: - numa_node = int(kfd.get("node_id", 0) or 0) - except Exception: - numa_node = 0 - else: - numa_node = 0 - affinity = 0 + # NUMA + numa_node = int(numa.get("node", 0) or 0) + affinity = int(numa.get("affinity", 0) or 0) numa_model = StaticNuma(node=numa_node, affinity=affinity) # VRAM - vram_type = str(asic.get("vram_type", "") or "unknown") - vram_vendor = asic.get("vram_vendor") - vram_bits = asic.get("vram_bit_width") + vram_type = str(vram.get("vram_type", "") or "unknown") + vram_vendor = vram.get("vram_vendor") + vram_bits = vram.get("vram_bit_width") vram_size_b: Optional[int] = None - if asic.get("vram_size_bytes") is not None: - try: - vram_size_b = int(asic["vram_size_bytes"]) - except Exception: - vram_size_b = None - elif asic.get("vram_size_mb") is not None: + if vram.get("vram_size_mb") is not None: try: - vram_size_b = int(asic["vram_size_mb"]) * 1024 * 1024 + vram_size_b = int(vram["vram_size_mb"]) * 1024 * 1024 except Exception: vram_size_b = None @@ -714,17 +662,24 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: max_bandwidth=None, ) - soc_pstate_model = self._get_soc_pstate(h) - xgmi_plpd_model = self._get_xgmi_plpd(h) - cache_info_model = self._get_cache_info(h) - clock_model = self._get_clock(h) + # SOC P-state + soc_pstate_model = self._parse_soc_pstate(soc_pstate) + + # XGMI PLPD + xgmi_plpd_model = self._parse_xgmi_plpd(xgmi_plpd) + + # Cache info + cache_info_model = self._parse_cache_info(cache) + + # Clock + clock_model = self._parse_clock(clock) try: out.append( AmdSmiStatic( - gpu=idx, + gpu=gpu_idx, asic=asic_model, - bus=bus, + bus=bus_model, vbios=vbios_model, limit=None, driver=driver_model, @@ -744,32 +699,21 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiStatic", - data={"exception": get_exception_traceback(e), "gpu_index": idx}, + data={"exception": get_exception_traceback(e), "gpu_index": gpu_idx}, priority=EventPriority.WARNING, ) return out - def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: - """SOC pstate check + def _parse_soc_pstate(self, data: dict) -> Optional[StaticSocPstate]: + """Parse SOC P-state data Args: - handle (Any): GPU device handle + data (dict): SOC P-state data from amd-smi Returns: Optional[StaticSocPstate]: StaticSocPstate instance or None """ - amdsmi = self._amdsmi_mod() - fn = getattr(amdsmi, "amdsmi_get_soc_pstate", None) - if not callable(fn): - self._log_event( - category=EventCategory.APPLICATION, - description="amdsmi_get_soc_pstate not exposed by amdsmi build", - priority=EventPriority.INFO, - ) - return None - - data = self._smi_try(fn, handle, default=None) if not isinstance(data, dict): return None @@ -812,26 +756,15 @@ def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: except ValidationError: return None - def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: - """Check XGMI plpd + def _parse_xgmi_plpd(self, data: dict) -> Optional[StaticXgmiPlpd]: + """Parse XGMI PLPD data Args: - handle (Any): GPU device handle + data (dict): XGMI PLPD data from amd-smi Returns: Optional[StaticXgmiPlpd]: StaticXgmiPlpd instance or None """ - amdsmi = self._amdsmi_mod() - fn = getattr(amdsmi, "amdsmi_get_xgmi_plpd", None) - if not callable(fn): - self._log_event( - category=EventCategory.APPLICATION, - description="XGMI PLPD not exposed by this amdsmi build", - priority=EventPriority.INFO, - ) - return None - - data = self._smi_try(fn, handle, default=None) if not isinstance(data, dict): return None @@ -874,21 +807,19 @@ def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: except ValidationError: return None - def _get_cache_info(self, handle: Any) -> list[StaticCacheInfoItem]: - """Check cache info + def _parse_cache_info(self, data: dict) -> list[StaticCacheInfoItem]: + """Parse cache info data Args: - handle (Any): GPU device handle + data (dict): Cache data from amd-smi Returns: list[StaticCacheInfoItem]: list of StaticCacheInfoItem instances """ - amdsmi = self._amdsmi_mod() - raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, handle, default=None) - if not isinstance(raw, dict) or not isinstance(raw.get("cache"), list): + if not isinstance(data, dict) or not isinstance(data.get("cache"), list): return [] - items = raw["cache"] + items = data["cache"] def _as_list_str(v: Any) -> list[str]: if isinstance(v, list): @@ -929,7 +860,7 @@ def _as_list_str(v: Any) -> list[str]: except ValidationError as ve: self._log_event( category=EventCategory.APPLICATION, - description="Bad cache info entry from AMDSMI; skipping", + description="Bad cache info entry from amd-smi; skipping", data={"entry": repr(e), "exception": get_exception_traceback(ve)}, priority=EventPriority.WARNING, ) @@ -937,22 +868,15 @@ def _as_list_str(v: Any) -> list[str]: return out - def _get_clock(self, handle: Any) -> Optional[StaticClockData]: - """Get clock info + def _parse_clock(self, data: dict) -> Optional[StaticClockData]: + """Parse clock data Args: - handle (Any): GPU device handle + data (dict): Clock data from amd-smi Returns: Optional[StaticClockData]: StaticClockData instance or None """ - amdsmi = self._amdsmi_mod() - fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) - clk_type = getattr(amdsmi, "AmdSmiClkType", None) - if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): - return None - - data = self._smi_try(fn, handle, clk_type.SYS, default=None) if not isinstance(data, dict): return None @@ -1021,17 +945,22 @@ def collect_data( tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and collected data model """ - if not self._bind_amdsmi_or_log(): + if not self._check_amdsmi_installed(): + self._log_event( + category=EventCategory.APPLICATION, + description="amd-smi is not installed", + priority=EventPriority.WARNING, + console_log=True, + ) self.result.status = ExecutionStatus.NOT_RAN return self.result, None - amdsmi = self._amdsmi_mod() try: - amdsmi.amdsmi_init(amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS) # type: ignore[attr-defined] version = self._get_amdsmi_version() if version is not None: - self.logger.info("amdsmi version: %s", version.version) + self.logger.info("amd-smi version: %s", version.version) self.logger.info("ROCm version: %s", version.rocm_version) + amd_smi_data = self._get_amdsmi_data() if amd_smi_data is None: @@ -1048,8 +977,3 @@ def collect_data( ) self.result.status = ExecutionStatus.EXECUTION_FAILURE return self.result, None - finally: - try: - amdsmi.amdsmi_shut_down() - except Exception: - pass diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 2256c3e9..e7b13cd4 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -1,7 +1,6 @@ -import importlib -import sys -import types -from typing import Optional, Tuple +import json +from typing import Any +from unittest.mock import MagicMock import pytest @@ -9,280 +8,338 @@ from nodescraper.plugins.inband.amdsmi.amdsmi_collector import AmdSmiCollector -class _BaseAmdSmiError(Exception): - def __init__(self, ret_code: int, *args): - super().__init__(ret_code, *args) - self.ret_code = ret_code - - -class AmdSmiLibraryError(_BaseAmdSmiError): ... - - -class AmdSmiRetryError(_BaseAmdSmiError): ... - - -class AmdSmiParameterError(_BaseAmdSmiError): ... - - -class AmdSmiTimeoutError(_BaseAmdSmiError): ... - - -def make_fake_amdsmi( - *, - handles: Optional[Tuple[object, ...]] = None, - lib_version="1.2.3", - rocm_version="6.1.0", - pcie_static=True, - raise_on_handles=False, -): - if handles is None: - handles = (object(),) - - m = types.SimpleNamespace() - m.AmdSmiException = _BaseAmdSmiError - m.AmdSmiLibraryException = AmdSmiLibraryError - m.AmdSmiRetryException = AmdSmiRetryError - m.AmdSmiParameterException = AmdSmiParameterError - m.AmdSmiTimeoutException = AmdSmiTimeoutError - - class AmdSmiInitFlags: - INIT_AMD_GPUS = 1 - - m.AmdSmiInitFlags = AmdSmiInitFlags - - class AmdSmiMemoryType: - VRAM = 0 - VIS_VRAM = 1 - GTT = 2 - - m.AmdSmiMemoryType = AmdSmiMemoryType - - def amdsmi_init(_flags): - return None - - def amdsmi_shut_down(): - return None - - m.amdsmi_init = amdsmi_init - m.amdsmi_shut_down = amdsmi_shut_down - - m.amdsmi_get_lib_version = lambda: lib_version - m.amdsmi_get_rocm_version = lambda: rocm_version - - def amdsmi_get_processor_handles(): - if raise_on_handles: - raise AmdSmiLibraryError(5) - return list(handles) - - m.amdsmi_get_processor_handles = amdsmi_get_processor_handles - - m.amdsmi_get_gpu_device_bdf = lambda h: "0000:0b:00.0" - m.amdsmi_get_gpu_device_uuid = lambda h: "GPU-UUID-123" - m.amdsmi_get_gpu_kfd_info = lambda h: { - "kfd_id": 7, - "node_id": 3, - } - m.amdsmi_get_gpu_board_info = lambda h: { - "vbios_name": "vbiosA", - "vbios_build_date": "2024-01-01", - "vbios_part_number": "PN123", - "vbios_version": "V1", - "model_number": "Board-42", - "product_serial": "SN0001", - "fru_id": "FRU-1", - "product_name": "ExampleBoard", - "manufacturer_name": "ACME", - } - m.amdsmi_get_gpu_asic_info = lambda h: { - "market_name": "SomeGPU", - "vendor_id": "1002", - "vendor_name": "AMD", - "subvendor_id": "1ABC", - "device_id": "0x1234", - "subsystem_id": "0x5678", - "rev_id": "A1", - "asic_serial": "ASERIAL", - "oam_id": 0, - "num_compute_units": 224, - "target_graphics_version": "GFX940", - "vram_type": "HBM3", - "vram_vendor": "Micron", - "vram_bit_width": 4096, - "vram_size_bytes": 64 * 1024 * 1024 * 1024, - } - m.amdsmi_get_gpu_driver_info = lambda h: { - "driver_name": "amdgpu", - "driver_version": "6.1.0", - } - - if pcie_static: - - def amdsmi_get_pcie_info(h): - return { - "pcie_static": { - "max_pcie_width": 16, - "max_pcie_speed": 16000, - "pcie_interface_version": "PCIe 5.0", - "slot_type": "PCIe", - } - } - - m.amdsmi_get_pcie_info = amdsmi_get_pcie_info - - m.amdsmi_get_gpu_cache_info = lambda h: { - "cache": [ - { - "cache_level": 1, - "max_num_cu_shared": 8, - "num_cache_instance": 32, - "cache_size": 256 * 1024, - "cache_properties": "PropertyA, PropertyB; PropertyC", - } - ] - } - - def amdsmi_get_clk_freq(h, clk_type): - return { - "frequency": [500_000_000, 1_500_000_000, 2_000_000_000], - "current": 1, - } - - m.amdsmi_get_clk_freq = amdsmi_get_clk_freq - - m.amdsmi_get_fw_info = lambda h: { - "fw_list": [ - {"fw_name": "SMU", "fw_version": "55.33"}, - {"fw_name": "VBIOS", "fw_version": "V1"}, - ] - } - - m.amdsmi_get_gpu_process_list = lambda h: [ - { - "name": "python", - "pid": 4242, - "mem": 1024, - "engine_usage": {"gfx": 1_000_000, "enc": 0}, - "memory_usage": {"gtt_mem": 0, "cpu_mem": 4096, "vram_mem": 2048}, - "cu_occupancy": 12, - }, - { - "name": "N/A", - "pid": "9999", - "mem": "0", - "engine_usage": {"gfx": "0", "enc": "0"}, - "memory_usage": {"gtt_mem": "0", "cpu_mem": "0", "vram_mem": "0"}, - "cu_occupancy": "0", - }, - ] - - m.amdsmi_get_gpu_memory_partition = lambda h: {"partition_type": "NPS1"} - m.amdsmi_get_gpu_compute_partition = lambda h: {"partition_type": "CPX_DISABLED"} - - return m +def make_cmd_result(stdout: str, stderr: str = "", exit_code: int = 0) -> MagicMock: + """Create a mock command result""" + result = MagicMock() + result.stdout = stdout + result.stderr = stderr + result.exit_code = exit_code + return result + + +def make_json_response(data: Any) -> str: + """Convert data to JSON string""" + return json.dumps(data) @pytest.fixture -def install_fake_amdsmi(monkeypatch): - fake = make_fake_amdsmi() - mod = types.ModuleType("amdsmi") - for k, v in fake.__dict__.items(): - setattr(mod, k, v) - monkeypatch.setitem(sys.modules, "amdsmi", mod) - return mod +def mock_commands(monkeypatch): + """Mock all amd-smi commands with sample data""" + + def mock_run_sut_cmd(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + + if "version --json" in cmd: + return make_cmd_result( + make_json_response( + [{"tool": "amdsmi", "amdsmi_library_version": "1.2.3", "rocm_version": "6.1.0"}] + ) + ) + + if "list --json" in cmd: + return make_cmd_result( + make_json_response( + [ + { + "gpu": 0, + "bdf": "0000:0b:00.0", + "uuid": "GPU-UUID-123", + "kfd_id": 7, + "node_id": 3, + "partition_id": 0, + } + ] + ) + ) + + if "process --json" in cmd: + return make_cmd_result( + make_json_response( + [ + { + "gpu": 0, + "process_list": [ + { + "name": "python", + "pid": 4242, + "mem": 1024, + "engine_usage": {"gfx": 1000000, "enc": 0}, + "memory_usage": { + "gtt_mem": 0, + "cpu_mem": 4096, + "vram_mem": 2048, + }, + "cu_occupancy": 12, + }, + { + "name": "test", + "pid": 9999, + "mem": 0, + "engine_usage": {"gfx": 0, "enc": 0}, + "memory_usage": {"gtt_mem": 0, "cpu_mem": 0, "vram_mem": 0}, + "cu_occupancy": 0, + }, + ], + } + ] + ) + ) + + if "partition --json" in cmd: + return make_cmd_result( + make_json_response( + [{"gpu": 0, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] + ) + ) + + if "firmware --json" in cmd: + return make_cmd_result( + make_json_response( + [ + { + "gpu": 0, + "fw_list": [ + {"fw_name": "SMU", "fw_version": "55.33"}, + {"fw_name": "VBIOS", "fw_version": "V1"}, + ], + } + ] + ) + ) + + if "static -g all --json" in cmd: + return make_cmd_result( + make_json_response( + { + "gpu_data": [ + { + "gpu": 0, + "asic": { + "market_name": "SomeGPU", + "vendor_id": "1002", + "vendor_name": "AMD", + "subvendor_id": "1ABC", + "device_id": "0x1234", + "subsystem_id": "0x5678", + "rev_id": "A1", + "asic_serial": "ASERIAL", + "oam_id": 0, + "num_compute_units": 224, + "target_graphics_version": "GFX940", + "vram_type": "HBM3", + "vram_vendor": "Micron", + "vram_bit_width": 4096, + }, + "board": { + "model_number": "Board-42", + "product_serial": "SN0001", + "fru_id": "FRU-1", + "product_name": "ExampleBoard", + "manufacturer_name": "ACME", + }, + "bus": { + "bdf": "0000:0b:00.0", + "max_pcie_width": 16, + "max_pcie_speed": 16.0, + "pcie_interface_version": "PCIe 5.0", + "slot_type": "PCIe", + }, + "vbios": { + "vbios_name": "vbiosA", + "vbios_build_date": "2024-01-01", + "vbios_part_number": "PN123", + "vbios_version": "V1", + }, + "driver": {"driver_name": "amdgpu", "driver_version": "6.1.0"}, + "numa": {"node": 3, "affinity": 0}, + "vram": { + "vram_type": "HBM3", + "vram_vendor": "Micron", + "vram_bit_width": 4096, + "vram_size_mb": 65536, + }, + "cache": { + "cache": [ + { + "cache_level": 1, + "max_num_cu_shared": 8, + "num_cache_instance": 32, + "cache_size": 262144, + "cache_properties": "PropertyA, PropertyB; PropertyC", + } + ] + }, + "clock": {"frequency": [500, 1500, 2000], "current": 1}, + "soc_pstate": {}, + "xgmi_plpd": {}, + } + ] + } + ) + ) + + return make_cmd_result("", f"Unknown command: {cmd}", 1) + + return mock_run_sut_cmd @pytest.fixture -def collector(install_fake_amdsmi, conn_mock, system_info): +def collector(mock_commands, conn_mock, system_info, monkeypatch): + """Create a collector with mocked commands""" c = AmdSmiCollector( system_info=system_info, system_interaction_level=SystemInteractionLevel.PASSIVE, connection=conn_mock, ) - assert c._bind_amdsmi_or_log() is True + monkeypatch.setattr(c, "_run_sut_cmd", mock_commands) return c +def test_check_amdsmi_installed(collector): + """Test that _check_amdsmi_installed works""" + assert collector._check_amdsmi_installed() is True + + +def test_check_amdsmi_not_installed(conn_mock, system_info, monkeypatch): + """Test when amd-smi is not installed""" + + def mock_which_fail(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("", "no amd-smi in /usr/bin", 1) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_which_fail) + + result, data = c.collect_data() + assert data is None + assert result.status.name == "NOT_RAN" + + def test_collect_data(collector): + """Test full data collection""" result, data = collector.collect_data() assert data is not None assert data.version is not None assert data.version.tool == "amdsmi" + assert data.version.version == "1.2.3" + assert data.version.rocm_version == "6.1.0" + # gpu_list assert data.gpu_list is not None and len(data.gpu_list) == 1 assert data.gpu_list[0].bdf == "0000:0b:00.0" assert data.gpu_list[0].uuid == "GPU-UUID-123" + assert data.gpu_list[0].kfd_id == 7 + assert data.gpu_list[0].node_id == 3 + # processes assert data.process is not None and len(data.process) == 1 assert len(data.process[0].process_list) == 2 + + # partition + assert data.partition is not None + assert len(data.partition.memory_partition) == 1 + assert data.partition.memory_partition[0].partition_type == "NPS1" + + # firmware + assert data.firmware is not None and len(data.firmware) == 1 + assert len(data.firmware[0].fw_list) == 2 + # static assert data.static is not None and len(data.static) == 1 s = data.static[0] assert s.bus is not None and s.bus.max_pcie_speed is not None assert float(s.bus.max_pcie_speed.value) == pytest.approx(16.0) + assert s.bus.pcie_interface_version == "PCIe 5.0" -def test_bind_failure(monkeypatch, conn_mock, system_info): - monkeypatch.setattr( - importlib, "import_module", lambda name: (_ for _ in ()).throw(ImportError("nope")) - ) - sys.modules.pop("amdsmi", None) +def test_get_gpu_list(collector): + """Test GPU list parsing""" + gpu_list = collector.get_gpu_list() + assert gpu_list is not None and len(gpu_list) == 1 + assert gpu_list[0].gpu == 0 + assert gpu_list[0].bdf == "0000:0b:00.0" + assert gpu_list[0].uuid == "GPU-UUID-123" - c = AmdSmiCollector( - system_info=system_info, - system_interaction_level=SystemInteractionLevel.PASSIVE, - connection=conn_mock, - ) - result, data = c.collect_data() - assert data is None - assert result.status.name == "NOT_RAN" - - -def test_handles_exception(monkeypatch, collector): - fake = make_fake_amdsmi(raise_on_handles=True) - mod = types.ModuleType("amdsmi") - for k, v in fake.__dict__.items(): - setattr(mod, k, v) - monkeypatch.setitem(sys.modules, "amdsmi", mod) - collector._amdsmi = mod - - gl = collector.get_gpu_list() - assert gl == [] or gl is None - - gp = collector.get_process() - assert gp == [] or gp is None - part = collector.get_partition() - assert part is not None +def test_get_process(collector): + """Test process list parsing""" + procs = collector.get_process() + assert procs is not None and len(procs) == 1 + assert procs[0].gpu == 0 + assert len(procs[0].process_list) == 2 - fw = collector.get_firmware() - assert fw == [] or fw is None + p0 = procs[0].process_list[0].process_info + assert p0.name == "python" + assert p0.pid == 4242 + assert p0.mem is not None and p0.mem.unit == "B" + assert p0.usage.gfx is not None and p0.usage.gfx.unit == "ns" - st = collector.get_static() - assert st == [] or st is None + p1 = procs[0].process_list[1].process_info + assert p1.name == "test" + assert p1.pid == 9999 -def test_partition(collector, install_fake_amdsmi): - amdsmi = install_fake_amdsmi - amdsmi.amdsmi_get_gpu_memory_partition = lambda h: "NPS2" - amdsmi.amdsmi_get_gpu_compute_partition = lambda h: "CPX_ENABLED" +def test_get_partition(collector): + """Test partition parsing""" p = collector.get_partition() assert p is not None assert len(p.memory_partition) == 1 and len(p.compute_partition) == 1 - assert p.memory_partition[0].partition_type == "NPS2" - assert p.compute_partition[0].partition_type == "CPX_ENABLED" + assert p.memory_partition[0].partition_type == "NPS1" + assert p.compute_partition[0].partition_type == "CPX_DISABLED" + + +def test_get_firmware(collector): + """Test firmware parsing""" + fw = collector.get_firmware() + assert fw is not None and len(fw) == 1 + assert fw[0].gpu == 0 + assert len(fw[0].fw_list) == 2 + assert fw[0].fw_list[0].fw_name == "SMU" + assert fw[0].fw_list[0].fw_version == "55.33" -def test_pcie(collector, install_fake_amdsmi): - if hasattr(install_fake_amdsmi, "amdsmi_get_pcie_info"): - delattr(install_fake_amdsmi, "amdsmi_get_pcie_info") +def test_get_static(collector): + """Test static data parsing""" stat = collector.get_static() assert stat is not None and len(stat) == 1 - assert stat[0].bus is not None - ms = stat[0].bus.max_pcie_speed - assert ms is None or ms.unit == "GT/s" + s = stat[0] + + # ASIC + assert s.asic.market_name == "SomeGPU" + assert s.asic.vendor_name == "AMD" + assert s.asic.num_compute_units == 224 + + # Board + assert s.board.amdsmi_model_number == "Board-42" + assert s.board.manufacturer_name == "ACME" + + # Bus/PCIe + assert s.bus.bdf == "0000:0b:00.0" + assert s.bus.max_pcie_width is not None + assert s.bus.max_pcie_speed is not None + + # VRAM + assert s.vram.type == "HBM3" + assert s.vram.vendor == "Micron" + # Cache + assert s.cache_info is not None and len(s.cache_info) == 1 + cache = s.cache_info[0] + assert cache.cache_level.value == 1 + assert cache.cache_properties -def test_cache(collector): + if s.clock is not None: + assert s.clock.frequency is not None + + +def test_cache_properties_parsing(collector): + """Test cache properties string parsing""" stat = collector.get_static() item = stat[0].cache_info[0] assert isinstance(item.cache.value, str) and item.cache.value.startswith("Label_") @@ -290,24 +347,46 @@ def test_cache(collector): assert {"PropertyA", "PropertyB", "PropertyC"}.issubset(set(item.cache_properties)) -def test_process_list(collector): - procs = collector.get_process() - assert procs and procs[0].process_list - p0 = procs[0].process_list[0].process_info - assert p0.pid == 4242 - assert p0.mem is not None and p0.mem.unit == "B" - assert p0.usage.gfx is not None and p0.usage.gfx.unit == "ns" - p1 = procs[0].process_list[1].process_info - assert p1.name == "N/A" - assert isinstance(p1.pid, int) +def test_json_parse_error(conn_mock, system_info, monkeypatch): + """Test handling of malformed JSON""" + + def mock_bad_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "version --json" in cmd: + return make_cmd_result("{ invalid json }") + return make_cmd_result("") + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_bad_json) -def test_smi_try(monkeypatch, install_fake_amdsmi, collector): - def raise_not_supported(*a, **kw): - raise AmdSmiLibraryError(2) # NOT_SUPPORTED + result, data = c.collect_data() + assert data is not None + assert data.version is None + assert len(result.events) > 0 # Should have error events - install_fake_amdsmi.amdsmi_get_gpu_memory_partition = raise_not_supported - p = collector.get_partition() - assert p is not None - assert len(p.memory_partition) == 1 +def test_command_error(conn_mock, system_info, monkeypatch): + """Test handling of command execution errors""" + + def mock_cmd_error(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + return make_cmd_result("", "Command failed", 1) + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_cmd_error) + + result, data = c.collect_data() + assert data is not None + assert data.version is None + assert data.gpu_list == [] + assert len(result.events) > 0 # Should have error events From 555aba8a183c248a6c48f650c33a39bb7fa99b84 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Sat, 15 Nov 2025 10:26:37 -0600 Subject: [PATCH 35/38] undid var name change + added vars back + pytest fix --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 9 ++- .../plugins/inband/amdsmi/amdsmi_collector.py | 81 ++++++++++++++----- .../plugins/inband/amdsmi/amdsmidata.py | 34 ++++---- test/unit/plugin/test_amdsmi_collector.py | 8 +- 4 files changed, 92 insertions(+), 40 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index c15c5c6e..18f2ef85 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -177,7 +177,7 @@ def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]): "subsystem_id": {gpu.asic.subsystem_id for gpu in amdsmi_static_data}, "device_id": {gpu.asic.device_id for gpu in amdsmi_static_data}, "rev_id": {gpu.asic.rev_id for gpu in amdsmi_static_data}, - "num_compute_units": {gpu.asic.num_compute_units for gpu in amdsmi_static_data}, + "num_compute_units": {str(gpu.asic.num_compute_units) for gpu in amdsmi_static_data}, "target_graphics_version": { gpu.asic.target_graphics_version for gpu in amdsmi_static_data }, @@ -330,10 +330,13 @@ def check_pldm_version( pldm_missing_gpus: list[int] = [] for fw_data in amdsmi_fw_data: gpu = fw_data.gpu + if isinstance(fw_data.fw_list, str): + pldm_missing_gpus.append(gpu) + continue for fw_info in fw_data.fw_list: - if PLDM_STRING == fw_info.fw_name and expected_pldm_version != fw_info.fw_version: + if PLDM_STRING == fw_info.fw_id and expected_pldm_version != fw_info.fw_version: mismatched_gpus.append(gpu) - if PLDM_STRING == fw_info.fw_name: + if PLDM_STRING == fw_info.fw_id: break else: pldm_missing_gpus.append(gpu) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index fc20df14..ed9def3f 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -388,18 +388,15 @@ def get_process(self) -> Optional[list[Processes]]: enc=self._valueunit(eu.get("enc"), "ns"), ) - cu_occ = self._valueunit(entry.get("cu_occupancy"), "") - try: plist.append( ProcessListItem( process_info=ProcessInfo( name=str(name), pid=pid, - mem=mem_vu, memory_usage=mem_usage, + mem_usage=mem_vu, usage=usage, - cu_occupancy=cu_occ, ) ) ) @@ -525,7 +522,7 @@ def get_firmware(self) -> Optional[list[Fw]]: ver = e.get("fw_version") normalized.append( FwListItem( - fw_name="" if fid is None else str(fid), + fw_id="" if fid is None else str(fid), fw_version="" if ver is None else str(ver), ) ) @@ -593,6 +590,22 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: ) # ASIC + oam_id_raw = asic.get("oam_id") + if oam_id_raw in (None, "", "N/A"): + oam_id_val: Union[int, str] = "N/A" + elif isinstance(oam_id_raw, str): + oam_id_val = oam_id_raw + else: + oam_id_val = int(oam_id_raw) if oam_id_raw is not None else "N/A" + + num_cu_raw = asic.get("num_compute_units") + if num_cu_raw in (None, "", "N/A"): + num_cu_val: Union[int, str] = "N/A" + elif isinstance(num_cu_raw, str): + num_cu_val = num_cu_raw + else: + num_cu_val = int(num_cu_raw) if num_cu_raw is not None else "N/A" + asic_model = StaticAsic( market_name=self._normalize( asic.get("market_name") or asic.get("asic_name"), default="" @@ -604,8 +617,8 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: subsystem_id=str(asic.get("subsystem_id", "")), rev_id=str(asic.get("rev_id", "")), asic_serial=str(asic.get("asic_serial", "")), - oam_id=int(asic.get("oam_id", 0) or 0), - num_compute_units=int(asic.get("num_compute_units", 0) or 0), + oam_id=oam_id_val, + num_compute_units=num_cu_val, target_graphics_version=str(asic.get("target_graphics_version", "")), ) @@ -621,12 +634,14 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: ) # Driver - driver_model = None - if driver: - driver_model = StaticDriver( - name=self._normalize(driver.get("driver_name"), default="unknown"), - version=self._normalize(driver.get("driver_version"), default="unknown"), - ) + driver_model = StaticDriver( + name=self._normalize( + driver.get("driver_name") if driver else None, default="unknown" + ), + version=self._normalize( + driver.get("driver_version") if driver else None, default="unknown" + ), + ) # VBIOS vbios_model: Optional[StaticVbios] = None @@ -640,8 +655,15 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: # NUMA numa_node = int(numa.get("node", 0) or 0) - affinity = int(numa.get("affinity", 0) or 0) - numa_model = StaticNuma(node=numa_node, affinity=affinity) + affinity_raw = numa.get("affinity") + if affinity_raw in (None, "", "N/A"): + affinity_val: Union[int, str] = "N/A" + elif isinstance(affinity_raw, str): + affinity_val = affinity_raw + else: + affinity_val = int(affinity_raw) if affinity_raw is not None else "N/A" + + numa_model = StaticNuma(node=numa_node, affinity=affinity_val) # VRAM vram_type = str(vram.get("vram_type", "") or "unknown") @@ -672,7 +694,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: cache_info_model = self._parse_cache_info(cache) # Clock - clock_model = self._parse_clock(clock) + clock_dict_model = self._parse_clock_dict(clock) try: out.append( @@ -691,7 +713,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: vram=vram_model, cache_info=cache_info_model, partition=None, - clock=clock_model, + clock=clock_dict_model, ) ) except ValidationError as e: @@ -928,10 +950,33 @@ def _fmt(n: Optional[int]) -> Optional[str]: {"Level 0": level0, "Level 1": level1, "Level 2": level2} ) - return StaticClockData(frequency=levels, current=current) + # Use the alias "current level" as defined in the model + return StaticClockData.model_validate( + {"frequency_levels": levels, "current level": current} + ) except ValidationError: return None + def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockData, None]]]: + """Parse clock data into dictionary structure + + Args: + data (dict): Clock data from amd-smi + + Returns: + Optional[dict[str, Union[StaticClockData, None]]]: dictionary of clock data or None + """ + if not isinstance(data, dict): + return None + + clock_dict: dict[str, Union[StaticClockData, None]] = {} + + clock_data = self._parse_clock(data) + if clock_data: + clock_dict["clk"] = clock_data + + return clock_dict if clock_dict else None + def collect_data( self, args: Any = None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 821bb10b..fd9eae41 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -2,6 +2,7 @@ from typing import Any, List, Mapping, Optional, Union from pydantic import ( + AliasChoices, BaseModel, ConfigDict, Field, @@ -156,12 +157,10 @@ class ProcessUsage(BaseModel): class ProcessInfo(BaseModel): name: str pid: int - - mem: Optional[ValueUnit] = None memory_usage: ProcessMemoryUsage + mem_usage: Optional[ValueUnit] usage: ProcessUsage - cu_occupancy: Optional[ValueUnit] = None - na_validator = field_validator("mem", "cu_occupancy", mode="before")(na_to_none) + na_validator = field_validator("mem_usage", mode="before")(na_to_none) class ProcessListItem(BaseModel): @@ -175,13 +174,13 @@ class Processes(BaseModel): # FW class FwListItem(BaseModel): + fw_id: str fw_version: str - fw_name: str class Fw(BaseModel): gpu: int - fw_list: List[FwListItem] + fw_list: Union[List[FwListItem], str] class AmdSmiListItem(BaseModel): @@ -256,8 +255,8 @@ class StaticAsic(BaseModel): subsystem_id: str rev_id: str asic_serial: str - oam_id: int - num_compute_units: int + oam_id: Union[int, str] # can be N/A + num_compute_units: Union[int, str] # can be N/A target_graphics_version: str @@ -320,8 +319,11 @@ class StaticBoard(BaseModel): class StaticPartition(BaseModel): + # The name for compute_partition has changed we will support both for now - compute_partition: str + compute_partition: str = Field( + validation_alias=AliasChoices("compute_partition", "accelerator_partition") + ) memory_partition: str partition_id: int @@ -345,7 +347,7 @@ class StaticXgmiPlpd(BaseModel): class StaticNuma(BaseModel): node: int - affinity: int + affinity: Union[int, str] # can be N/A class StaticVram(AmdSmiBaseModel): @@ -383,10 +385,10 @@ class StaticClockData(BaseModel): model_config = ConfigDict( populate_by_name=True, ) - frequency: StaticFrequencyLevels + frequency_levels: StaticFrequencyLevels - current: Optional[int] = Field(..., alias="current") - na_validator = field_validator("current", mode="before")(na_to_none) + current_level: Optional[int] = Field(..., alias="current level") + na_validator = field_validator("current_level", mode="before")(na_to_none) class AmdSmiStatic(BaseModel): @@ -397,7 +399,7 @@ class AmdSmiStatic(BaseModel): bus: StaticBus vbios: Optional[StaticVbios] limit: Optional[StaticLimit] - driver: Optional[StaticDriver] + driver: StaticDriver board: StaticBoard soc_pstate: Optional[StaticSocPstate] xgmi_plpd: Optional[StaticXgmiPlpd] @@ -405,8 +407,8 @@ class AmdSmiStatic(BaseModel): numa: StaticNuma vram: StaticVram cache_info: List[StaticCacheInfoItem] - partition: Optional[StaticPartition] = None - clock: Optional[StaticClockData] = None + partition: Optional[StaticPartition] = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ + clock: Optional[dict[str, Union[StaticClockData, None]]] = None na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) na_validator = field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")( na_to_none diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index e7b13cd4..6783c407 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -277,7 +277,7 @@ def test_get_process(collector): p0 = procs[0].process_list[0].process_info assert p0.name == "python" assert p0.pid == 4242 - assert p0.mem is not None and p0.mem.unit == "B" + assert p0.mem_usage is not None and p0.mem_usage.unit == "B" assert p0.usage.gfx is not None and p0.usage.gfx.unit == "ns" p1 = procs[0].process_list[1].process_info @@ -300,7 +300,7 @@ def test_get_firmware(collector): assert fw is not None and len(fw) == 1 assert fw[0].gpu == 0 assert len(fw[0].fw_list) == 2 - assert fw[0].fw_list[0].fw_name == "SMU" + assert fw[0].fw_list[0].fw_id == "SMU" assert fw[0].fw_list[0].fw_version == "55.33" @@ -335,7 +335,9 @@ def test_get_static(collector): assert cache.cache_properties if s.clock is not None: - assert s.clock.frequency is not None + assert isinstance(s.clock, dict) + if "clk" in s.clock and s.clock["clk"] is not None: + assert s.clock["clk"].frequency_levels is not None def test_cache_properties_parsing(collector): From 705a61ed8837b74839cc128ceb126aed24e40f1c Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Sat, 15 Nov 2025 10:57:59 -0600 Subject: [PATCH 36/38] Logging warning when user is missing group so its not an error --- .../plugins/inband/amdsmi/amdsmi_collector.py | 69 +++++++++++++++---- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index ed9def3f..389e5044 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -99,20 +99,58 @@ def _run_amd_smi(self, cmd: str) -> Optional[str]: Optional[str]: stdout from command or None on error """ cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}") + + # Check for known warnings that can be ignored + is_group_warning = ( + "User is missing the following required groups" in cmd_ret.stderr + or "User is missing the following required groups" in cmd_ret.stdout + ) + + # Log warning if user is missing group if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: - self._log_event( - category=EventCategory.APPLICATION, - description="Error running amd-smi command", - data={ - "command": cmd, - "exit_code": cmd_ret.exit_code, - "stderr": cmd_ret.stderr, - }, - priority=EventPriority.ERROR, - console_log=True, - ) - return None - return cmd_ret.stdout + if not is_group_warning: + self._log_event( + category=EventCategory.APPLICATION, + description="Error running amd-smi command", + data={ + "command": cmd, + "exit_code": cmd_ret.exit_code, + "stderr": cmd_ret.stderr, + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + else: + self._log_event( + category=EventCategory.APPLICATION, + description="amd-smi warning (continuing): User missing required groups", + data={ + "command": cmd, + "warning": cmd_ret.stderr or cmd_ret.stdout, + }, + priority=EventPriority.WARNING, + console_log=False, + ) + + stdout = cmd_ret.stdout + if is_group_warning and stdout: + lines = stdout.split("\n") + cleaned_lines = [ + line + for line in lines + if not any( + warn in line + for warn in [ + "RuntimeError:", + "WARNING: User is missing", + "Please add user to these groups", + ] + ) + ] + stdout = "\n".join(cleaned_lines).strip() + + return stdout def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: """Run amd-smi command with json output @@ -132,7 +170,10 @@ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: self._log_event( category=EventCategory.APPLICATION, description=f"Error parsing command: `{cmd}` json data", - data={"cmd": cmd, "exception": get_exception_traceback(e)}, + data={ + "cmd": cmd, + "exception": get_exception_traceback(e), + }, priority=EventPriority.ERROR, console_log=True, ) From 7e5830ee2be99e2617dad28ba6cddd060c0d4259 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 17 Nov 2025 08:57:55 -0600 Subject: [PATCH 37/38] addressed reviews --- .../plugins/inband/amdsmi/amdsmi_collector.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 389e5044..f74268a4 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -61,7 +61,7 @@ StaticXgmiPlpd, ValueUnit, ) -from nodescraper.utils import get_exception_details, get_exception_traceback +from nodescraper.utils import get_exception_traceback class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): @@ -294,12 +294,12 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: firmware=firmware, static=statics, ) - except ValidationError as e: - self.logger.warning("Validation err: %s", e) + except ValidationError as err: + self.logger.warning("Validation err: %s", err) self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiDataModel", - data=get_exception_details(e), + data={"errors": err.errors(include_url=False)}, priority=EventPriority.ERROR, ) return None @@ -325,11 +325,11 @@ def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: amdsmi_library_version=version_data.get("amdsmi_library_version", ""), rocm_version=version_data.get("rocm_version", ""), ) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiVersion", - data=get_exception_details(e), + data={"errors": err.errors(include_url=False)}, priority=EventPriority.WARNING, ) return None @@ -368,11 +368,11 @@ def _to_int(x: Any, default: int = 0) -> int: partition_id=_to_int(item.get("partition_id", 0)), ) ) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiListItem", - data={"exception": get_exception_traceback(e), "item": item}, + data={"errors": err.errors(include_url=False), "item": item}, priority=EventPriority.WARNING, ) @@ -441,12 +441,12 @@ def get_process(self) -> Optional[list[Processes]]: ) ) ) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build ProcessListItem; skipping entry", data={ - "exception": get_exception_traceback(e), + "errors": err.errors(include_url=False), "gpu_index": gpu_idx, "entry": repr(entry), }, @@ -456,11 +456,11 @@ def get_process(self) -> Optional[list[Processes]]: try: out.append(Processes(gpu=gpu_idx, process_list=plist)) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build Processes", - data={"exception": get_exception_traceback(e), "gpu_index": gpu_idx}, + data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx}, priority=EventPriority.WARNING, ) @@ -492,12 +492,12 @@ def get_partition(self) -> Optional[Partition]: memparts.append( PartitionMemory(gpu_id=gpu_idx, partition_type=str(mem_pt) if mem_pt else None) ) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build PartitionMemory", data={ - "exception": get_exception_traceback(e), + "errors": err.errors(include_url=False), "gpu_index": gpu_idx, "data": mem_pt, }, @@ -510,12 +510,12 @@ def get_partition(self) -> Optional[Partition]: gpu_id=gpu_idx, partition_type=str(comp_pt) if comp_pt else None ) ) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build PartitionCompute", data={ - "exception": get_exception_traceback(e), + "errors": err.errors(include_url=False), "gpu_index": gpu_idx, "data": comp_pt, }, @@ -524,11 +524,11 @@ def get_partition(self) -> Optional[Partition]: try: return Partition(memory_partition=memparts, compute_partition=computeparts) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build Partition", - data={"exception": get_exception_traceback(e)}, + data={"errors": err.errors(include_url=False)}, priority=EventPriority.WARNING, ) return None @@ -577,11 +577,11 @@ def get_firmware(self) -> Optional[list[Fw]]: try: out.append(Fw(gpu=gpu_idx, fw_list=normalized)) - except ValidationError as e: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Failed to build Fw", - data={"exception": get_exception_traceback(e), "gpu_index": gpu_idx}, + data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx}, priority=EventPriority.WARNING, ) @@ -757,12 +757,12 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: clock=clock_dict_model, ) ) - except ValidationError as e: - self.logger.error(e) + except ValidationError as err: + self.logger.error(err) self._log_event( category=EventCategory.APPLICATION, description="Failed to build AmdSmiStatic", - data={"exception": get_exception_traceback(e), "gpu_index": gpu_idx}, + data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx}, priority=EventPriority.WARNING, ) @@ -920,11 +920,11 @@ def _as_list_str(v: Any) -> list[str]: num_cache_instance=num_cache_instance, ) ) - except ValidationError as ve: + except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, description="Bad cache info entry from amd-smi; skipping", - data={"entry": repr(e), "exception": get_exception_traceback(ve)}, + data={"entry": repr(e), "errors": err.errors(include_url=False)}, priority=EventPriority.WARNING, ) continue From 94026958ff17197201a74cbf424133828c05b405 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 17 Nov 2025 10:05:23 -0600 Subject: [PATCH 38/38] fix for when amd-smi reports extra fields --- .../plugins/inband/amdsmi/amdsmi_collector.py | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index f74268a4..ca1d077c 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -100,15 +100,33 @@ def _run_amd_smi(self, cmd: str) -> Optional[str]: """ cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}") - # Check for known warnings that can be ignored + # Check for known warnings and errors that can be handled is_group_warning = ( "User is missing the following required groups" in cmd_ret.stderr or "User is missing the following required groups" in cmd_ret.stdout ) + # Check for known amd-smi internal bugs + is_amdsmi_internal_error = any( + pattern in cmd_ret.stderr for pattern in ["KeyError:", "AttributeError:", "IndexError:"] + ) + # Log warning if user is missing group if cmd_ret.stderr != "" or cmd_ret.exit_code != 0: - if not is_group_warning: + if is_amdsmi_internal_error: + self._log_event( + category=EventCategory.SW_DRIVER, + description="amd-smi internal error detected", + data={ + "command": cmd, + "exit_code": cmd_ret.exit_code, + "stderr": cmd_ret.stderr, + }, + priority=EventPriority.WARNING, + console_log=True, + ) + return None + elif not is_group_warning: self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi command", @@ -595,7 +613,23 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: """ ret = self._run_amd_smi_dict("static -g all") if not ret: - return [] + self.logger.info("Bulk static query failed, attempting per-GPU fallback") + gpu_list = self.get_gpu_list() + if gpu_list: + fallback_data: list[dict] = [] + for gpu in gpu_list: + gpu_data = self._run_amd_smi_dict(f"static -g {gpu.gpu}") + if gpu_data: + if isinstance(gpu_data, dict): + fallback_data.append(gpu_data) + elif isinstance(gpu_data, list): + fallback_data.extend(gpu_data) + if fallback_data: + ret = fallback_data + else: + return [] + else: + return [] if isinstance(ret, dict) and "gpu_data" in ret: ret = ret["gpu_data"]