diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index fed32aca..81c18f19 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -334,6 +334,7 @@ def process_args( plugin_arg_index = -1 plugin_arg_map = {} + invalid_plugins = [] if plugin_arg_index != -1 and plugin_arg_index != len(raw_arg_input) - 1: top_level_args = raw_arg_input[: plugin_arg_index + 1] plugin_args = raw_arg_input[plugin_arg_index + 1 :] @@ -344,12 +345,26 @@ def process_args( else: cur_plugin = None for arg in plugin_args: - if arg in plugin_names: + # Handle comma-separated plugin names (but not arguments) + if not arg.startswith("-") and "," in arg: + # Split comma-separated plugin names + for potential_plugin in arg.split(","): + potential_plugin = potential_plugin.strip() + if potential_plugin in plugin_names: + plugin_arg_map[potential_plugin] = [] + cur_plugin = potential_plugin + elif potential_plugin: + # Track invalid plugin names to log event later + invalid_plugins.append(potential_plugin) + elif arg in plugin_names: plugin_arg_map[arg] = [] cur_plugin = arg elif cur_plugin: plugin_arg_map[cur_plugin].append(arg) - return (top_level_args, plugin_arg_map) + elif not arg.startswith("-"): + # Track invalid plugin names to log event later + invalid_plugins.append(arg) + return (top_level_args, plugin_arg_map, invalid_plugins) def main(arg_input: Optional[list[str]] = None): @@ -367,7 +382,9 @@ def main(arg_input: Optional[list[str]] = None): parser, plugin_subparser_map = build_parser(plugin_reg, config_reg) try: - top_level_args, plugin_arg_map = process_args(arg_input, list(plugin_subparser_map.keys())) + top_level_args, plugin_arg_map, invalid_plugins = process_args( + arg_input, list(plugin_subparser_map.keys()) + ) parsed_args = parser.parse_args(top_level_args) system_info = get_system_info(parsed_args) @@ -387,6 +404,13 @@ def main(arg_input: Optional[list[str]] = None): if log_path: logger.info("Log path: %s", log_path) + # Log warning if invalid plugin names were provided + if invalid_plugins: + logger.warning( + "Invalid plugin name(s) ignored: %s. Use 'describe plugin' to list available plugins.", + ", ".join(invalid_plugins), + ) + if parsed_args.subcmd == "summary": generate_summary(parsed_args.search_path, parsed_args.output_path, logger) sys.exit(0) diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index e47a6cc8..d03010c6 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -173,6 +173,12 @@ def run_queue(self) -> list[PluginResult]: global_run_args = self.apply_global_args_to_plugin( plugin_inst, plugin_class, self.plugin_config.global_args ) + # Merge analysis_args and collection_args + for args_key in ["analysis_args", "collection_args"]: + if args_key in global_run_args and args_key in run_payload: + # Merge: global args override plugin-specific args keys specified in both global and plugin-specific args + run_payload[args_key].update(global_run_args[args_key]) + del global_run_args[args_key] run_payload.update(global_run_args) except ValueError as ve: self.logger.error( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 5e16f82e..085f022f 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -23,8 +23,9 @@ # SOFTWARE. # ############################################################################### +import io from collections import defaultdict -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union from nodescraper.enums import EventCategory, EventPriority from nodescraper.interfaces import DataAnalyzer @@ -34,16 +35,19 @@ AmdSmiDataModel, AmdSmiMetric, AmdSmiStatic, + AmdSmiTstData, EccData, Fw, Partition, Processes, + XgmiMetrics, ) from .analyzer_args import AmdSmiAnalyzerArgs +from .cper import CperAnalysisTaskMixin -class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]): - """""" +class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]): + """Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics""" DATA_MODEL = AmdSmiDataModel @@ -441,7 +445,7 @@ def check_static_data( mismatches: list[tuple[int, str, str, str]] = [] - expected_data: Dict[str, Optional[str]] = { + expected_data: dict[str, Optional[str]] = { "vendor_id": vendor_id, "subvendor_id": subvendor_id, "vendor_name": "Advanced Micro Devices Inc", @@ -500,24 +504,24 @@ def check_static_data( def _format_static_mismatch_payload( self, - mismatches: List[tuple[int, str, str, str]], - ) -> Dict[str, Any]: + mismatches: list[tuple[int, str, str, str]], + ) -> dict[str, Any]: """Helper function for pretty printing mismatch in expected data Args: - mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU + mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU Returns: - Dict[str, Any]: dict of mismatched data per GPU + dict[str, Any]: dict of mismatched data per GPU """ - per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list) + per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list) field_set: set[str] = set() for gpu, field, expected, actual in mismatches: field_set.add(field) per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual}) - per_gpu_list: List[Dict[str, Any]] = [ + per_gpu_list: list[dict[str, Any]] = [ {"gpu": gpu, "mismatches": entries} for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0]) ] @@ -635,6 +639,97 @@ def check_expected_memory_partition_mode( }, ) + def check_expected_xgmi_link_speed( + self, + xgmi_metric: Optional[list[XgmiMetrics]], + expected_xgmi_speed: Optional[list[float]] = None, + ): + """Check the XGMI link speed for all GPUs + + Args: + xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data + expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s) + """ + if xgmi_metric is None or len(xgmi_metric) == 0: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed data is not available and cannot be checked", + priority=EventPriority.WARNING, + data={"xgmi_metric": xgmi_metric}, + ) + return + + if expected_xgmi_speed is None or len(expected_xgmi_speed) == 0: + self._log_event( + category=EventCategory.IO, + description="Expected XGMI speed not configured, skipping XGMI link speed check", + priority=EventPriority.WARNING, + ) + return + + for xgmi_data in xgmi_metric: + link_metric = xgmi_data.link_metrics + try: + if link_metric.bit_rate is None or link_metric.bit_rate.value is None: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed is not available", + priority=EventPriority.ERROR, + data={ + "gpu": xgmi_data.gpu, + "xgmi_bit_rate": ( + link_metric.bit_rate.unit if link_metric.bit_rate else "N/A" + ), + }, + ) + continue + + xgmi_float = float(link_metric.bit_rate.value) + except ValueError: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed is not a valid number", + priority=EventPriority.ERROR, + data={ + "gpu": xgmi_data.gpu, + "xgmi_bit_rate": ( + link_metric.bit_rate.value if link_metric.bit_rate else "N/A" + ), + }, + ) + continue + + if xgmi_float not in expected_xgmi_speed: + self._log_event( + category=EventCategory.IO, + description="XGMI link speed is not as expected", + priority=EventPriority.ERROR, + data={ + "gpu": xgmi_data.gpu, + "xgmi_bit_rate": xgmi_float, + "expected_xgmi_speed": expected_xgmi_speed, + }, + console_log=True, + ) + + def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData): + """Check AMD SMI test results + + Args: + amdsmitst_data (AmdSmiTstData): AMD SMI test data + """ + if amdsmitst_data.failed_test_count > 0: + self._log_event( + category=EventCategory.APPLICATION, + description=f"{amdsmitst_data.failed_test_count} failed tests running amdsmitst", + priority=EventPriority.ERROR, + data={ + "failed_test_count": amdsmitst_data.failed_test_count, + "failed_tests": amdsmitst_data.failed_tests, + }, + console_log=True, + ) + def analyze_data( self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None ) -> TaskResult: @@ -705,4 +800,22 @@ def analyze_data( if args.expected_pldm_version: self.check_pldm_version(data.firmware, args.expected_pldm_version) + if data.cper_data: + self.analyzer_cpers( + { + file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents) + for file_model_obj in data.cper_data + }, + analysis_range_start=args.analysis_range_start, + analysis_range_end=args.analysis_range_end, + ) + + if data.xgmi_metric and len(data.xgmi_metric) > 0: + self.check_expected_xgmi_link_speed( + data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed + ) + + if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0: + self.check_amdsmitst(data.amdsmitst_data) + return self.result diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index c8a0eb60..4c78e2f5 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -23,19 +23,24 @@ # SOFTWARE. # ############################################################################### +import io import json -from typing import Any, Optional, Union +import re +from tarfile import TarFile +from typing import Any, Dict, List, Optional, Union from pydantic import ValidationError from nodescraper.base.inbandcollectortask import InBandDataCollector from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult +from nodescraper.models.datamodel import FileModel from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiListItem, AmdSmiStatic, AmdSmiVersion, + EccState, Fw, FwListItem, Partition, @@ -55,6 +60,7 @@ StaticFrequencyLevels, StaticNuma, StaticPolicy, + StaticRas, StaticSocPstate, StaticVbios, StaticVram, @@ -73,12 +79,14 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]): DATA_MODEL = AmdSmiDataModel - CMD_VERSION = "amd-smi version --json" - CMD_LIST = "amd-smi list --json" - CMD_PROCESS = "amd-smi process --json" - CMD_PARTITION = "amd-smi partition --json" - CMD_FIRMWARE = "amd-smi firmware --json" - CMD_STATIC = "amd-smi static -g all --json" + CMD_VERSION = "version --json" + CMD_LIST = "list --json" + CMD_PROCESS = "process --json" + CMD_PARTITION = "partition --json" + CMD_FIRMWARE = "firmware --json" + CMD_STATIC = "static -g all --json" + CMD_STATIC_GPU = "static -g {gpu_id} --json" + CMD_RAS = "ras --cper --folder={folder}" def _check_amdsmi_installed(self) -> bool: """Check if amd-smi is installed @@ -323,6 +331,7 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: firmware = self.get_firmware() gpu_list = self.get_gpu_list() statics = self.get_static() + cper_data = self.get_cper_data() except Exception as e: self._log_event( category=EventCategory.APPLICATION, @@ -342,6 +351,7 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]: partition=partition, firmware=firmware, static=statics, + cper_data=cper_data, ) except ValidationError as err: self.logger.warning("Validation err: %s", err) @@ -359,7 +369,7 @@ def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: Returns: Optional[AmdSmiVersion]: version information or None on error """ - ret = self._run_amd_smi_dict("version") + ret = self._run_amd_smi_dict(self.CMD_VERSION) if not ret or not isinstance(ret, list) or len(ret) == 0: return None @@ -389,7 +399,7 @@ def get_gpu_list(self) -> Optional[list[AmdSmiListItem]]: Returns: Optional[list[AmdSmiListItem]]: list of GPU info items """ - ret = self._run_amd_smi_dict("list") + ret = self._run_amd_smi_dict(self.CMD_LIST) if not ret: return [] @@ -433,7 +443,7 @@ def get_process(self) -> Optional[list[Processes]]: Returns: Optional[list[Processes]]: list of GPU processes """ - ret = self._run_amd_smi_dict("process") + ret = self._run_amd_smi_dict(self.CMD_PROCESS) if not ret: return [] @@ -521,7 +531,7 @@ def get_partition(self) -> Optional[Partition]: Returns: Optional[Partition]: Partition data if available """ - ret = self._run_amd_smi_dict("partition") + ret = self._run_amd_smi_dict(self.CMD_PARTITION) if not ret: return None @@ -596,7 +606,7 @@ def get_firmware(self) -> Optional[list[Fw]]: Returns: Optional[list[Fw]]: List of firmware info per GPU """ - ret = self._run_amd_smi_dict("firmware") + ret = self._run_amd_smi_dict(self.CMD_FIRMWARE) if not ret: return [] @@ -650,14 +660,14 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: Returns: Optional[list[AmdSmiStatic]]: list of AmdSmiStatic instances or empty list """ - ret = self._run_amd_smi_dict("static -g all") + ret = self._run_amd_smi_dict(self.CMD_STATIC) if not ret: self.logger.info("Bulk static query failed, attempting per-GPU fallback") gpu_list = self.get_gpu_list() if gpu_list: fallback_data: list[dict] = [] for gpu in gpu_list: - gpu_data = self._run_amd_smi_dict(f"static -g {gpu.gpu}") + gpu_data = self._run_amd_smi_dict(self.CMD_STATIC_GPU.format(gpu_id=gpu.gpu)) if gpu_data: if isinstance(gpu_data, dict): fallback_data.append(gpu_data) @@ -689,6 +699,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: driver = item.get("driver", {}) or {} numa = item.get("numa", {}) or {} vram = item.get("vram", {}) or {} + ras = item.get("ras", {}) or {} cache = item.get("cache", {}) or {} clock = item.get("clock", {}) or {} soc_pstate = item.get("soc_pstate", {}) or {} @@ -804,6 +815,9 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: # XGMI PLPD xgmi_plpd_model = self._parse_xgmi_plpd(xgmi_plpd) + # RAS + ras_model = self._parse_ras(ras) + # Cache info cache_info_model = self._parse_cache_info(cache) @@ -820,6 +834,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: limit=None, driver=driver_model, board=board_model, + ras=ras_model, soc_pstate=soc_pstate_model, xgmi_plpd=xgmi_plpd_model, process_isolation="", @@ -943,6 +958,73 @@ def _parse_xgmi_plpd(self, data: dict) -> Optional[StaticXgmiPlpd]: except ValidationError: return None + def _parse_ras(self, data: dict) -> StaticRas: + """Parse RAS/ECC data + + Args: + data (dict): RAS data from amd-smi + + Returns: + StaticRas: StaticRas instance with default values if data is missing + """ + if not isinstance(data, dict): + # Return default RAS data + return StaticRas( + eeprom_version="N/A", + parity_schema=EccState.NA, + single_bit_schema=EccState.NA, + double_bit_schema=EccState.NA, + poison_schema=EccState.NA, + ecc_block_state={}, + ) + + def _to_ecc_state(value: Any) -> EccState: + """Convert string to EccState enum""" + if not value or not isinstance(value, str): + return EccState.NA + try: + return EccState(value.upper()) + except (ValueError, AttributeError): + return EccState.NA + + eeprom_version = str(data.get("eeprom_version", "N/A") or "N/A") + parity_schema = _to_ecc_state(data.get("parity_schema")) + single_bit_schema = _to_ecc_state(data.get("single_bit_schema")) + double_bit_schema = _to_ecc_state(data.get("double_bit_schema")) + poison_schema = _to_ecc_state(data.get("poison_schema")) + + ecc_block_state = data.get("ecc_block_state", {}) + ecc_block_state_final: Union[Dict[str, EccState], str] + if isinstance(ecc_block_state, dict): + parsed_blocks = {} + for block_name, block_state in ecc_block_state.items(): + parsed_blocks[block_name] = _to_ecc_state(block_state) + ecc_block_state_final = parsed_blocks + elif isinstance(ecc_block_state, str): + ecc_block_state_final = ecc_block_state + else: + ecc_block_state_final = {} + + try: + return StaticRas( + eeprom_version=eeprom_version, + parity_schema=parity_schema, + single_bit_schema=single_bit_schema, + double_bit_schema=double_bit_schema, + poison_schema=poison_schema, + ecc_block_state=ecc_block_state_final, + ) + except ValidationError: + # Return default if validation fails + return StaticRas( + eeprom_version="N/A", + parity_schema=EccState.NA, + single_bit_schema=EccState.NA, + double_bit_schema=EccState.NA, + poison_schema=EccState.NA, + ecc_block_state={}, + ) + def _parse_cache_info(self, data: dict) -> list[StaticCacheInfoItem]: """Parse cache info data @@ -1091,6 +1173,99 @@ def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockD return clock_dict if clock_dict else None + def get_cper_data(self) -> List[FileModel]: + """Collect CPER data from amd-smi ras command + + Returns: + list[FileModel]: List of CPER files or empty list if not supported/available + """ + try: + AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper" + # Ensure the cper folder exists but is empty + self._run_sut_cmd( + f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json", + sudo=False, + ) + # Run amd-smi ras command with sudo to collect CPER data + cper_cmd_ret = self._run_sut_cmd( + f"{self.AMD_SMI_EXE} {self.CMD_RAS.format(folder=AMD_SMI_CPER_FOLDER)}", + sudo=True, + ) + if cper_cmd_ret.exit_code != 0: + # Command failed, return empty list + return [] + cper_cmd = cper_cmd_ret.stdout + # search that a CPER is actually created here + regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd) + if not regex_cper_search: + # Early exit if no CPER files were created + return [] + # tar the cper folder + self._run_sut_cmd( + f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .", + sudo=True, + ) + # Load the tar files + cper_zip = self._read_sut_file( + f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False, log_artifact=True + ) + # Since encoding=None, this returns BinaryFileArtifact which has contents: bytes + if hasattr(cper_zip, "contents"): + io_bytes = io.BytesIO(cper_zip.contents) # type: ignore[attr-defined] + else: + return [] + del cper_zip # Free memory after reading the file + try: + with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file: + cper_data = [] + for member in tar_file.getmembers(): + if member.isfile() and member.name.endswith(".cper"): + file_content = tar_file.extractfile(member) + if file_content is not None: + # Decode the content, ignoring errors to avoid issues with binary data + # that may not be valid UTF-8 + file_content_bytes = file_content.read() + else: + file_content_bytes = b"" + cper_data.append( + FileModel(file_contents=file_content_bytes, file_name=member.name) + ) + # Since we do not log the cper data in the data model create an event informing the user if CPER created + if cper_data: + self._log_event( + category=EventCategory.APPLICATION, + description="CPER data has been extracted from amd-smi", + data={ + "cper_count": len(cper_data), + }, + priority=EventPriority.INFO, + ) + except Exception as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Error extracting cper data", + data={ + "exception": get_exception_traceback(e), + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return [] + return cper_data + except Exception as e: + # If any unexpected error occurs during CPER collection, log it and return empty list + # This ensures CPER collection failures don't break the entire data collection + self._log_event( + category=EventCategory.APPLICATION, + description="Error collecting CPER data", + data={ + "exception": get_exception_traceback(e), + }, + priority=EventPriority.WARNING, + console_log=False, + ) + return [] + def collect_data( self, args: Any = None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index cdf4af39..aacca2ac 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -1,16 +1,43 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### import re -from typing import Any, List, Mapping, Optional, Union +from enum import Enum +from typing import Any, Mapping, Optional, Union from pydantic import ( AliasChoices, BaseModel, ConfigDict, Field, + computed_field, field_validator, model_validator, ) -from nodescraper.models.datamodel import DataModel +from nodescraper.models.datamodel import DataModel, FileModel from nodescraper.utils import find_annotation_in_container _NUM_UNIT_RE = re.compile(r"^\s*([-+]?\d+(?:\.\d+)?)(?:\s*([A-Za-z%/][A-Za-z0-9%/._-]*))?\s*$") @@ -22,8 +49,8 @@ def na_to_none(values: Union[int, str]): return values -def na_to_none_list(values: List[Union[int, str, None]]) -> List[Union[int, str, None]]: - ret_list: List[Union[int, str, None]] = values.copy() +def na_to_none_list(values: list[Union[int, str, None]]) -> list[Union[int, str, None]]: + ret_list: list[Union[int, str, None]] = values.copy() for i in range(len(ret_list)): if ret_list[i] == "N/A": ret_list[i] = None @@ -163,13 +190,24 @@ class ProcessInfo(BaseModel): na_validator = field_validator("mem_usage", mode="before")(na_to_none) +class EccState(Enum): + ENABLED = "ENABLED" + DISABLED = "DISABLED" + NONE = "NONE" + PARITY = "PARITY" + SING_C = "SING_C" + MULT_UC = "MULT_UC" + POISON = "POISON" + NA = "N/A" + + class ProcessListItem(BaseModel): process_info: Union[ProcessInfo, str] class Processes(BaseModel): gpu: int - process_list: List[ProcessListItem] + process_list: list[ProcessListItem] # FW @@ -180,7 +218,7 @@ class FwListItem(BaseModel): class Fw(BaseModel): gpu: int - fw_list: Union[List[FwListItem], str] + fw_list: Union[list[FwListItem], str] class AmdSmiListItem(BaseModel): @@ -318,6 +356,15 @@ class StaticBoard(BaseModel): manufacturer_name: str +class StaticRas(BaseModel): + eeprom_version: str + parity_schema: EccState + single_bit_schema: EccState + double_bit_schema: EccState + poison_schema: EccState + ecc_block_state: Union[dict[str, EccState], str] + + class StaticPartition(BaseModel): # The name for compute_partition has changed we will support both for now @@ -336,13 +383,13 @@ class StaticPolicy(BaseModel): class StaticSocPstate(BaseModel): num_supported: int current_id: int - policies: List[StaticPolicy] + policies: list[StaticPolicy] class StaticXgmiPlpd(BaseModel): num_supported: int current_id: int - plpds: List[StaticPolicy] + plpds: list[StaticPolicy] class StaticNuma(BaseModel): @@ -363,7 +410,7 @@ class StaticVram(AmdSmiBaseModel): class StaticCacheInfoItem(AmdSmiBaseModel): cache: ValueUnit - cache_properties: List[str] + cache_properties: list[str] cache_size: Optional[ValueUnit] cache_level: ValueUnit max_num_cu_shared: ValueUnit @@ -401,12 +448,13 @@ class AmdSmiStatic(BaseModel): limit: Optional[StaticLimit] driver: StaticDriver board: StaticBoard + ras: StaticRas soc_pstate: Optional[StaticSocPstate] xgmi_plpd: Optional[StaticXgmiPlpd] process_isolation: str numa: StaticNuma vram: StaticVram - cache_info: List[StaticCacheInfoItem] + cache_info: list[StaticCacheInfoItem] partition: Optional[StaticPartition] = None # This has been removed in Amd-smi 26.0.0+d30a0afe+ clock: Optional[dict[str, Union[StaticClockData, None]]] = None na_validator_dict = field_validator("clock", mode="before")(na_to_none_dict) @@ -751,6 +799,133 @@ def validate_energy(cls, value: Optional[Any]) -> Optional[MetricEnergy]: return value +### LINK DATA ### + + +class LinkStatusTable(Enum): + UP = "U" + DOWN = "D" + DISABLED = "X" + + +class BiDirectionalTable(Enum): + SELF = "SELF" + TRUE = "T" + + +class DmaTable(Enum): + SELF = "SELF" + TRUE = "T" + + +class AtomicsTable(Enum): + SELF = "SELF" + TRUE = "64,32" + THIRTY_TWO = "32" + SIXTY_FOUR = "64" + + +class LinkTypes(Enum): + XGMI = "XGMI" + PCIE = "PCIE" + SELF = "SELF" + + +class AccessTable(Enum): + ENABLED = "ENABLED" + DISABLED = "DISABLED" + + +# XGMI +class XgmiLink(BaseModel): + gpu: int + bdf: str + read: Optional[ValueUnit] + write: Optional[ValueUnit] + na_validator = field_validator("read", "write", mode="before")(na_to_none) + + +class XgmiLinkMetrics(BaseModel): + bit_rate: Optional[ValueUnit] + max_bandwidth: Optional[ValueUnit] + link_type: str + links: list[XgmiLink] + na_validator = field_validator("max_bandwidth", "bit_rate", mode="before")(na_to_none) + + +class XgmiMetrics(BaseModel): + gpu: int + bdf: str + link_metrics: XgmiLinkMetrics + + +class XgmiLinks(BaseModel): + gpu: int + bdf: str + link_status: list[LinkStatusTable] + + +class CoherentTable(Enum): + COHERANT = "C" + NON_COHERANT = "NC" + SELF = "SELF" + + +# TOPO + + +class TopoLink(BaseModel): + gpu: int + bdf: str + weight: int + link_status: AccessTable + link_type: LinkTypes + num_hops: int + bandwidth: str + # The below fields are sometimes missing, so we use Optional + coherent: Optional[CoherentTable] = None + atomics: Optional[AtomicsTable] = None + dma: Optional[DmaTable] = None + bi_dir: Optional[BiDirectionalTable] = None + + @computed_field + def bandwidth_from(self) -> Optional[int]: + """Get the bandwidth from the link.""" + bw_split = self.bandwidth.split("-") + if len(bw_split) == 2: + return int(bw_split[0]) + else: + # If the bandwidth is not in the expected format, return None + return None + + @computed_field + def bandwidth_to(self) -> Optional[int]: + """Get the bandwidth to the link.""" + bw_split = self.bandwidth.split("-") + if len(bw_split) == 2: + return int(bw_split[1]) + else: + # If the bandwidth is not in the expected format, return None + return None + + +class Topo(BaseModel): + gpu: int + bdf: str + links: list[TopoLink] + + +class AmdSmiTstData(BaseModel): + "Summary of amdsmitst results, with list and count of passing/skipped/failed tests" + + passed_tests: list[str] = Field(default_factory=list) + skipped_tests: list[str] = Field(default_factory=list) + failed_tests: list[str] = Field(default_factory=list) + passed_test_count: int = 0 + skipped_test_count: int = 0 + failed_test_count: int = 0 + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. @@ -771,10 +946,15 @@ class AmdSmiDataModel(DataModel): gpu_list: Optional[list[AmdSmiListItem]] = Field(default_factory=list) partition: Optional[Partition] = None process: Optional[list[Processes]] = Field(default_factory=list) + topology: Optional[list[Topo]] = Field(default_factory=list) firmware: Optional[list[Fw]] = Field(default_factory=list) bad_pages: Optional[list[BadPages]] = Field(default_factory=list) static: Optional[list[AmdSmiStatic]] = Field(default_factory=list) metric: Optional[list[AmdSmiMetric]] = Field(default_factory=list) + xgmi_metric: Optional[list[XgmiMetrics]] = Field(default_factory=list) + xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list) + cper_data: Optional[list[FileModel]] = Field(default_factory=list) + amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData) def get_list(self, gpu: int) -> Optional[AmdSmiListItem]: """Get the gpu list item for the given gpu id.""" diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index b8721014..333f37ae 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -23,6 +23,7 @@ # SOFTWARE. # ############################################################################### +from datetime import datetime from typing import Optional from nodescraper.models import AnalyzerArgs @@ -44,3 +45,6 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): devid_ep: Optional[str] = None devid_ep_vf: Optional[str] = None sku_name: Optional[str] = None + expected_xgmi_speed: Optional[list[float]] = None + analysis_range_start: Optional[datetime] = None + analysis_range_end: Optional[datetime] = None diff --git a/nodescraper/plugins/inband/amdsmi/cper.py b/nodescraper/plugins/inband/amdsmi/cper.py new file mode 100644 index 00000000..548a38bd --- /dev/null +++ b/nodescraper/plugins/inband/amdsmi/cper.py @@ -0,0 +1,65 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + +import io +from datetime import datetime +from typing import Dict, Optional + +from nodescraper.enums import EventCategory, EventPriority + + +class CperAnalysisTaskMixin: + def analyzer_cpers( + self, + cper_data: Dict[str, io.BytesIO], + analysis_range_start: Optional[datetime], + analysis_range_end: Optional[datetime], + ): + """Generate Events from CPER data. + + Note: CPER analysis is not currently implemented. This is a stub that logs + a warning when CPER data is present. + + Args: + cper_data (Dict[str, io.BytesIO]): Dictionary of CPER file names to file contents + analysis_range_start (Optional[datetime]): Optional start time for analysis range + analysis_range_end (Optional[datetime]): Optional end time for analysis range + """ + # check the self._log_event method is defined + if not hasattr(self, "_log_event") or not callable(self._log_event): + raise NotImplementedError("The class must implement the _log_event method.") + + if cper_data: + self._log_event( + category=EventCategory.RAS, + priority=EventPriority.WARNING, + description="CPER data found but analysis is not implemented", + data={ + "cper_file_count": len(cper_data), + "cper_files": list(cper_data.keys()), + "note": "CPER analysis requires additional dependencies not currently available", + }, + ) diff --git a/test/functional/test_run_plugins.py b/test/functional/test_run_plugins.py index c8446f8b..6cb34cb4 100644 --- a/test/functional/test_run_plugins.py +++ b/test/functional/test_run_plugins.py @@ -88,9 +88,28 @@ def test_run_all_plugins_together(run_cli_command, all_plugins, tmp_path): def test_run_plugin_with_invalid_name(run_cli_command): - """Test that running a non-existent plugin fails gracefully.""" + """Test that running a non-existent plugin logs a warning and falls back to default config.""" result = run_cli_command(["run-plugins", "NonExistentPlugin"], check=False) - assert result.returncode != 0 - output = (result.stdout + result.stderr).lower() - assert "error" in output or "invalid" in output or "not found" in output + # Invalid plugin is ignored and default config runs instead + # Exit code depends on whether default config plugins succeed + output = result.stdout + result.stderr + # Check that warning was logged for invalid plugin + assert "Invalid plugin name(s) ignored: NonExistentPlugin" in output + # Check that default config was used + assert "running default config" in output.lower() or "NodeStatus" in output + # Verify it didn't crash + assert "Data written to csv file" in output + + +def test_run_comma_separated_plugins_with_invalid(run_cli_command): + """Test that comma-separated plugins run valid ones and ignore invalid ones.""" + result = run_cli_command(["run-plugins", "AmdSmiPlugin,SomePlugin"], check=False) + + output = result.stdout + result.stderr + # Check that warning was logged for invalid plugin + assert "Invalid plugin name(s) ignored: SomePlugin" in output + # Check that AmdSmiPlugin actually ran + assert "Running plugin AmdSmiPlugin" in output + # Verify it didn't crash + assert "Data written to csv file" in output diff --git a/test/unit/framework/test_cli.py b/test/unit/framework/test_cli.py index 79aca013..cd266ed9 100644 --- a/test/unit/framework/test_cli.py +++ b/test/unit/framework/test_cli.py @@ -115,12 +115,12 @@ def test_system_info_builder(): ( ["--sys-name", "test-sys", "--sys-sku", "test-sku"], ["TestPlugin1", "TestPlugin2"], - (["--sys-name", "test-sys", "--sys-sku", "test-sku"], {}), + (["--sys-name", "test-sys", "--sys-sku", "test-sku"], {}, []), ), ( ["--sys-name", "test-sys", "--sys-sku", "test-sku", "run-plugins", "-h"], ["TestPlugin1", "TestPlugin2"], - (["--sys-name", "test-sys", "--sys-sku", "test-sku", "run-plugins", "-h"], {}), + (["--sys-name", "test-sys", "--sys-sku", "test-sku", "run-plugins", "-h"], {}, []), ), ( [ @@ -143,6 +143,7 @@ def test_system_info_builder(): "TestPlugin1": ["--plugin1_arg", "test-val1"], "TestPlugin2": ["--plugin2_arg", "test-val2"], }, + [], ), ), ], diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py index 2e9b26d8..af7ab0f9 100644 --- a/test/unit/plugin/test_amdsmi_analyzer.py +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -31,7 +31,9 @@ from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiStatic, + AmdSmiTstData, AmdSmiVersion, + EccState, Fw, FwListItem, Partition, @@ -48,8 +50,11 @@ StaticDriver, StaticLimit, StaticNuma, + StaticRas, StaticVram, ValueUnit, + XgmiLinkMetrics, + XgmiMetrics, ) from nodescraper.plugins.inband.amdsmi.analyzer_args import AmdSmiAnalyzerArgs @@ -205,6 +210,14 @@ def create_static_gpu( product_name="", manufacturer_name="", ), + ras=StaticRas( + eeprom_version="1.0", + parity_schema=EccState.ENABLED, + single_bit_schema=EccState.ENABLED, + double_bit_schema=EccState.ENABLED, + poison_schema=EccState.ENABLED, + ecc_block_state={}, + ), soc_pstate=None, xgmi_plpd=None, process_isolation="NONE", @@ -540,6 +553,167 @@ def test_check_expected_memory_partition_mode_mismatch(mock_analyzer): assert len(analyzer.result.events) >= 0 +def test_check_expected_xgmi_link_speed_success(mock_analyzer): + """Test check_expected_xgmi_link_speed passes when XGMI speed matches.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=32.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + XgmiMetrics( + gpu=1, + bdf="0000:02:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=32.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_xgmi_link_speed_mismatch(mock_analyzer): + """Test check_expected_xgmi_link_speed logs error when speed doesn't match.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=25.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].category == "IO" + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "XGMI link speed is not as expected" in analyzer.result.events[0].description + + +def test_check_expected_xgmi_link_speed_multiple_valid_speeds(mock_analyzer): + """Test check_expected_xgmi_link_speed with multiple valid speeds.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=36.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + XgmiMetrics( + gpu=1, + bdf="0000:02:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=38.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[36.0, 38.0]) + + assert len(analyzer.result.events) == 0 + + +def test_check_expected_xgmi_link_speed_no_data(mock_analyzer): + """Test check_expected_xgmi_link_speed handles missing XGMI data.""" + analyzer = mock_analyzer + + analyzer.check_expected_xgmi_link_speed(None, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.WARNING + assert "XGMI link speed data is not available" in analyzer.result.events[0].description + + +def test_check_expected_xgmi_link_speed_missing_bit_rate(mock_analyzer): + """Test check_expected_xgmi_link_speed handles missing bit rate value.""" + analyzer = mock_analyzer + + xgmi_data = [ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=None, + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ] + + analyzer.check_expected_xgmi_link_speed(xgmi_data, expected_xgmi_speed=[32.0]) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "XGMI link speed is not available" in analyzer.result.events[0].description + + +def test_check_amdsmitst_success(mock_analyzer): + """Test check_amdsmitst passes when no tests failed.""" + analyzer = mock_analyzer + + tst_data = AmdSmiTstData( + passed_tests=["test1", "test2", "test3"], + skipped_tests=[], + failed_tests=[], + failed_test_count=0, + ) + + analyzer.check_amdsmitst(tst_data) + + assert len(analyzer.result.events) == 0 + + +def test_check_amdsmitst_failures(mock_analyzer): + """Test check_amdsmitst logs error when tests failed.""" + analyzer = mock_analyzer + + tst_data = AmdSmiTstData( + passed_tests=["test1", "test2"], + skipped_tests=["test3"], + failed_tests=["test4", "test5"], + failed_test_count=2, + ) + + analyzer.check_amdsmitst(tst_data) + + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].category == "APPLICATION" + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "2 failed tests running amdsmitst" in analyzer.result.events[0].description + assert analyzer.result.events[0].data["failed_test_count"] == 2 + assert analyzer.result.events[0].data["failed_tests"] == ["test4", "test5"] + + def test_analyze_data_full_workflow(mock_analyzer): """Test full analyze_data workflow with various checks.""" analyzer = mock_analyzer @@ -578,12 +752,31 @@ def test_analyze_data_full_workflow(mock_analyzer): ], partition=None, gpu_list=None, + xgmi_metric=[ + XgmiMetrics( + gpu=0, + bdf="0000:01:00.0", + link_metrics=XgmiLinkMetrics( + bit_rate=ValueUnit(value=32.0, unit="GT/s"), + max_bandwidth=None, + link_type="XGMI", + links=[], + ), + ), + ], + amdsmitst_data=AmdSmiTstData( + passed_tests=["test1", "test2"], + skipped_tests=[], + failed_tests=[], + failed_test_count=0, + ), ) args = AmdSmiAnalyzerArgs( expected_max_power=550, expected_driver_version="1.2.3", expected_gpu_processes=10, + expected_xgmi_speed=[32.0], ) result = analyzer.analyze_data(data, args)