From 3fecf569f5a39d6cda42be5627f5fea9900b7fc5 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 24 Sep 2025 13:47:34 -0500 Subject: [PATCH 1/9] bad pages --- .../plugins/inband/amdsmi/amdsmi_collector.py | 123 +++++++++++++++++- .../plugins/inband/amdsmi/amdsmidata.py | 23 ++++ 2 files changed, 141 insertions(+), 5 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 54c74d7f..49709b7e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -36,8 +36,10 @@ AmdSmiListItem, AmdSmiStatic, AmdSmiVersion, + BadPages, Fw, FwListItem, + PageData, Partition, PartitionCompute, PartitionMemory, @@ -163,6 +165,7 @@ def _get_handles(self): def _get_amdsmi_data(self) -> AmdSmiDataModel | None: try: version = self._get_amdsmi_version() + bad_pages = self.get_bad_pages() processes = self.get_process() partition = self.get_partition() firmware = self.get_firmware() @@ -182,6 +185,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: try: return AmdSmiDataModel( version=version, + bad_pages=bad_pages, gpu_list=gpu_list, process=processes, partition=partition, @@ -464,13 +468,21 @@ def _smi_try(self, fn, *a, default=None, **kw): try: return fn(*a, **kw) except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] - self.logger.warning(e) + fn_name = getattr(fn, "__name__", str(fn)) + self.logger.warning( + "%s(%s) raised AmdSmiException: %s", + fn_name, + ", ".join(repr(x) for x in a), + e, + ) + code = getattr(e, "ret_code", None) if code is None: try: code = int(e.args[0]) if getattr(e, "args", None) else None except Exception: code = None + CODE2NAME = { 1: "AMDSMI_STATUS_SUCCESS", 2: "AMDSMI_STATUS_NOT_SUPPORTED", @@ -482,25 +494,40 @@ def _smi_try(self, fn, *a, default=None, **kw): } name = CODE2NAME.get(code, "unknown") + common_data = { + "function": fn_name, + "args": [repr(x) for x in a], + "status_name": name, + "status_code": code, + "exception": get_exception_traceback(e), + } + if name in ("AMDSMI_STATUS_NOT_SUPPORTED", "AMDSMI_STATUS_NOT_FOUND"): self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} not supported on this device/mode (status={name}, code={code})", + description=f"{fn_name} not supported on this device/mode (status={name}, code={code})", + data=common_data, priority=EventPriority.WARNING, ) return default + if name == "AMDSMI_STATUS_PERMISSION": self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} permission denied (need access to /dev/kfd & render nodes, or root for RAS). status={name}, code={code})", + description=( + f"{fn_name} permission denied " + f"(need access to /dev/kfd & render nodes, or root for RAS). " + f"status={name}, code={code}" + ), + data=common_data, priority=EventPriority.WARNING, ) return default self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} failed (status={name}, code={code})", - data={"exception": get_exception_traceback(e)}, + description=f"{fn_name} failed (status={name}, code={code})", + data=common_data, priority=EventPriority.WARNING, ) return default @@ -906,6 +933,92 @@ def _fmt(n: int | None) -> str | None: except ValidationError: return None + def get_bad_pages(self) -> list[BadPages] | None: + """ + Collect bad page info per GPU and map to BadPages/PageData models. + + Returns: + List[BadPages] (one item per GPU) or None if no devices. 
+ """ + amdsmi = self._amdsmi_mod() + devices = self._get_handles() + if not devices: + return None + + out: list[BadPages] = [] + + for idx, h in enumerate(devices): + raw = self._smi_try(amdsmi.amdsmi_get_gpu_bad_page_info, h, default=[]) or [] + pages: list[PageData] = [] + + if isinstance(raw, list): + for entry in raw: + if not isinstance(entry, dict): + continue + + pa = entry.get("page_address") + ps = entry.get("page_size") + st = entry.get("status") + val = entry.get("value") + + page_address: int | str + if isinstance(pa, (int, str)): + page_address = pa + else: + page_address = str(pa) + + page_size: int | str + if isinstance(ps, (int, str)): + page_size = ps + else: + page_size = str(ps) + + status = "" if st in (None, "N/A") else str(st) + + value_i: int | None = None + if isinstance(val, int): + value_i = val + elif isinstance(val, str): + s = val.strip() + try: + value_i = int(s, 0) + except Exception: + value_i = None + + try: + pages.append( + PageData( + page_address=page_address, + page_size=page_size, + status=status, + value=value_i, + ) + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build PageData; skipping entry", + data={ + "exception": get_exception_traceback(e), + "gpu_index": idx, + "entry": repr(entry), + }, + priority=EventPriority.WARNING, + ) + continue + + try: + out.append(BadPages(gpu=idx, retired=pages)) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build BadPages", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) + + return out + def collect_data( self, args=None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index ea4b6bcb..6f2d5600 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -413,6 +413,19 @@ class AmdSmiStatic(BaseModel): ) +# PAGES +class PageData(BaseModel): + page_address: int | str + page_size: int | str + status: str + value: int | None + + +class BadPages(BaseModel): + gpu: int + retired: list[PageData] + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. 
@@ -434,6 +447,7 @@ class AmdSmiDataModel(DataModel): partition: Partition | None = None process: list[Processes] | None = Field(default_factory=list) firmware: list[Fw] | None = Field(default_factory=list) + bad_pages: list[BadPages] | None = Field(default_factory=list) static: list[AmdSmiStatic] | None = Field(default_factory=list) def get_list(self, gpu: int) -> AmdSmiListItem | None: @@ -471,3 +485,12 @@ def get_static(self, gpu: int) -> AmdSmiStatic | None: if item.gpu == gpu: return item return None + + def get_bad_pages(self, gpu: int) -> BadPages | None: + """Get the bad pages data for the given gpu id.""" + if self.bad_pages is None: + return None + for item in self.bad_pages: + if item.gpu == gpu: + return item + return None From 03b0528b96060bc1319ff190d4ab172f3d38d099 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 26 Sep 2025 10:43:45 -0500 Subject: [PATCH 2/9] metric updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 575 +++++++++++++++++- .../plugins/inband/amdsmi/amdsmidata.py | 310 ++++++++++ 2 files changed, 871 insertions(+), 14 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 49709b7e..533904b6 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -34,11 +34,24 @@ from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiListItem, + AmdSmiMetric, AmdSmiStatic, AmdSmiVersion, BadPages, Fw, FwListItem, + MetricClockData, + MetricEccTotals, + MetricEnergy, + MetricFan, + MetricMemUsage, + MetricPcie, + MetricPower, + MetricTemperature, + MetricThrottle, + MetricThrottleVu, + MetricUsage, + MetricVoltageCurve, PageData, Partition, PartitionCompute, @@ -171,7 +184,9 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: firmware = self.get_firmware() gpu_list = self.get_gpu_list() statics = self.get_static() + metric = self.get_metric() except Exception as e: + self.logger.error(e) self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi sub commands", @@ -191,6 +206,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: partition=partition, firmware=firmware, static=statics, + metric=metric ) except ValidationError as e: self.logger.warning("Validation err: %s", e) @@ -277,35 +293,54 @@ def get_process(self) -> list[Processes] | None: plist.append(ProcessListItem(process_info=str(entry))) continue - name = entry.get("name", "N/A") + raw_name = entry.get("name", None) + name = ( + None + if (raw_name is None or str(raw_name).strip().upper() == "N/A") + else str(raw_name) + ) + pid_val = entry.get("pid", 0) try: pid = int(pid_val) if pid_val not in (None, "") else 0 except Exception: pid = 0 + # memory_usage block + mu = entry.get("memory_usage") or {} + gtt_mem_vu = self._vu(mu.get("gtt_mem"), "B") + cpu_mem_vu = self._vu(mu.get("cpu_mem"), "B") + vram_mem_vu = self._vu(mu.get("vram_mem"), "B") + + # mem mem_vu = self._vu(entry.get("mem"), "B") + if mem_vu is None and vram_mem_vu is not None: + mem_vu = vram_mem_vu + + if (not mu) and mem_vu is not None and vram_mem_vu is None: + vram_mem_vu = mem_vu - mu = entry.get("memory_usage") or {} mem_usage = ProcessMemoryUsage( - gtt_mem=self._vu(mu.get("gtt_mem"), "B"), - cpu_mem=self._vu(mu.get("cpu_mem"), "B"), - vram_mem=self._vu(mu.get("vram_mem"), "B"), + gtt_mem=gtt_mem_vu, + cpu_mem=cpu_mem_vu, + vram_mem=vram_mem_vu, ) + # engine_usage eu = entry.get("engine_usage") or {} - usage 
= ProcessUsage( - gfx=self._vu(eu.get("gfx"), "ns"), - enc=self._vu(eu.get("enc"), "ns"), - ) + gfx_vu = self._vu(eu.get("gfx"), "ns") or self._vu(0, "ns") + enc_vu = self._vu(eu.get("enc"), "ns") or self._vu(0, "ns") + usage = ProcessUsage(gfx=gfx_vu, enc=enc_vu) - cu_occ = self._vu(entry.get("cu_occupancy"), "") + # CU occupancy, default 0 + cu_raw = entry.get("cu_occupancy", None) + cu_occ = self._vu(cu_raw, "") or self._vu(0, "") try: plist.append( ProcessListItem( process_info=ProcessInfo( - name=str(name), + name=name if name is not None else "N/A", pid=pid, mem=mem_vu, memory_usage=mem_usage, @@ -633,7 +668,7 @@ def get_static(self) -> list[AmdSmiStatic] | None: version=str(vb.get("vbios_version", "")), ) - # NUMA (via KFD) + # NUMA if isinstance(kfd, dict): try: numa_node = int(kfd.get("node_id", 0) or 0) @@ -714,7 +749,7 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: self._log_event( category=EventCategory.APPLICATION, description="amdsmi_get_soc_pstate not exposed by amdsmi build", - priority=EventPriority.INFO, + priority=EventPriority.WARNING, ) return None @@ -768,7 +803,7 @@ def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: self._log_event( category=EventCategory.APPLICATION, description="XGMI PLPD not exposed by this amdsmi build", - priority=EventPriority.INFO, + priority=EventPriority.WARNING, ) return None @@ -1019,6 +1054,518 @@ def get_bad_pages(self) -> list[BadPages] | None: return out + def get_metric(self) -> list[AmdSmiMetric] | None: + amdsmi = self._amdsmi_mod() + devices = self._get_handles() + out: list[AmdSmiMetric] = [] + + def _to_int_or_none(v: object) -> int | None: + n = self._to_number(v) + if n is None: + return None + try: + return int(n) + except Exception: + try: + return int(float(n)) + except Exception: + return None + + def _as_list(v: object) -> list[object]: + if isinstance(v, list): + return v + return ( + [] if v in (None, "N/A") else [v] if not isinstance(v, (dict, tuple, set)) else [] + ) + + for idx, h in enumerate(devices): + raw = self._smi_try(amdsmi.amdsmi_get_gpu_metrics_info, h, default=None) + + if not isinstance(raw, dict): + self._log_event( + category=EventCategory.APPLICATION, + description="amdsmi_get_gpu_metrics_info returned no dict; using empty metric", + data={"gpu_index": idx, "type": type(raw).__name__}, + priority=EventPriority.WARNING, + ) + out.append(self._empty_metric(idx)) + continue + + try: + # Usage + usage = MetricUsage( + gfx_activity=self._vu(raw.get("average_gfx_activity"), "%"), + umc_activity=self._vu(raw.get("average_umc_activity"), "%"), + mm_activity=self._vu(raw.get("average_mm_activity"), "%"), + vcn_activity=[self._vu(v, "%") for v in _as_list(raw.get("vcn_activity"))], + jpeg_activity=[self._vu(v, "%") for v in _as_list(raw.get("jpeg_activity"))], + gfx_busy_inst=None, + jpeg_busy=None, + vcn_busy=None, + ) + + # Power / Energy + power = MetricPower( + socket_power=self._vu(raw.get("average_socket_power"), "W"), + gfx_voltage=self._vu(raw.get("voltage_gfx"), "mV"), + soc_voltage=self._vu(raw.get("voltage_soc"), "mV"), + mem_voltage=self._vu(raw.get("voltage_mem"), "mV"), + throttle_status=( + str(raw.get("throttle_status")) + if raw.get("throttle_status") is not None + else None + ), + power_management=self._nz(raw.get("indep_throttle_status"), default="unknown"), + ) + energy = MetricEnergy( + total_energy_consumption=self._vu(raw.get("energy_accumulator"), "uJ") + ) + + # Temperature + temperature = MetricTemperature( + edge=self._vu(raw.get("temperature_edge"), "C"), + 
hotspot=self._vu(raw.get("temperature_hotspot"), "C"), + mem=self._vu(raw.get("temperature_mem"), "C"), + ) + + # PCIe + speed_raw = self._to_number(raw.get("pcie_link_speed")) + speed_gtps = ( + float(speed_raw) / 10.0 if isinstance(speed_raw, (int, float)) else None + ) + + pcie = MetricPcie( + width=_to_int_or_none(raw.get("pcie_link_width")), + speed=self._vu(speed_gtps, "GT/s"), + bandwidth=self._vu(raw.get("pcie_bandwidth_inst"), "GB/s"), + replay_count=_to_int_or_none(raw.get("pcie_replay_count_acc")), + l0_to_recovery_count=_to_int_or_none(raw.get("pcie_l0_to_recov_count_acc")), + replay_roll_over_count=_to_int_or_none(raw.get("pcie_replay_rover_count_acc")), + nak_sent_count=_to_int_or_none(raw.get("pcie_nak_sent_count_acc")), + nak_received_count=_to_int_or_none(raw.get("pcie_nak_rcvd_count_acc")), + current_bandwidth_sent=None, + current_bandwidth_received=None, + max_packet_size=None, + lc_perf_other_end_recovery=None, + ) + + # Clocks + def _clk(cur_key: str) -> MetricClockData: + return MetricClockData( + clk=self._vu(raw.get(cur_key), "MHz"), + min_clk=None, + max_clk=None, + clk_locked=( + raw.get("gfxclk_lock_status") if cur_key == "current_gfxclk" else None + ), + deep_sleep=None, + ) + + clock: dict[str, MetricClockData] = { + "GFX": _clk("current_gfxclk"), + "SOC": _clk("current_socclk"), + "UCLK": _clk("current_uclk"), + "VCLK0": _clk("current_vclk0"), + "DCLK0": _clk("current_dclk0"), + "VCLK1": _clk("current_vclk1"), + "DCLK1": _clk("current_dclk1"), + } + + # Fan + fan = MetricFan( + rpm=self._vu(raw.get("current_fan_speed"), "RPM"), + speed=None, + max=None, + usage=None, + ) + + # Voltage curve + voltage_curve = self._get_voltage_curve(h) or self._empty_voltage_curve() + + # Memory usage + total_vram_vu: ValueUnit | None = None + used_vram_vu: ValueUnit | None = None + free_vram_vu: ValueUnit | None = None + + vram_usage = self._smi_try(amdsmi.amdsmi_get_gpu_vram_usage, h, default=None) + if isinstance(vram_usage, dict): + used_vram_vu = self._vu(vram_usage.get("vram_used"), "B") + total_vram_vu = self._vu(vram_usage.get("vram_total"), "B") + + mem_enum = getattr(amdsmi, "AmdSmiMemoryType", None) + vis_total_vu: ValueUnit | None = None + gtt_total_vu: ValueUnit | None = None + + if mem_enum is not None: + if total_vram_vu is None: + vram_total_alt = self._smi_try( + amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.VRAM, default=None + ) + if vram_total_alt is not None: + total_vram_vu = self._vu(vram_total_alt, "B") + + vis_total = self._smi_try( + amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.VIS_VRAM, default=None + ) + if vis_total is not None: + vis_total_vu = self._vu(vis_total, "B") + + gtt_total = self._smi_try( + amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.GTT, default=None + ) + if gtt_total is not None: + gtt_total_vu = self._vu(gtt_total, "B") + + # Compute free if possible + if free_vram_vu is None and total_vram_vu is not None and used_vram_vu is not None: + try: + free_num = max(0.0, float(total_vram_vu.value) - float(used_vram_vu.value)) + free_vram_vu = self._vu(free_num, "B") + except Exception: + pass + + # Build mem_usage + mem_usage = MetricMemUsage( + total_vram=total_vram_vu, + used_vram=used_vram_vu, + free_vram=free_vram_vu, + total_visible_vram=vis_total_vu, + used_visible_vram=None, + free_visible_vram=None, + total_gtt=gtt_total_vu, + used_gtt=None, + free_gtt=None, + ) + + # ECC totals + ecc_raw = self._smi_try(amdsmi.amdsmi_get_gpu_total_ecc_count, h, default=None) + if isinstance(ecc_raw, dict): + ecc = MetricEccTotals( + 
total_correctable_count=_to_int_or_none(ecc_raw.get("correctable_count")), + total_uncorrectable_count=_to_int_or_none( + ecc_raw.get("uncorrectable_count") + ), + total_deferred_count=_to_int_or_none(ecc_raw.get("deferred_count")), + cache_correctable_count=None, + cache_uncorrectable_count=None, + ) + else: + ecc = MetricEccTotals( + total_correctable_count=None, + total_uncorrectable_count=None, + total_deferred_count=None, + cache_correctable_count=None, + cache_uncorrectable_count=None, + ) + + # Throttle + throttle = self.get_throttle(h) or MetricThrottle() + + out.append( + AmdSmiMetric( + gpu=idx, + usage=usage, + power=power, + clock=clock, + temperature=temperature, + pcie=pcie, + ecc=ecc, + ecc_blocks={}, + fan=fan, + voltage_curve=voltage_curve, + perf_level=None, + xgmi_err=None, + energy=energy, + mem_usage=mem_usage, + throttle=throttle, + ) + ) + except ValidationError as e: + self.logger.warning(e) + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiMetric; using empty metric", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) + out.append(self._empty_metric(idx)) + + return out + + def _empty_metric(self, gpu_idx: int) -> AmdSmiMetric: + return AmdSmiMetric( + gpu=gpu_idx, + usage=MetricUsage( + gfx_activity=None, + umc_activity=None, + mm_activity=None, + vcn_activity=[], + jpeg_activity=[], + gfx_busy_inst=None, + jpeg_busy=None, + vcn_busy=None, + ), + power=MetricPower( + socket_power=None, + gfx_voltage=None, + soc_voltage=None, + mem_voltage=None, + throttle_status=None, + power_management=None, + ), + clock={}, + temperature=MetricTemperature(edge=None, hotspot=None, mem=None), + pcie=MetricPcie( + width=None, + speed=None, + bandwidth=None, + replay_count=None, + l0_to_recovery_count=None, + replay_roll_over_count=None, + nak_sent_count=None, + nak_received_count=None, + current_bandwidth_sent=None, + current_bandwidth_received=None, + max_packet_size=None, + lc_perf_other_end_recovery=None, + ), + ecc=MetricEccTotals( + total_correctable_count=None, + total_uncorrectable_count=None, + total_deferred_count=None, + cache_correctable_count=None, + cache_uncorrectable_count=None, + ), + ecc_blocks={}, + fan=MetricFan(speed=None, max=None, rpm=None, usage=None), + voltage_curve=self._empty_voltage_curve(), + perf_level=None, + xgmi_err=None, + energy=None, + mem_usage=MetricMemUsage( + total_vram=None, + used_vram=None, + free_vram=None, + total_visible_vram=None, + used_visible_vram=None, + free_visible_vram=None, + total_gtt=None, + used_gtt=None, + free_gtt=None, + ), + throttle=MetricThrottle(), + ) + + def _get_voltage_curve(self, h) -> MetricVoltageCurve: + amdsmi = self._amdsmi_mod() + raw = self._smi_try(amdsmi.amdsmi_get_gpu_od_volt_info, h, default=None) + if not isinstance(raw, dict): + return self._empty_voltage_curve() + + try: + num_regions = int(raw.get("num_regions", 0) or 0) + except Exception: + num_regions = 0 + if num_regions == 0: + return self._empty_voltage_curve() + + curve = raw.get("curve") or {} + pts = curve.get("vc_points") or raw.get("vc_points") or [] + if not isinstance(pts, list) or len(pts) == 0: + return self._empty_voltage_curve() + + def _pt_get(d: object, *names: str) -> object | None: + if not isinstance(d, dict): + return None + for n in names: + if n in d: + return d.get(n) + lower = {str(k).lower(): v for k, v in d.items()} + for n in names: + v = lower.get(n.lower()) + if v is not None: + return v + return None + + def 
_extract_point(p: object) -> tuple[object | None, object | None]: + clk = _pt_get(p, "clk_value", "frequency", "freq", "clk", "sclk") + volt = _pt_get(p, "volt_value", "voltage", "volt", "mV") + return clk, volt + + p0_clk, p0_volt = _extract_point(pts[0]) if len(pts) >= 1 else (None, None) + p1_clk, p1_volt = _extract_point(pts[1]) if len(pts) >= 2 else (None, None) + p2_clk, p2_volt = _extract_point(pts[2]) if len(pts) >= 3 else (None, None) + + return MetricVoltageCurve( + point_0_frequency=self._vu(p0_clk, "MHz"), + point_0_voltage=self._vu(p0_volt, "mV"), + point_1_frequency=self._vu(p1_clk, "MHz"), + point_1_voltage=self._vu(p1_volt, "mV"), + point_2_frequency=self._vu(p2_clk, "MHz"), + point_2_voltage=self._vu(p2_volt, "mV"), + ) + + def _empty_voltage_curve(self) -> MetricVoltageCurve: + return MetricVoltageCurve( + point_0_frequency=None, + point_0_voltage=None, + point_1_frequency=None, + point_1_voltage=None, + point_2_frequency=None, + point_2_voltage=None, + ) + + def _as_first_plane(self, obj) -> list: + """Take a scalar/list/2D-list and return the first plane as a flat list.""" + if isinstance(obj, list): + if obj and isinstance(obj[0], list): # 2D + return obj[0] + return obj + return [] + + def _th_vu_list_pct(self, obj) -> MetricThrottleVu | None: + """Return MetricThrottleVu with % ValueUnits for the first XCP plane.""" + arr = self._as_first_plane(obj) + if not arr: + return None + return MetricThrottleVu( + xcp_0=[self._vu(v, "%") if v not in (None, "N/A") else "N/A" for v in arr] + ) + + def _th_vu_list_raw(self, obj) -> MetricThrottleVu | None: + """Return MetricThrottleVu with raw ints/strings for the first XCP plane.""" + arr = self._as_first_plane(obj) + if not arr: + return None + return MetricThrottleVu( + xcp_0=[ + (int(v) if isinstance(v, (int, float, str)) and str(v).strip().isdigit() else v) + for v in arr + ] + ) + + def get_throttle(self, h) -> MetricThrottle: + amdsmi = self._amdsmi_mod() + raw = self._smi_try(amdsmi.amdsmi_get_violation_status, h, default=None) + if not isinstance(raw, dict): + return MetricThrottle() + + acc_counter = raw.get("acc_counter") + prochot_acc = raw.get("acc_prochot_thrm") + ppt_acc = raw.get("acc_ppt_pwr") + socket_thrm_acc = raw.get("acc_socket_thrm") + vr_thrm_acc = raw.get("acc_vr_thrm") + hbm_thrm_acc = raw.get("acc_hbm_thrm") + + acc_gfx_pwr = raw.get("acc_gfx_clk_below_host_limit_pwr") + acc_gfx_thm = raw.get("acc_gfx_clk_below_host_limit_thm") + acc_low_util = raw.get("acc_low_utilization") + acc_gfx_total = raw.get("acc_gfx_clk_below_host_limit_total") + + act_prochot = raw.get("active_prochot_thrm") + act_ppt = raw.get("active_ppt_pwr") + act_socket = raw.get("active_socket_thrm") + act_vr = raw.get("active_vr_thrm") + act_hbm = raw.get("active_hbm_thrm") + act_gfx_pwr = raw.get("active_gfx_clk_below_host_limit_pwr") + act_gfx_thm = raw.get("active_gfx_clk_below_host_limit_thm") + act_low_util = raw.get("active_low_utilization") + act_gfx_total = raw.get("active_gfx_clk_below_host_limit_total") + + per_prochot = raw.get("per_prochot_thrm") + per_ppt = raw.get("per_ppt_pwr") + per_socket = raw.get("per_socket_thrm") + per_vr = raw.get("per_vr_thrm") + per_hbm = raw.get("per_hbm_thrm") + per_gfx_pwr = raw.get("per_gfx_clk_below_host_limit_pwr") + per_gfx_thm = raw.get("per_gfx_clk_below_host_limit_thm") + per_low_util = raw.get("per_low_utilization") + per_gfx_total = raw.get("per_gfx_clk_below_host_limit_total") + + return MetricThrottle( + accumulation_counter=self._vu(acc_counter, ""), # unitless counter + 
prochot_accumulated=self._th_vu_list_raw(prochot_acc), + ppt_accumulated=self._th_vu_list_raw(ppt_acc), + socket_thermal_accumulated=self._th_vu_list_raw(socket_thrm_acc), + vr_thermal_accumulated=self._th_vu_list_raw(vr_thrm_acc), + hbm_thermal_accumulated=self._th_vu_list_raw(hbm_thrm_acc), + gfx_clk_below_host_limit_power_accumulated=self._th_vu_list_raw(acc_gfx_pwr), + gfx_clk_below_host_limit_thermal_accumulated=self._th_vu_list_raw(acc_gfx_thm), + low_utilization_accumulated=self._th_vu_list_raw(acc_low_util), + total_gfx_clk_below_host_limit_accumulated=self._th_vu_list_raw(acc_gfx_total), + prochot_violation_status=self._th_vu_list_raw(act_prochot), + ppt_violation_status=self._th_vu_list_raw(act_ppt), + socket_thermal_violation_status=self._th_vu_list_raw(act_socket), + vr_thermal_violation_status=self._th_vu_list_raw(act_vr), + hbm_thermal_violation_status=self._th_vu_list_raw(act_hbm), + gfx_clk_below_host_limit_power_violation_status=self._th_vu_list_raw(act_gfx_pwr), + gfx_clk_below_host_limit_thermal_violation_status=self._th_vu_list_raw(act_gfx_thm), + low_utilization_violation_status=self._th_vu_list_raw(act_low_util), + total_gfx_clk_below_host_limit_violation_status=self._th_vu_list_raw(act_gfx_total), + prochot_violation_activity=self._vu(per_prochot, "%"), + ppt_violation_activity=self._vu(per_ppt, "%"), + socket_thermal_violation_activity=self._vu(per_socket, "%"), + vr_thermal_violation_activity=self._vu(per_vr, "%"), + hbm_thermal_violation_activity=self._vu(per_hbm, "%"), + gfx_clk_below_host_limit_power_violation_activity=self._th_vu_list_pct(per_gfx_pwr), + gfx_clk_below_host_limit_thermal_violation_activity=self._th_vu_list_pct(per_gfx_thm), + low_utilization_violation_activity=self._th_vu_list_pct(per_low_util), + total_gfx_clk_below_host_limit_violation_activity=self._th_vu_list_pct(per_gfx_total), + ) + + def _flatten_2d(self, v: object) -> list[object]: + if isinstance(v, list) and v and isinstance(v[0], list): + out: list[object] = [] + for row in v: + if isinstance(row, list): + out.extend(row) + else: + out.append(row) + return out + return v if isinstance(v, list) else [v] if v not in (None, "N/A") else [] + + def _coerce_throttle_value( + self, v: object, unit: str = "" + ) -> MetricThrottleVu | ValueUnit | None: + """ + Convert ints/floats/strings/lists/2D-lists/dicts into: + - ValueUnit + - MetricThrottleVu(xcp_0=[...]) + - None for N/A/empty + """ + if v in (None, "", "N/A"): + return None + + if isinstance(v, (int, float)): + return ValueUnit(value=v, unit=unit) + if isinstance(v, str): + s = v.strip() + if not s or s.upper() == "N/A": + return None + try: + return ValueUnit(value=int(s, 0), unit=unit) + except Exception: + try: + return ValueUnit(value=float(s), unit=unit) + except Exception: + return MetricThrottleVu(xcp_0=[s]) + + if isinstance(v, list): + flat = self._flatten_2d(v) + return MetricThrottleVu(xcp_0=flat if flat else None) + + if isinstance(v, dict): + if "xcp_0" in v and isinstance(v["xcp_0"], list): + return MetricThrottleVu(xcp_0=self._flatten_2d(v["xcp_0"])) + val = v.get("value") + if isinstance(val, dict): + for maybe_list in val.values(): + if isinstance(maybe_list, list): + return MetricThrottleVu(xcp_0=self._flatten_2d(maybe_list)) + return MetricThrottleVu(xcp_0=[str(v)]) + + return MetricThrottleVu(xcp_0=[str(v)]) + def collect_data( self, args=None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 6f2d5600..ba6a2eef 100644 --- 
a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -426,6 +426,315 @@ class BadPages(BaseModel): retired: list[PageData] +# Metric Data +class MetricUsage(BaseModel): + gfx_activity: ValueUnit | None + umc_activity: ValueUnit | None + mm_activity: ValueUnit | None + vcn_activity: list[ValueUnit | str | None] + jpeg_activity: list[ValueUnit | str | None] + gfx_busy_inst: dict[str, list[ValueUnit | str | None]] | None + jpeg_busy: dict[str, list[ValueUnit | str | None]] | None + vcn_busy: dict[str, list[ValueUnit | str | None]] | None + na_validator_list = field_validator("vcn_activity", "jpeg_activity", mode="before")( + na_to_none_list + ) + na_validator = field_validator( + "gfx_activity", + "umc_activity", + "mm_activity", + "gfx_busy_inst", + "jpeg_busy", + "vcn_busy", + mode="before", + )(na_to_none) + + +class MetricPower(BaseModel): + socket_power: ValueUnit | None + gfx_voltage: ValueUnit | None + soc_voltage: ValueUnit | None + mem_voltage: ValueUnit | None + throttle_status: str | None + power_management: str | None + na_validator = field_validator( + "socket_power", + "gfx_voltage", + "soc_voltage", + "mem_voltage", + "throttle_status", + "power_management", + mode="before", + )(na_to_none) + + +class MetricClockData(BaseModel): + clk: ValueUnit | None + min_clk: ValueUnit | None + max_clk: ValueUnit | None + clk_locked: int | str | dict | None + deep_sleep: int | str | dict | None + na_validator = field_validator( + "clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before" + )(na_to_none) + + +class MetricTemperature(BaseModel): + edge: ValueUnit | None + hotspot: ValueUnit | None + mem: ValueUnit | None + na_validator = field_validator("edge", "hotspot", "mem", mode="before")(na_to_none) + + +class MetricPcie(BaseModel): + width: int | None + speed: ValueUnit | None + bandwidth: ValueUnit | None + replay_count: int | None + l0_to_recovery_count: int | None + replay_roll_over_count: int | None + nak_sent_count: int | None + nak_received_count: int | None + current_bandwidth_sent: int | None + current_bandwidth_received: int | None + max_packet_size: int | None + lc_perf_other_end_recovery: int | None + na_validator = field_validator( + "width", + "speed", + "bandwidth", + "replay_count", + "l0_to_recovery_count", + "replay_roll_over_count", + "nak_sent_count", + "nak_received_count", + "current_bandwidth_sent", + "current_bandwidth_received", + "max_packet_size", + "lc_perf_other_end_recovery", + mode="before", + )(na_to_none) + + +class MetricEccTotals(BaseModel): + total_correctable_count: int | None + total_uncorrectable_count: int | None + total_deferred_count: int | None + cache_correctable_count: int | None + cache_uncorrectable_count: int | None + na_validator = field_validator( + "total_correctable_count", + "total_uncorrectable_count", + "total_deferred_count", + "cache_correctable_count", + "cache_uncorrectable_count", + mode="before", + )(na_to_none) + + +class MetricErrorCounts(BaseModel): + correctable_count: str | None + uncorrectable_count: str | None + deferred_count: str | None + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class MetricFan(BaseModel): + speed: ValueUnit | None + max: ValueUnit | None + rpm: ValueUnit | None + usage: ValueUnit | None + na_validator = field_validator("speed", "max", "rpm", "usage", mode="before")(na_to_none) + + +class MetricVoltageCurve(BaseModel): + point_0_frequency: 
ValueUnit | None + point_0_voltage: ValueUnit | None + point_1_frequency: ValueUnit | None + point_1_voltage: ValueUnit | None + point_2_frequency: ValueUnit | None + point_2_voltage: ValueUnit | None + + na_validator = field_validator( + "point_0_frequency", + "point_0_voltage", + "point_1_frequency", + "point_1_voltage", + "point_2_frequency", + "point_2_voltage", + mode="before", + )(na_to_none) + + +class MetricEnergy(BaseModel): + total_energy_consumption: ValueUnit | None + na_validator = field_validator("total_energy_consumption", mode="before")(na_to_none) + + +class MetricMemUsage(BaseModel): + total_vram: ValueUnit | None + used_vram: ValueUnit | None + free_vram: ValueUnit | None + total_visible_vram: ValueUnit | None + used_visible_vram: ValueUnit | None + free_visible_vram: ValueUnit | None + total_gtt: ValueUnit | None + used_gtt: ValueUnit | None + free_gtt: ValueUnit | None + na_validator = field_validator( + "total_vram", + "used_vram", + "free_vram", + "total_visible_vram", + "used_visible_vram", + "free_visible_vram", + "total_gtt", + "used_gtt", + "free_gtt", + mode="before", + )(na_to_none) + + +class MetricThrottleVu(BaseModel): + xcp_0: list[ValueUnit | str | None] = None + # Deprecated below + value: dict[str, list[int | str]] | None = Field(deprecated=True, default=None) + unit: str = Field(deprecated=True, default="") + + +class MetricThrottle(AmdSmiBaseModel): + # At some point in time these changed from being int -> ValueUnit + + accumulation_counter: MetricThrottleVu | ValueUnit | None = None + + gfx_clk_below_host_limit_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_violation_accumulated: MetricThrottleVu | ValueUnit | None = ( + None + ) + gfx_clk_below_host_limit_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + + hbm_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_accumulated: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_activity: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_status: MetricThrottleVu | ValueUnit | None = None + ppt_accumulated: MetricThrottleVu | ValueUnit | None = None + ppt_violation_activity: MetricThrottleVu | ValueUnit | None = None + ppt_violation_status: MetricThrottleVu | ValueUnit | None = None + prochot_accumulated: MetricThrottleVu | ValueUnit | None = None + prochot_violation_activity: MetricThrottleVu | ValueUnit | None = None + prochot_violation_status: MetricThrottleVu | ValueUnit | None = None + socket_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + 
socket_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + socket_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + vr_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + vr_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + vr_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + + total_gfx_clk_below_host_limit_accumulated: MetricThrottleVu | ValueUnit | None = None + low_utilization_accumulated: MetricThrottleVu | ValueUnit | None = None + total_gfx_clk_below_host_limit_violation_status: MetricThrottleVu | ValueUnit | None = None + total_gfx_clk_below_host_limit_violation_activity: MetricThrottleVu | ValueUnit | None = None + + na_validator = field_validator( + "accumulation_counter", + "gfx_clk_below_host_limit_accumulated", + "gfx_clk_below_host_limit_power_accumulated", + "gfx_clk_below_host_limit_power_violation_activity", + "gfx_clk_below_host_limit_power_violation_status", + "gfx_clk_below_host_limit_violation_activity", + "gfx_clk_below_host_limit_violation_accumulated", + "gfx_clk_below_host_limit_violation_status", + "gfx_clk_below_host_limit_thermal_violation_accumulated", + "gfx_clk_below_host_limit_thermal_violation_activity", + "gfx_clk_below_host_limit_thermal_violation_status", + "gfx_clk_below_host_limit_thermal_accumulated", + "hbm_thermal_accumulated", + "hbm_thermal_violation_activity", + "hbm_thermal_violation_status", + "low_utilization_violation_accumulated", + "low_utilization_violation_activity", + "low_utilization_violation_status", + "ppt_accumulated", + "ppt_violation_activity", + "ppt_violation_status", + "prochot_accumulated", + "prochot_violation_activity", + "prochot_violation_status", + "socket_thermal_accumulated", + "socket_thermal_violation_activity", + "socket_thermal_violation_status", + "vr_thermal_accumulated", + "vr_thermal_violation_activity", + "vr_thermal_violation_status", + "total_gfx_clk_below_host_limit_accumulated", + "low_utilization_accumulated", + "total_gfx_clk_below_host_limit_violation_status", + "total_gfx_clk_below_host_limit_violation_activity", + mode="before", + )(na_to_none) + + +class EccData(BaseModel): + "ECC counts collected per ecc block" + + correctable_count: int | None = 0 + uncorrectable_count: int | None = 0 + deferred_count: int | None = 0 + + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class AmdSmiMetric(BaseModel): + gpu: int + usage: MetricUsage + power: MetricPower + clock: dict[str, MetricClockData] + temperature: MetricTemperature + pcie: MetricPcie + ecc: MetricEccTotals + ecc_blocks: dict[str, EccData] | str + fan: MetricFan + voltage_curve: MetricVoltageCurve | None + perf_level: str | dict | None + xgmi_err: str | dict | None + energy: MetricEnergy | None + mem_usage: MetricMemUsage + throttle: MetricThrottle + + na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) + + @field_validator("ecc_blocks", mode="before") + @classmethod + def validate_ecc_blocks(cls, value: dict[str, EccData] | str) -> dict[str, EccData]: + """Validate the ecc_blocks field.""" + if isinstance(value, str): + # If it's a string, we assume it's "N/A" and return an empty dict + return {} + return value + + @field_validator("energy", mode="before") + @classmethod + def validate_energy(cls, value: Any | None) -> MetricEnergy | None: + """Validate the energy field.""" + if value == "N/A" or value is None: + return None + return 
value + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. @@ -449,6 +758,7 @@ class AmdSmiDataModel(DataModel): firmware: list[Fw] | None = Field(default_factory=list) bad_pages: list[BadPages] | None = Field(default_factory=list) static: list[AmdSmiStatic] | None = Field(default_factory=list) + metric: list[AmdSmiMetric] | None = Field(default_factory=list) def get_list(self, gpu: int) -> AmdSmiListItem | None: """Get the gpu list item for the given gpu id.""" From a9a4ae31fe5892223b31fdf07de15f4736517672 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 12 Nov 2025 11:58:38 -0600 Subject: [PATCH 3/9] filled in more gaps + typehint + docstring --- .../plugins/inband/amdsmi/amdsmi_collector.py | 358 +++++++++++++++--- 1 file changed, 304 insertions(+), 54 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 291f1949..a29ff1e7 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -68,6 +68,7 @@ StaticClockData, StaticDriver, StaticFrequencyLevels, + StaticLimit, StaticNuma, StaticPolicy, StaticSocPstate, @@ -776,12 +777,23 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: except Exception: vram_size_b = None + # Calculate VRAM max bandwidth if possible + max_bandwidth = None + if vram_bits and kfd.get("memory_max_frequency"): + try: + mem_freq_mhz = float(kfd["memory_max_frequency"]) + # Bandwidth (GB/s) = (bit_width * frequency_MHz) / 8000 # TODO: confirm formula (a DDR data-rate multiplier may be needed) + bandwidth_gbs = (float(vram_bits) * mem_freq_mhz) / 8000.0 + max_bandwidth = self._valueunit(bandwidth_gbs, "GB/s") + except Exception: + pass + vram_model = StaticVram( type=vram_type, vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor), size=self._valueunit(vram_size_b, "B"), bit_width=self._valueunit(vram_bits, "bit"), - max_bandwidth=None, + max_bandwidth=max_bandwidth, ) soc_pstate_model = self._get_soc_pstate(h) @@ -796,7 +808,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: asic=asic_model, bus=bus, vbios=vbios_model, - limit=None, + limit=self._get_limit_info(h), driver=driver_model, board=board_model, soc_pstate=soc_pstate_model, @@ -805,7 +817,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: numa=numa_model, vram=vram_model, cache_info=cache_info_model, - partition=None, + partition=None, # TODO: partition info is not populated here yet + clock=clock_model, ) ) @@ -821,13 +833,13 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: return out def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: - """SOC pstate check + """Get SOC P-state (performance state) policy information for a GPU device. Args: handle (Any): GPU device handle Returns: - Optional[StaticSocPstate]: StaticSocPstate instance or None + Optional[StaticSocPstate]: SOC P-state policy data or None if unavailable """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_soc_pstate", None) @@ -883,13 +895,13 @@ def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: return None def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: - """Check XGMI plpd + """Get XGMI Per-Link Power Down (PLPD) policy for a GPU device.
Args: handle (Any): GPU device handle Returns: - Optional[StaticXgmiPlpd]: StaticXgmiPlpd instance or None + Optional[StaticXgmiPlpd]: XGMI PLPD policy data or None if unavailable """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_xgmi_plpd", None) @@ -945,13 +957,13 @@ def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: return None def _get_cache_info(self, handle: Any) -> list[StaticCacheInfoItem]: - """Check cache info + """Get GPU cache hierarchy information (L1, L2, L3, etc.). Args: handle (Any): GPU device handle Returns: - list[StaticCacheInfoItem]: list of StaticCacheInfoItem instances + list[StaticCacheInfoItem]: List of cache info items for each cache level """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, handle, default=None) @@ -1007,8 +1019,38 @@ def _as_list_str(v: Any) -> list[str]: return out + def _get_limit_info(self, handle: Any) -> Optional[StaticLimit]: + """Get power cap and temperature limit information. + + Args: + handle (Any): GPU device handle + + Returns: + Optional[StaticLimit]: StaticLimit instance or None + """ + amdsmi = self._amdsmi_mod() + fn = getattr(amdsmi, "amdsmi_get_power_cap_info", None) + if not callable(fn): + return None + + data = self._smi_try(fn, handle, default=None) + if not isinstance(data, dict): + return None + + return StaticLimit( + max_power=self._valueunit(data.get("power_cap"), "W"), + min_power=self._valueunit(data.get("min_power_cap"), "W"), + socket_power=self._valueunit(data.get("default_power_cap"), "W"), + slowdown_edge_temperature=self._valueunit(data.get("slowdown_temp"), "C"), + slowdown_hotspot_temperature=self._valueunit(data.get("slowdown_mem_temp"), "C"), + slowdown_vram_temperature=self._valueunit(data.get("slowdown_vram_temp"), "C"), + shutdown_edge_temperature=self._valueunit(data.get("shutdown_temp"), "C"), + shutdown_hotspot_temperature=self._valueunit(data.get("shutdown_mem_temp"), "C"), + shutdown_vram_temperature=self._valueunit(data.get("shutdown_vram_temp"), "C"), + ) + def _get_clock(self, handle: Any) -> Optional[StaticClockData]: - """Get clock info + """Get clock info using amdsmi_get_clock_info or fallback to amdsmi_get_clk_freq Args: handle (Any): GPU device handle @@ -1017,9 +1059,23 @@ def _get_clock(self, handle: Any) -> Optional[StaticClockData]: Optional[StaticClockData]: StaticClockData instance or None """ amdsmi = self._amdsmi_mod() - fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) clk_type = getattr(amdsmi, "AmdSmiClkType", None) - if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): + + if clk_type is None or not hasattr(clk_type, "SYS"): + return None + + # Try amdsmi_get_clock_info API first + clock_info_fn = getattr(amdsmi, "amdsmi_get_clock_info", None) + if callable(clock_info_fn): + data = self._smi_try(clock_info_fn, handle, clk_type.SYS, default=None) + if isinstance(data, dict): + freqs_raw = data.get("clk_freq") or data.get("frequency") + if isinstance(freqs_raw, list) and freqs_raw: + return self._process_clock_data(data, freqs_raw) + + # Fallback to amdsmi_get_clk_freq API + fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) + if not callable(fn): return None data = self._smi_try(fn, handle, clk_type.SYS, default=None) @@ -1030,6 +1086,19 @@ def _get_clock(self, handle: Any) -> Optional[StaticClockData]: if not isinstance(freqs_raw, list) or not freqs_raw: return None + return self._process_clock_data(data, freqs_raw) + + def _process_clock_data(self, data: dict, freqs_raw: list) -> 
Optional[StaticClockData]: + """Process clock frequency data into StaticClockData model. + + Args: + data (dict): Raw clock data from amdsmi API + freqs_raw (list): List of frequency values + + Returns: + Optional[StaticClockData]: StaticClockData instance or None + """ + def _to_mhz(v: object) -> Optional[int]: x = self._to_number(v) if x is None: @@ -1079,11 +1148,10 @@ def _fmt(n: Optional[int]) -> Optional[str]: return None def get_bad_pages(self) -> Optional[list[BadPages]]: - """ - Collect bad page info per GPU and map to BadPages/PageData models. + """Collect bad page info per GPU and map to BadPages/PageData models. Returns: - List[BadPages] (one item per GPU) or None if no devices. + Optional[list[BadPages]]: List of bad pages (one per GPU) or None if no devices """ amdsmi = self._amdsmi_mod() devices = self._get_handles() @@ -1165,6 +1233,14 @@ def get_bad_pages(self) -> Optional[list[BadPages]]: return out def get_metric(self) -> Optional[list[AmdSmiMetric]]: + """Collect runtime metric data from all GPU devices. + + Collects usage, power, temperature, clocks, PCIe, fan, memory, ECC, + throttle, and voltage curve data from amdsmi_get_gpu_metrics_info. + + Returns: + Optional[list[AmdSmiMetric]]: List of metric data per GPU or None if no devices + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[AmdSmiMetric] = [] @@ -1213,14 +1289,29 @@ def _as_list(v: object) -> list[object]: jpeg_activity=[ self._valueunit(v, "%") for v in _as_list(raw.get("jpeg_activity")) ], - gfx_busy_inst=None, - jpeg_busy=None, - vcn_busy=None, + gfx_busy_inst=None, # Note: not available? + jpeg_busy=None, # Note: not available? + vcn_busy=None, # Note: not available? ) # Power / Energy + # Get power from metrics_info + socket_power_val = self._valueunit(raw.get("average_socket_power"), "W") + + # Fall back to amdsmi_get_power_info if it is available and the metrics value is missing + if socket_power_val is None: + power_info_fn = getattr(amdsmi, "amdsmi_get_power_info", None) + if callable(power_info_fn): + power_data = self._smi_try(power_info_fn, h, default=None) + if isinstance(power_data, dict): + socket_power_val = self._valueunit( + power_data.get("current_socket_power") + or power_data.get("average_socket_power"), + "W", + ) + power = MetricPower( - socket_power=self._valueunit(raw.get("average_socket_power"), "W"), + socket_power=socket_power_val, gfx_voltage=self._valueunit(raw.get("voltage_gfx"), "mV"), soc_voltage=self._valueunit(raw.get("voltage_soc"), "mV"), mem_voltage=self._valueunit(raw.get("voltage_mem"), "mV"), @@ -1250,6 +1341,16 @@ def _as_list(v: object) -> list[object]: float(speed_raw) / 10.0 if isinstance(speed_raw, (int, float)) else None ) + # Get PCIe throughput + throughput_fn = getattr(amdsmi, "amdsmi_get_gpu_pci_throughput", None) + bandwidth_sent = None + bandwidth_received = None + if callable(throughput_fn): + throughput_data = self._smi_try(throughput_fn, h, default=None) + if isinstance(throughput_data, dict): + bandwidth_sent = _to_int_or_none(throughput_data.get("sent")) + bandwidth_received = _to_int_or_none(throughput_data.get("received")) + pcie = MetricPcie( width=_to_int_or_none(raw.get("pcie_link_width")), speed=self._valueunit(speed_gtps, "GT/s"), @@ -1259,39 +1360,88 @@ def _as_list(v: object) -> list[object]: replay_roll_over_count=_to_int_or_none(raw.get("pcie_replay_rover_count_acc")), nak_sent_count=_to_int_or_none(raw.get("pcie_nak_sent_count_acc")), nak_received_count=_to_int_or_none(raw.get("pcie_nak_rcvd_count_acc")), - current_bandwidth_sent=None, -
current_bandwidth_received=None, + current_bandwidth_sent=bandwidth_sent, + current_bandwidth_received=bandwidth_received, max_packet_size=None, lc_perf_other_end_recovery=None, ) - # Clocks - def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: + # Clocks from clock_info API + clock_info_fn = getattr(amdsmi, "amdsmi_get_clock_info", None) + clk_type = getattr(amdsmi, "AmdSmiClkType", None) + clock_ranges = {} + + if callable(clock_info_fn) and clk_type is not None: + for clk_name, clk_enum_name in [ + ("GFX", "GFX"), + ("SOC", "SYS"), + ("UCLK", "MEM"), + ("VCLK0", "VCLK0"), + ("DCLK0", "DCLK0"), + ("VCLK1", "VCLK1"), + ("DCLK1", "DCLK1"), + ]: + clk_enum = getattr(clk_type, clk_enum_name, None) + if clk_enum is not None: + clk_data = self._smi_try(clock_info_fn, h, clk_enum, default=None) + if isinstance(clk_data, dict): + clock_ranges[clk_name] = { + "min": clk_data.get("min_clk"), + "max": clk_data.get("max_clk"), + "sleep": clk_data.get("sleep_clk") + or clk_data.get("deep_sleep_clk"), + } + + def _clk( + cur_key: str, + clk_name: str = "", + raw: dict = raw, + clock_ranges: dict = clock_ranges, + ) -> MetricClockData: + ranges = clock_ranges.get(clk_name, {}) return MetricClockData( clk=self._valueunit(raw.get(cur_key), "MHz"), - min_clk=None, - max_clk=None, + min_clk=self._valueunit(ranges.get("min"), "MHz") if ranges else None, + max_clk=self._valueunit(ranges.get("max"), "MHz") if ranges else None, clk_locked=( raw.get("gfxclk_lock_status") if cur_key == "current_gfxclk" else None ), - deep_sleep=None, + deep_sleep=ranges.get("sleep") if ranges else None, ) clock: dict[str, MetricClockData] = { - "GFX": _clk("current_gfxclk"), - "SOC": _clk("current_socclk"), - "UCLK": _clk("current_uclk"), - "VCLK0": _clk("current_vclk0"), - "DCLK0": _clk("current_dclk0"), - "VCLK1": _clk("current_vclk1"), - "DCLK1": _clk("current_dclk1"), + "GFX": _clk("current_gfxclk", "GFX"), + "SOC": _clk("current_socclk", "SOC"), + "UCLK": _clk("current_uclk", "UCLK"), + "VCLK0": _clk("current_vclk0", "VCLK0"), + "DCLK0": _clk("current_dclk0", "DCLK0"), + "VCLK1": _clk("current_vclk1", "VCLK1"), + "DCLK1": _clk("current_dclk1", "DCLK1"), } # Fan + fan_rpm = self._valueunit(raw.get("current_fan_speed"), "RPM") + + # Get fan speed as percentage + fan_speed_fn = getattr(amdsmi, "amdsmi_get_gpu_fan_speed", None) + fan_speed_pct = None + if callable(fan_speed_fn): + fan_speed_data = self._smi_try(fan_speed_fn, h, 0, default=None) + if isinstance(fan_speed_data, (int, float)): + fan_speed_pct = self._valueunit(fan_speed_data, "%") + + # Get max fan speed + fan_max_fn = getattr(amdsmi, "amdsmi_get_gpu_fan_speed_max", None) + fan_max_rpm = None + if callable(fan_max_fn): + fan_max_data = self._smi_try(fan_max_fn, h, 0, default=None) + if isinstance(fan_max_data, (int, float)): + fan_max_rpm = self._valueunit(fan_max_data, "RPM") + fan = MetricFan( - rpm=self._valueunit(raw.get("current_fan_speed"), "RPM"), - speed=None, - max=None, + rpm=fan_rpm, + speed=fan_speed_pct, + max=fan_max_rpm, usage=None, ) @@ -1310,7 +1460,13 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: mem_enum = getattr(amdsmi, "AmdSmiMemoryType", None) vis_total_vu: Optional[ValueUnit] = None + vis_used_vu: Optional[ValueUnit] = None + vis_free_vu: Optional[ValueUnit] = None gtt_total_vu: Optional[ValueUnit] = None + gtt_used_vu: Optional[ValueUnit] = None + gtt_free_vu: Optional[ValueUnit] = None + + mem_usage_fn = getattr(amdsmi, "amdsmi_get_gpu_memory_usage", None) if mem_enum is not None: if total_vram_vu is None: @@ 
-1320,19 +1476,47 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: if vram_total_alt is not None: total_vram_vu = self._valueunit(vram_total_alt, "B") + # Visible VRAM total and usage vis_total = self._smi_try( amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.VIS_VRAM, default=None ) if vis_total is not None: vis_total_vu = self._valueunit(vis_total, "B") + # Get visible VRAM usage + if callable(mem_usage_fn): + vis_used = self._smi_try( + mem_usage_fn, h, mem_enum.VIS_VRAM, default=None + ) + if vis_used is not None: + vis_used_vu = self._valueunit(vis_used, "B") + # Calculate free + try: + free_val = max(0.0, float(vis_total) - float(vis_used)) + vis_free_vu = self._valueunit(free_val, "B") + except Exception: + pass + + # GTT total and usage gtt_total = self._smi_try( amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.GTT, default=None ) if gtt_total is not None: gtt_total_vu = self._valueunit(gtt_total, "B") - # Compute free if possible + # Get GTT usage + if callable(mem_usage_fn): + gtt_used = self._smi_try(mem_usage_fn, h, mem_enum.GTT, default=None) + if gtt_used is not None: + gtt_used_vu = self._valueunit(gtt_used, "B") + # Calculate free + try: + free_val = max(0.0, float(gtt_total) - float(gtt_used)) + gtt_free_vu = self._valueunit(free_val, "B") + except Exception: + pass + + # Compute free VRAM if possible if free_vram_vu is None and total_vram_vu is not None and used_vram_vu is not None: try: free_num = max(0.0, float(total_vram_vu.value) - float(used_vram_vu.value)) @@ -1346,11 +1530,11 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: used_vram=used_vram_vu, free_vram=free_vram_vu, total_visible_vram=vis_total_vu, - used_visible_vram=None, - free_visible_vram=None, + used_visible_vram=vis_used_vu, + free_visible_vram=vis_free_vu, total_gtt=gtt_total_vu, - used_gtt=None, - free_gtt=None, + used_gtt=gtt_used_vu, + free_gtt=gtt_free_vu, ) # ECC totals @@ -1409,6 +1593,14 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: return out def _empty_metric(self, gpu_idx: int) -> AmdSmiMetric: + """Create an empty/default AmdSmiMetric instance when data collection fails. + + Args: + gpu_idx (int): GPU index + + Returns: + AmdSmiMetric: Metric instance with all fields set to None or empty values + """ return AmdSmiMetric( gpu=gpu_idx, usage=MetricUsage( @@ -1472,7 +1664,15 @@ def _empty_metric(self, gpu_idx: int) -> AmdSmiMetric: throttle=MetricThrottle(), ) - def _get_voltage_curve(self, h) -> MetricVoltageCurve: + def _get_voltage_curve(self, h: Any) -> MetricVoltageCurve: + """Get GPU voltage curve (frequency/voltage points) for overdrive settings. + + Args: + h (Any): GPU device handle + + Returns: + MetricVoltageCurve: Voltage curve data with up to 3 frequency/voltage points + """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_gpu_od_volt_info, h, default=None) if not isinstance(raw, dict): @@ -1522,6 +1722,11 @@ def _extract_point(p: object) -> tuple[Optional[object], Optional[object]]: ) def _empty_voltage_curve(self) -> MetricVoltageCurve: + """Create an empty MetricVoltageCurve with all points set to None. 
+ + Returns: + MetricVoltageCurve: Empty voltage curve instance + """ return MetricVoltageCurve( point_0_frequency=None, point_0_voltage=None, @@ -1531,16 +1736,30 @@ def _empty_voltage_curve(self) -> MetricVoltageCurve: point_2_voltage=None, ) - def _as_first_plane(self, obj) -> list: - """Take a scalar/list/2D-list and return the first plane as a flat list.""" + def _as_first_plane(self, obj: object) -> list: + """Take a scalar/list/2D-list and return the first plane as a flat list. + + Args: + obj (object): Scalar, list, or 2D-list to process + + Returns: + list: First plane as a flat list, or empty list if not a list + """ if isinstance(obj, list): if obj and isinstance(obj[0], list): # 2D return obj[0] return obj return [] - def _th_vu_list_pct(self, obj) -> Optional[MetricThrottleVu]: - """Return MetricThrottleVu with % ValueUnits for the first XCP plane.""" + def _th_vu_list_pct(self, obj: object) -> Optional[MetricThrottleVu]: + """Return MetricThrottleVu with percentage ValueUnits for the first XCP plane. + + Args: + obj (object): Object containing throttle data (scalar, list, or 2D-list) + + Returns: + Optional[MetricThrottleVu]: MetricThrottleVu with percentage values or None + """ arr = self._as_first_plane(obj) if not arr: return None @@ -1548,8 +1767,15 @@ def _th_vu_list_pct(self, obj) -> Optional[MetricThrottleVu]: xcp_0=[self._valueunit(v, "%") if v not in (None, "N/A") else "N/A" for v in arr] ) - def _th_vu_list_raw(self, obj) -> Optional[MetricThrottleVu]: - """Return MetricThrottleVu with raw ints/strings for the first XCP plane.""" + def _th_vu_list_raw(self, obj: object) -> Optional[MetricThrottleVu]: + """Return MetricThrottleVu with raw integers/strings for the first XCP plane. + + Args: + obj (object): Object containing throttle data (scalar, list, or 2D-list) + + Returns: + Optional[MetricThrottleVu]: MetricThrottleVu with raw values or None + """ arr = self._as_first_plane(obj) if not arr: return None @@ -1564,7 +1790,15 @@ def _th_vu_list_raw(self, obj) -> Optional[MetricThrottleVu]: ] ) - def get_throttle(self, h) -> MetricThrottle: + def get_throttle(self, h: Any) -> MetricThrottle: + """Get throttle/violation status data for a GPU device. + + Args: + h (Any): GPU device handle + + Returns: + MetricThrottle: Throttle metrics and violation status data + """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_violation_status, h, default=None) if not isinstance(raw, dict): @@ -1634,6 +1868,14 @@ def get_throttle(self, h) -> MetricThrottle: ) def _flatten_2d(self, v: object) -> list[object]: + """Flatten a 2D list into a 1D list, or normalize scalars/None to lists. + + Args: + v (object): Input value (scalar, list, or 2D-list) + + Returns: + list[object]: Flattened list of objects + """ if isinstance(v, list) and v and isinstance(v[0], list): out: list[object] = [] for row in v: @@ -1647,11 +1889,19 @@ def _flatten_2d(self, v: object) -> list[object]: def _coerce_throttle_value( self, v: object, unit: str = "" ) -> Optional[Union[MetricThrottleVu, ValueUnit]]: - """ - Convert ints/floats/strings/lists/2D-lists/dicts into: - - ValueUnit - - MetricThrottleVu(xcp_0=[...]) - - None for N/A/empty + """Convert various throttle data formats to ValueUnit or MetricThrottleVu. 
+ + Converts integers/floats/strings/lists/2D-lists/dicts into appropriate types: + - ValueUnit for scalar values + - MetricThrottleVu(xcp_0=[...]) for lists/arrays + - None for N/A or empty values + + Args: + v (object): Input throttle value in various formats + unit (str, optional): Unit of measurement. Defaults to empty string. + + Returns: + Optional[Union[MetricThrottleVu, ValueUnit]]: Coerced throttle value or None """ if v in (None, "", "N/A"): return None From 06aa5ebd50c600c3f60a4a02a0a621c04bf292c9 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 13 Nov 2025 09:45:19 -0600 Subject: [PATCH 4/9] fix + enhanced utest + deprecation warning fix --- .../plugins/inband/amdsmi/amdsmidata.py | 2 +- test/unit/plugin/test_amdsmi_collector.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index edab6044..7fda0378 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -64,7 +64,7 @@ class AmdSmiBaseModel(BaseModel): def __init__(self, **data): # Convert Union[int, str, float] -> ValueUnit - for field_name, field_type in self.model_fields.items(): + for field_name, field_type in self.__class__.model_fields.items(): annotation = field_type.annotation target_type, container = find_annotation_in_container(annotation, ValueUnit) if target_type is None: diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 2a34551b..66ab2b6f 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -151,6 +151,35 @@ def amdsmi_get_clk_freq(h, clk_type): m.amdsmi_get_clk_freq = amdsmi_get_clk_freq + m.amdsmi_get_gpu_bad_page_info = lambda h: {"page_list": []} + + m.amdsmi_get_gpu_metrics_info = lambda h: { + "temperature_hotspot": 55, + "temperature_mem": 50, + "average_socket_power": 150, + "current_gfxclk": 1500, + "current_uclk": 1000, + } + + m.amdsmi_get_gpu_od_volt_info = lambda h: { + "curve": {"vc_points": [{"frequency": 1500, "voltage": 850}]} + } + + m.amdsmi_get_gpu_vram_usage = lambda h: { + "vram_used": 1024 * 1024 * 1024, + "vram_total": 64 * 1024 * 1024 * 1024, + } + + m.amdsmi_get_gpu_memory_total = lambda h, mem_type: 64 * 1024 * 1024 * 1024 + + m.amdsmi_get_gpu_total_ecc_count = lambda h: {"correctable_count": 0, "uncorrectable_count": 0} + + m.amdsmi_get_violation_status = lambda h: { + "acc_counter": 0, + "acc_prochot_thrm": 0, + "acc_ppt_pwr": 0, + } + m.amdsmi_get_fw_info = lambda h: { "fw_list": [ {"fw_name": "SMU", "fw_version": "55.33"}, From f7259a854a3b6af88e272989d2731c1ac404e8c6 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 13 Nov 2025 12:16:33 -0600 Subject: [PATCH 5/9] removed deprecated calls --- .../plugins/inband/amdsmi/amdsmi_collector.py | 10 ++-------- .../plugins/inband/amdsmi/amdsmidata.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index bcdcfae3..244b994f 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -1010,13 +1010,13 @@ def _as_list_str(v: Any) -> list[str]: return out def _get_limit_info(self, handle: Any) -> Optional[StaticLimit]: - """Get power cap and temperature limit information. + """Get power cap limit information. 
Args: handle (Any): GPU device handle Returns: - Optional[StaticLimit]: StaticLimit instance or None + Optional[StaticLimit]: StaticLimit instance with power cap data or None """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_power_cap_info", None) @@ -1031,12 +1031,6 @@ def _get_limit_info(self, handle: Any) -> Optional[StaticLimit]: max_power=self._valueunit(data.get("power_cap"), "W"), min_power=self._valueunit(data.get("min_power_cap"), "W"), socket_power=self._valueunit(data.get("default_power_cap"), "W"), - slowdown_edge_temperature=self._valueunit(data.get("slowdown_temp"), "C"), - slowdown_hotspot_temperature=self._valueunit(data.get("slowdown_mem_temp"), "C"), - slowdown_vram_temperature=self._valueunit(data.get("slowdown_vram_temp"), "C"), - shutdown_edge_temperature=self._valueunit(data.get("shutdown_temp"), "C"), - shutdown_hotspot_temperature=self._valueunit(data.get("shutdown_mem_temp"), "C"), - shutdown_vram_temperature=self._valueunit(data.get("shutdown_vram_temp"), "C"), ) def _get_clock(self, handle: Any) -> Optional[StaticClockData]: diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 7fda0378..b3aca761 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -277,15 +277,15 @@ class StaticVbios(BaseModel): class StaticLimit(AmdSmiBaseModel): - max_power: Optional[ValueUnit] - min_power: Optional[ValueUnit] - socket_power: Optional[ValueUnit] - slowdown_edge_temperature: Optional[ValueUnit] - slowdown_hotspot_temperature: Optional[ValueUnit] - slowdown_vram_temperature: Optional[ValueUnit] - shutdown_edge_temperature: Optional[ValueUnit] - shutdown_hotspot_temperature: Optional[ValueUnit] - shutdown_vram_temperature: Optional[ValueUnit] + max_power: Optional[ValueUnit] = None + min_power: Optional[ValueUnit] = None + socket_power: Optional[ValueUnit] = None + slowdown_edge_temperature: Optional[ValueUnit] = None + slowdown_hotspot_temperature: Optional[ValueUnit] = None + slowdown_vram_temperature: Optional[ValueUnit] = None + shutdown_edge_temperature: Optional[ValueUnit] = None + shutdown_hotspot_temperature: Optional[ValueUnit] = None + shutdown_vram_temperature: Optional[ValueUnit] = None na_validator = field_validator( "max_power", "min_power", From a0b825ef7cf1e5bd27d59c75ed3cdc34de1e2db4 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 17 Nov 2025 12:37:22 -0600 Subject: [PATCH 6/9] updates on missing calls --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 237 +++++++++++++++++- 1 file changed, 236 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 5a836740..5e16f82e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -30,7 +30,15 @@ from nodescraper.interfaces import DataAnalyzer from nodescraper.models import TaskResult -from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes +from .amdsmidata import ( + AmdSmiDataModel, + AmdSmiMetric, + AmdSmiStatic, + EccData, + Fw, + Partition, + Processes, +) from .analyzer_args import AmdSmiAnalyzerArgs @@ -122,6 +130,223 @@ def check_expected_driver_version( }, ) + def check_amdsmi_metric_pcie( + self, + amdsmi_metric_data: list[AmdSmiMetric], + l0_to_recovery_count_error_threshold: int, + l0_to_recovery_count_warning_threshold: int, + ): + """Check PCIe metrics for link 
errors + + Checks for PCIe link width, speed, replays, recoveries, and NAKs. + Expected width/speeds should come from SKU info. + + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + l0_to_recovery_count_error_threshold (int): Threshold for error events + l0_to_recovery_count_warning_threshold (int): Threshold for warning events + """ + for metric in amdsmi_metric_data: + pcie_data = metric.pcie + gpu = metric.gpu + + if pcie_data.width is not None and pcie_data.width != 16: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} PCIe width is not x16", + priority=EventPriority.ERROR, + data={"gpu": gpu, "pcie_width": pcie_data.width, "expected": 16}, + console_log=True, + ) + + if pcie_data.speed is not None and pcie_data.speed.value is not None: + try: + speed_val = float(pcie_data.speed.value) + if speed_val != 32.0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)", + priority=EventPriority.ERROR, + data={"gpu": gpu, "pcie_speed": speed_val, "expected": 32.0}, + console_log=True, + ) + except (ValueError, TypeError): + pass + + if pcie_data.replay_count is not None and pcie_data.replay_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}", + priority=EventPriority.WARNING, + data={"gpu": gpu, "replay_count": pcie_data.replay_count}, + console_log=True, + ) + + if ( + pcie_data.replay_roll_over_count is not None + and pcie_data.replay_roll_over_count > 0 + ): + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}", + priority=EventPriority.WARNING, + data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count}, + console_log=True, + ) + + if pcie_data.l0_to_recovery_count is not None: + if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries", + priority=EventPriority.ERROR, + data={ + "gpu": gpu, + "l0_to_recovery_count": pcie_data.l0_to_recovery_count, + "error_threshold": l0_to_recovery_count_error_threshold, + }, + console_log=True, + ) + elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "l0_to_recovery_count": pcie_data.l0_to_recovery_count, + "warning_threshold": l0_to_recovery_count_warning_threshold, + }, + console_log=True, + ) + + if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs", + priority=EventPriority.WARNING, + data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count}, + console_log=True, + ) + + if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs", + priority=EventPriority.WARNING, + data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count}, + console_log=True, + ) + + def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]): + """Check ECC totals for all GPUs + + Raises errors for 
uncorrectable errors, warnings for correctable and deferred. + + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + """ + for metric in amdsmi_metric_data: + ecc_totals = metric.ecc + gpu = metric.gpu + + ecc_checks: list[tuple[EventPriority, Optional[int], str]] = [ + ( + EventPriority.WARNING, + ecc_totals.total_correctable_count, + "Total correctable ECC errors", + ), + ( + EventPriority.ERROR, + ecc_totals.total_uncorrectable_count, + "Total uncorrectable ECC errors", + ), + ( + EventPriority.WARNING, + ecc_totals.total_deferred_count, + "Total deferred ECC errors", + ), + ( + EventPriority.WARNING, + ecc_totals.cache_correctable_count, + "Cache correctable ECC errors", + ), + ( + EventPriority.ERROR, + ecc_totals.cache_uncorrectable_count, + "Cache uncorrectable ECC errors", + ), + ] + + for priority, count, desc in ecc_checks: + if count is not None and count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has {desc}: {count}", + priority=priority, + data={"gpu": gpu, "error_count": count, "error_type": desc}, + console_log=True, + ) + + def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]): + """Check ECC counts in all blocks for all GPUs + + Raises errors for uncorrectable errors, warnings for correctable and deferred. + + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + """ + for metric in amdsmi_metric_data: + gpu = metric.gpu + ecc_blocks = metric.ecc_blocks + + # Skip if ecc_blocks is a string (e.g., "N/A") or empty + if isinstance(ecc_blocks, str) or not ecc_blocks: + continue + + for block_name, ecc_data in ecc_blocks.items(): + if not isinstance(ecc_data, EccData): + continue + + if ecc_data.correctable_count is not None and ecc_data.correctable_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has correctable ECC errors in block {block_name}", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "block": block_name, + "correctable_count": ecc_data.correctable_count, + }, + console_log=True, + ) + + if ecc_data.uncorrectable_count is not None and ecc_data.uncorrectable_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}", + priority=EventPriority.ERROR, + data={ + "gpu": gpu, + "block": block_name, + "uncorrectable_count": ecc_data.uncorrectable_count, + }, + console_log=True, + ) + + if ecc_data.deferred_count is not None and ecc_data.deferred_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has deferred ECC errors in block {block_name}", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "block": block_name, + "deferred_count": ecc_data.deferred_count, + }, + console_log=True, + ) + def expected_gpu_processes( self, processes_data: Optional[list[Processes]], max_num_processes: int ): @@ -427,6 +652,16 @@ def analyze_data( if args is None: args = AmdSmiAnalyzerArgs() + if data.metric is not None and len(data.metric) > 0: + if args.l0_to_recovery_count_error_threshold is not None: + self.check_amdsmi_metric_pcie( + data.metric, + args.l0_to_recovery_count_error_threshold, + args.l0_to_recovery_count_warning_threshold or 1, + ) + self.check_amdsmi_metric_ecc_totals(data.metric) + self.check_amdsmi_metric_ecc(data.metric) + if args.expected_gpu_processes: self.expected_gpu_processes(data.process, args.expected_gpu_processes) From 80b283b52fff7aeb909a8cf430069dc985df9bce Mon Sep 17 00:00:00 2001 From: 
Alexandra Bara Date: Mon, 24 Nov 2025 14:27:08 -0600 Subject: [PATCH 7/9] fix for multi json return --- .../plugins/inband/amdsmi/amdsmi_collector.py | 53 +++++++++++++++---- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 15894a4a..82b02071 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -183,19 +183,50 @@ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: cmd_ret = self._run_amd_smi(cmd) if cmd_ret: try: + # Try to parse as single JSON first return json.loads(cmd_ret) except json.JSONDecodeError as e: - self._log_event( - category=EventCategory.APPLICATION, - description=f"Error parsing command: `{cmd}` json data", - data={ - "cmd": cmd, - "exception": get_exception_traceback(e), - }, - priority=EventPriority.ERROR, - console_log=True, - ) - return None + # try to extract and parse multiple JSON objects + try: + json_objects = [] + decoder = json.JSONDecoder() + idx = 0 + cmd_ret_stripped = cmd_ret.strip() + + while idx < len(cmd_ret_stripped): + while idx < len(cmd_ret_stripped) and cmd_ret_stripped[idx].isspace(): + idx += 1 + + if idx >= len(cmd_ret_stripped): + break + + if cmd_ret_stripped[idx] not in ["{", "["]: + break + + try: + obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx) + json_objects.append(obj) + idx += end_idx + except json.JSONDecodeError: + break + + if json_objects: + return json_objects if len(json_objects) > 1 else json_objects[0] + else: + raise + + except Exception: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Error parsing command: `{cmd}` json data", + data={ + "cmd": cmd, + "exception": get_exception_traceback(e), + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None return None def _to_number(self, v: object) -> Optional[Union[int, float]]: From aa838b582baf4ef7c500745e8e5cfa66bda54361 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 24 Nov 2025 14:41:52 -0600 Subject: [PATCH 8/9] fix for partition --- .../plugins/inband/amdsmi/amdsmi_collector.py | 10 ++- test/unit/plugin/test_amdsmi_collector.py | 81 +++++++++++++++++-- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 82b02071..c8a0eb60 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -206,7 +206,7 @@ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: try: obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx) json_objects.append(obj) - idx += end_idx + idx = end_idx except json.JSONDecodeError: break @@ -529,7 +529,15 @@ def get_partition(self) -> Optional[Partition]: memparts: list[PartitionMemory] = [] computeparts: list[PartitionCompute] = [] + # Flatten multi-JSON results (partition command returns multiple JSON arrays) + flattened_data = [] for item in partition_data: + if isinstance(item, list): + flattened_data.extend(item) + elif isinstance(item, dict): + flattened_data.append(item) + + for item in flattened_data: if not isinstance(item, dict): continue diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 6783c407..4eeb76d3 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ 
b/test/unit/plugin/test_amdsmi_collector.py @@ -87,11 +87,21 @@ def mock_run_sut_cmd(cmd: str) -> MagicMock: ) if "partition --json" in cmd: - return make_cmd_result( + json_output = ( make_json_response( [{"gpu": 0, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] ) + + "\n" + + make_json_response( + [{"gpu": 1, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] + ) + + "\n" + + make_json_response( + [{"gpu_id": "N/A", "profile_index": "N/A", "partition_id": "0"}] + ) + + "\n\nLegend:\n * = Current mode" ) + return make_cmd_result(json_output) if "firmware --json" in cmd: return make_cmd_result( @@ -241,9 +251,8 @@ def test_collect_data(collector): assert data.process is not None and len(data.process) == 1 assert len(data.process[0].process_list) == 2 - # partition assert data.partition is not None - assert len(data.partition.memory_partition) == 1 + assert len(data.partition.memory_partition) >= 1 assert data.partition.memory_partition[0].partition_type == "NPS1" # firmware @@ -286,12 +295,12 @@ def test_get_process(collector): def test_get_partition(collector): - """Test partition parsing""" + """Test partition parsing with multi-JSON output""" p = collector.get_partition() assert p is not None - assert len(p.memory_partition) == 1 and len(p.compute_partition) == 1 + # The mock now returns realistic multi-JSON output + assert len(p.memory_partition) >= 1 assert p.memory_partition[0].partition_type == "NPS1" - assert p.compute_partition[0].partition_type == "CPX_DISABLED" def test_get_firmware(collector): @@ -369,7 +378,7 @@ def mock_bad_json(cmd: str) -> MagicMock: result, data = c.collect_data() assert data is not None assert data.version is None - assert len(result.events) > 0 # Should have error events + assert len(result.events) > 0 def test_command_error(conn_mock, system_info, monkeypatch): @@ -392,3 +401,61 @@ def mock_cmd_error(cmd: str) -> MagicMock: assert data.version is None assert data.gpu_list == [] assert len(result.events) > 0 # Should have error events + + +def test_multi_json_parsing(conn_mock, system_info, monkeypatch): + """Test parsing of multiple JSON objects with trailing text""" + + def mock_multi_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "test --json" in cmd: + multi_json = ( + '[{"data": 1}]\n' + '[{"data": 2}]\n' + '[{"data": 3}]\n' + "\n\nLegend:\n * = Current mode\n" + ) + return make_cmd_result(multi_json) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_multi_json) + + result = c._run_amd_smi_dict("test") + + assert result is not None + assert isinstance(result, list) + assert len(result) == 3 + assert result[0] == [{"data": 1}] + assert result[1] == [{"data": 2}] + assert result[2] == [{"data": 3}] + + +def test_single_json_parsing(conn_mock, system_info, monkeypatch): + """Test that single JSON parsing still works""" + + def mock_single_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "version --json" in cmd: + return make_cmd_result(make_json_response([{"tool": "amdsmi", "version": "1.0"}])) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_single_json) + + result = 
c._run_amd_smi_dict("version") + + assert result is not None + assert isinstance(result, list) + assert len(result) == 1 + assert result[0]["tool"] == "amdsmi" From d4d85193e304ba6d4c30dcd248a2459045aaaed7 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 4 Dec 2025 09:20:38 -0600 Subject: [PATCH 9/9] cleanup --- test/unit/plugin/test_amdsmi_collector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 4eeb76d3..c8ba08c6 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -298,7 +298,6 @@ def test_get_partition(collector): """Test partition parsing with multi-JSON output""" p = collector.get_partition() assert p is not None - # The mock now returns realistic multi-JSON output assert len(p.memory_partition) >= 1 assert p.memory_partition[0].partition_type == "NPS1" @@ -400,7 +399,7 @@ def mock_cmd_error(cmd: str) -> MagicMock: assert data is not None assert data.version is None assert data.gpu_list == [] - assert len(result.events) > 0 # Should have error events + assert len(result.events) > 0 def test_multi_json_parsing(conn_mock, system_info, monkeypatch):