From 3fecf569f5a39d6cda42be5627f5fea9900b7fc5 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 24 Sep 2025 13:47:34 -0500 Subject: [PATCH 1/9] bad pages --- .../plugins/inband/amdsmi/amdsmi_collector.py | 123 +++++++++++++++++- .../plugins/inband/amdsmi/amdsmidata.py | 23 ++++ 2 files changed, 141 insertions(+), 5 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 54c74d7f..49709b7e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -36,8 +36,10 @@ AmdSmiListItem, AmdSmiStatic, AmdSmiVersion, + BadPages, Fw, FwListItem, + PageData, Partition, PartitionCompute, PartitionMemory, @@ -163,6 +165,7 @@ def _get_handles(self): def _get_amdsmi_data(self) -> AmdSmiDataModel | None: try: version = self._get_amdsmi_version() + bad_pages = self.get_bad_pages() processes = self.get_process() partition = self.get_partition() firmware = self.get_firmware() @@ -182,6 +185,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: try: return AmdSmiDataModel( version=version, + bad_pages=bad_pages, gpu_list=gpu_list, process=processes, partition=partition, @@ -464,13 +468,21 @@ def _smi_try(self, fn, *a, default=None, **kw): try: return fn(*a, **kw) except amdsmi.AmdSmiException as e: # type: ignore[attr-defined] - self.logger.warning(e) + fn_name = getattr(fn, "__name__", str(fn)) + self.logger.warning( + "%s(%s) raised AmdSmiException: %s", + fn_name, + ", ".join(repr(x) for x in a), + e, + ) + code = getattr(e, "ret_code", None) if code is None: try: code = int(e.args[0]) if getattr(e, "args", None) else None except Exception: code = None + CODE2NAME = { 1: "AMDSMI_STATUS_SUCCESS", 2: "AMDSMI_STATUS_NOT_SUPPORTED", @@ -482,25 +494,40 @@ def _smi_try(self, fn, *a, default=None, **kw): } name = CODE2NAME.get(code, "unknown") + common_data = { + "function": fn_name, + "args": [repr(x) for x in a], + "status_name": name, + "status_code": code, + "exception": get_exception_traceback(e), + } + if name in ("AMDSMI_STATUS_NOT_SUPPORTED", "AMDSMI_STATUS_NOT_FOUND"): self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} not supported on this device/mode (status={name}, code={code})", + description=f"{fn_name} not supported on this device/mode (status={name}, code={code})", + data=common_data, priority=EventPriority.WARNING, ) return default + if name == "AMDSMI_STATUS_PERMISSION": self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} permission denied (need access to /dev/kfd & render nodes, or root for RAS). status={name}, code={code})", + description=( + f"{fn_name} permission denied " + f"(need access to /dev/kfd & render nodes, or root for RAS). " + f"status={name}, code={code}" + ), + data=common_data, priority=EventPriority.WARNING, ) return default self._log_event( category=EventCategory.APPLICATION, - description=f"{fn.__name__} failed (status={name}, code={code})", - data={"exception": get_exception_traceback(e)}, + description=f"{fn_name} failed (status={name}, code={code})", + data=common_data, priority=EventPriority.WARNING, ) return default @@ -906,6 +933,92 @@ def _fmt(n: int | None) -> str | None: except ValidationError: return None + def get_bad_pages(self) -> list[BadPages] | None: + """ + Collect bad page info per GPU and map to BadPages/PageData models. + + Returns: + List[BadPages] (one item per GPU) or None if no devices. 
+ """ + amdsmi = self._amdsmi_mod() + devices = self._get_handles() + if not devices: + return None + + out: list[BadPages] = [] + + for idx, h in enumerate(devices): + raw = self._smi_try(amdsmi.amdsmi_get_gpu_bad_page_info, h, default=[]) or [] + pages: list[PageData] = [] + + if isinstance(raw, list): + for entry in raw: + if not isinstance(entry, dict): + continue + + pa = entry.get("page_address") + ps = entry.get("page_size") + st = entry.get("status") + val = entry.get("value") + + page_address: int | str + if isinstance(pa, (int, str)): + page_address = pa + else: + page_address = str(pa) + + page_size: int | str + if isinstance(ps, (int, str)): + page_size = ps + else: + page_size = str(ps) + + status = "" if st in (None, "N/A") else str(st) + + value_i: int | None = None + if isinstance(val, int): + value_i = val + elif isinstance(val, str): + s = val.strip() + try: + value_i = int(s, 0) + except Exception: + value_i = None + + try: + pages.append( + PageData( + page_address=page_address, + page_size=page_size, + status=status, + value=value_i, + ) + ) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build PageData; skipping entry", + data={ + "exception": get_exception_traceback(e), + "gpu_index": idx, + "entry": repr(entry), + }, + priority=EventPriority.WARNING, + ) + continue + + try: + out.append(BadPages(gpu=idx, retired=pages)) + except ValidationError as e: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build BadPages", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) + + return out + def collect_data( self, args=None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index ea4b6bcb..6f2d5600 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -413,6 +413,19 @@ class AmdSmiStatic(BaseModel): ) +# PAGES +class PageData(BaseModel): + page_address: int | str + page_size: int | str + status: str + value: int | None + + +class BadPages(BaseModel): + gpu: int + retired: list[PageData] + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. 
@@ -434,6 +447,7 @@ class AmdSmiDataModel(DataModel): partition: Partition | None = None process: list[Processes] | None = Field(default_factory=list) firmware: list[Fw] | None = Field(default_factory=list) + bad_pages: list[BadPages] | None = Field(default_factory=list) static: list[AmdSmiStatic] | None = Field(default_factory=list) def get_list(self, gpu: int) -> AmdSmiListItem | None: @@ -471,3 +485,12 @@ def get_static(self, gpu: int) -> AmdSmiStatic | None: if item.gpu == gpu: return item return None + + def get_bad_pages(self, gpu: int) -> BadPages | None: + """Get the bad pages data for the given gpu id.""" + if self.bad_pages is None: + return None + for item in self.bad_pages: + if item.gpu == gpu: + return item + return None From 03b0528b96060bc1319ff190d4ab172f3d38d099 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 26 Sep 2025 10:43:45 -0500 Subject: [PATCH 2/9] metric updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 575 +++++++++++++++++- .../plugins/inband/amdsmi/amdsmidata.py | 310 ++++++++++ 2 files changed, 871 insertions(+), 14 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 49709b7e..533904b6 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -34,11 +34,24 @@ from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiListItem, + AmdSmiMetric, AmdSmiStatic, AmdSmiVersion, BadPages, Fw, FwListItem, + MetricClockData, + MetricEccTotals, + MetricEnergy, + MetricFan, + MetricMemUsage, + MetricPcie, + MetricPower, + MetricTemperature, + MetricThrottle, + MetricThrottleVu, + MetricUsage, + MetricVoltageCurve, PageData, Partition, PartitionCompute, @@ -171,7 +184,9 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: firmware = self.get_firmware() gpu_list = self.get_gpu_list() statics = self.get_static() + metric = self.get_metric() except Exception as e: + self.logger.error(e) self._log_event( category=EventCategory.APPLICATION, description="Error running amd-smi sub commands", @@ -191,6 +206,7 @@ def _get_amdsmi_data(self) -> AmdSmiDataModel | None: partition=partition, firmware=firmware, static=statics, + metric=metric ) except ValidationError as e: self.logger.warning("Validation err: %s", e) @@ -277,35 +293,54 @@ def get_process(self) -> list[Processes] | None: plist.append(ProcessListItem(process_info=str(entry))) continue - name = entry.get("name", "N/A") + raw_name = entry.get("name", None) + name = ( + None + if (raw_name is None or str(raw_name).strip().upper() == "N/A") + else str(raw_name) + ) + pid_val = entry.get("pid", 0) try: pid = int(pid_val) if pid_val not in (None, "") else 0 except Exception: pid = 0 + # memory_usage block + mu = entry.get("memory_usage") or {} + gtt_mem_vu = self._vu(mu.get("gtt_mem"), "B") + cpu_mem_vu = self._vu(mu.get("cpu_mem"), "B") + vram_mem_vu = self._vu(mu.get("vram_mem"), "B") + + # mem mem_vu = self._vu(entry.get("mem"), "B") + if mem_vu is None and vram_mem_vu is not None: + mem_vu = vram_mem_vu + + if (not mu) and mem_vu is not None and vram_mem_vu is None: + vram_mem_vu = mem_vu - mu = entry.get("memory_usage") or {} mem_usage = ProcessMemoryUsage( - gtt_mem=self._vu(mu.get("gtt_mem"), "B"), - cpu_mem=self._vu(mu.get("cpu_mem"), "B"), - vram_mem=self._vu(mu.get("vram_mem"), "B"), + gtt_mem=gtt_mem_vu, + cpu_mem=cpu_mem_vu, + vram_mem=vram_mem_vu, ) + # engine_usage eu = entry.get("engine_usage") or {} - usage 
= ProcessUsage( - gfx=self._vu(eu.get("gfx"), "ns"), - enc=self._vu(eu.get("enc"), "ns"), - ) + gfx_vu = self._vu(eu.get("gfx"), "ns") or self._vu(0, "ns") + enc_vu = self._vu(eu.get("enc"), "ns") or self._vu(0, "ns") + usage = ProcessUsage(gfx=gfx_vu, enc=enc_vu) - cu_occ = self._vu(entry.get("cu_occupancy"), "") + # CU occupancy, default 0 + cu_raw = entry.get("cu_occupancy", None) + cu_occ = self._vu(cu_raw, "") or self._vu(0, "") try: plist.append( ProcessListItem( process_info=ProcessInfo( - name=str(name), + name=name if name is not None else "N/A", pid=pid, mem=mem_vu, memory_usage=mem_usage, @@ -633,7 +668,7 @@ def get_static(self) -> list[AmdSmiStatic] | None: version=str(vb.get("vbios_version", "")), ) - # NUMA (via KFD) + # NUMA if isinstance(kfd, dict): try: numa_node = int(kfd.get("node_id", 0) or 0) @@ -714,7 +749,7 @@ def _get_soc_pstate(self, h) -> StaticSocPstate | None: self._log_event( category=EventCategory.APPLICATION, description="amdsmi_get_soc_pstate not exposed by amdsmi build", - priority=EventPriority.INFO, + priority=EventPriority.WARNING, ) return None @@ -768,7 +803,7 @@ def _get_xgmi_plpd(self, h) -> StaticXgmiPlpd | None: self._log_event( category=EventCategory.APPLICATION, description="XGMI PLPD not exposed by this amdsmi build", - priority=EventPriority.INFO, + priority=EventPriority.WARNING, ) return None @@ -1019,6 +1054,518 @@ def get_bad_pages(self) -> list[BadPages] | None: return out + def get_metric(self) -> list[AmdSmiMetric] | None: + amdsmi = self._amdsmi_mod() + devices = self._get_handles() + out: list[AmdSmiMetric] = [] + + def _to_int_or_none(v: object) -> int | None: + n = self._to_number(v) + if n is None: + return None + try: + return int(n) + except Exception: + try: + return int(float(n)) + except Exception: + return None + + def _as_list(v: object) -> list[object]: + if isinstance(v, list): + return v + return ( + [] if v in (None, "N/A") else [v] if not isinstance(v, (dict, tuple, set)) else [] + ) + + for idx, h in enumerate(devices): + raw = self._smi_try(amdsmi.amdsmi_get_gpu_metrics_info, h, default=None) + + if not isinstance(raw, dict): + self._log_event( + category=EventCategory.APPLICATION, + description="amdsmi_get_gpu_metrics_info returned no dict; using empty metric", + data={"gpu_index": idx, "type": type(raw).__name__}, + priority=EventPriority.WARNING, + ) + out.append(self._empty_metric(idx)) + continue + + try: + # Usage + usage = MetricUsage( + gfx_activity=self._vu(raw.get("average_gfx_activity"), "%"), + umc_activity=self._vu(raw.get("average_umc_activity"), "%"), + mm_activity=self._vu(raw.get("average_mm_activity"), "%"), + vcn_activity=[self._vu(v, "%") for v in _as_list(raw.get("vcn_activity"))], + jpeg_activity=[self._vu(v, "%") for v in _as_list(raw.get("jpeg_activity"))], + gfx_busy_inst=None, + jpeg_busy=None, + vcn_busy=None, + ) + + # Power / Energy + power = MetricPower( + socket_power=self._vu(raw.get("average_socket_power"), "W"), + gfx_voltage=self._vu(raw.get("voltage_gfx"), "mV"), + soc_voltage=self._vu(raw.get("voltage_soc"), "mV"), + mem_voltage=self._vu(raw.get("voltage_mem"), "mV"), + throttle_status=( + str(raw.get("throttle_status")) + if raw.get("throttle_status") is not None + else None + ), + power_management=self._nz(raw.get("indep_throttle_status"), default="unknown"), + ) + energy = MetricEnergy( + total_energy_consumption=self._vu(raw.get("energy_accumulator"), "uJ") + ) + + # Temperature + temperature = MetricTemperature( + edge=self._vu(raw.get("temperature_edge"), "C"), + 
hotspot=self._vu(raw.get("temperature_hotspot"), "C"), + mem=self._vu(raw.get("temperature_mem"), "C"), + ) + + # PCIe + speed_raw = self._to_number(raw.get("pcie_link_speed")) + speed_gtps = ( + float(speed_raw) / 10.0 if isinstance(speed_raw, (int, float)) else None + ) + + pcie = MetricPcie( + width=_to_int_or_none(raw.get("pcie_link_width")), + speed=self._vu(speed_gtps, "GT/s"), + bandwidth=self._vu(raw.get("pcie_bandwidth_inst"), "GB/s"), + replay_count=_to_int_or_none(raw.get("pcie_replay_count_acc")), + l0_to_recovery_count=_to_int_or_none(raw.get("pcie_l0_to_recov_count_acc")), + replay_roll_over_count=_to_int_or_none(raw.get("pcie_replay_rover_count_acc")), + nak_sent_count=_to_int_or_none(raw.get("pcie_nak_sent_count_acc")), + nak_received_count=_to_int_or_none(raw.get("pcie_nak_rcvd_count_acc")), + current_bandwidth_sent=None, + current_bandwidth_received=None, + max_packet_size=None, + lc_perf_other_end_recovery=None, + ) + + # Clocks + def _clk(cur_key: str) -> MetricClockData: + return MetricClockData( + clk=self._vu(raw.get(cur_key), "MHz"), + min_clk=None, + max_clk=None, + clk_locked=( + raw.get("gfxclk_lock_status") if cur_key == "current_gfxclk" else None + ), + deep_sleep=None, + ) + + clock: dict[str, MetricClockData] = { + "GFX": _clk("current_gfxclk"), + "SOC": _clk("current_socclk"), + "UCLK": _clk("current_uclk"), + "VCLK0": _clk("current_vclk0"), + "DCLK0": _clk("current_dclk0"), + "VCLK1": _clk("current_vclk1"), + "DCLK1": _clk("current_dclk1"), + } + + # Fan + fan = MetricFan( + rpm=self._vu(raw.get("current_fan_speed"), "RPM"), + speed=None, + max=None, + usage=None, + ) + + # Voltage curve + voltage_curve = self._get_voltage_curve(h) or self._empty_voltage_curve() + + # Memory usage + total_vram_vu: ValueUnit | None = None + used_vram_vu: ValueUnit | None = None + free_vram_vu: ValueUnit | None = None + + vram_usage = self._smi_try(amdsmi.amdsmi_get_gpu_vram_usage, h, default=None) + if isinstance(vram_usage, dict): + used_vram_vu = self._vu(vram_usage.get("vram_used"), "B") + total_vram_vu = self._vu(vram_usage.get("vram_total"), "B") + + mem_enum = getattr(amdsmi, "AmdSmiMemoryType", None) + vis_total_vu: ValueUnit | None = None + gtt_total_vu: ValueUnit | None = None + + if mem_enum is not None: + if total_vram_vu is None: + vram_total_alt = self._smi_try( + amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.VRAM, default=None + ) + if vram_total_alt is not None: + total_vram_vu = self._vu(vram_total_alt, "B") + + vis_total = self._smi_try( + amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.VIS_VRAM, default=None + ) + if vis_total is not None: + vis_total_vu = self._vu(vis_total, "B") + + gtt_total = self._smi_try( + amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.GTT, default=None + ) + if gtt_total is not None: + gtt_total_vu = self._vu(gtt_total, "B") + + # Compute free if possible + if free_vram_vu is None and total_vram_vu is not None and used_vram_vu is not None: + try: + free_num = max(0.0, float(total_vram_vu.value) - float(used_vram_vu.value)) + free_vram_vu = self._vu(free_num, "B") + except Exception: + pass + + # Build mem_usage + mem_usage = MetricMemUsage( + total_vram=total_vram_vu, + used_vram=used_vram_vu, + free_vram=free_vram_vu, + total_visible_vram=vis_total_vu, + used_visible_vram=None, + free_visible_vram=None, + total_gtt=gtt_total_vu, + used_gtt=None, + free_gtt=None, + ) + + # ECC totals + ecc_raw = self._smi_try(amdsmi.amdsmi_get_gpu_total_ecc_count, h, default=None) + if isinstance(ecc_raw, dict): + ecc = MetricEccTotals( + 
total_correctable_count=_to_int_or_none(ecc_raw.get("correctable_count")), + total_uncorrectable_count=_to_int_or_none( + ecc_raw.get("uncorrectable_count") + ), + total_deferred_count=_to_int_or_none(ecc_raw.get("deferred_count")), + cache_correctable_count=None, + cache_uncorrectable_count=None, + ) + else: + ecc = MetricEccTotals( + total_correctable_count=None, + total_uncorrectable_count=None, + total_deferred_count=None, + cache_correctable_count=None, + cache_uncorrectable_count=None, + ) + + # Throttle + throttle = self.get_throttle(h) or MetricThrottle() + + out.append( + AmdSmiMetric( + gpu=idx, + usage=usage, + power=power, + clock=clock, + temperature=temperature, + pcie=pcie, + ecc=ecc, + ecc_blocks={}, + fan=fan, + voltage_curve=voltage_curve, + perf_level=None, + xgmi_err=None, + energy=energy, + mem_usage=mem_usage, + throttle=throttle, + ) + ) + except ValidationError as e: + self.logger.warning(e) + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build AmdSmiMetric; using empty metric", + data={"exception": get_exception_traceback(e), "gpu_index": idx}, + priority=EventPriority.WARNING, + ) + out.append(self._empty_metric(idx)) + + return out + + def _empty_metric(self, gpu_idx: int) -> AmdSmiMetric: + return AmdSmiMetric( + gpu=gpu_idx, + usage=MetricUsage( + gfx_activity=None, + umc_activity=None, + mm_activity=None, + vcn_activity=[], + jpeg_activity=[], + gfx_busy_inst=None, + jpeg_busy=None, + vcn_busy=None, + ), + power=MetricPower( + socket_power=None, + gfx_voltage=None, + soc_voltage=None, + mem_voltage=None, + throttle_status=None, + power_management=None, + ), + clock={}, + temperature=MetricTemperature(edge=None, hotspot=None, mem=None), + pcie=MetricPcie( + width=None, + speed=None, + bandwidth=None, + replay_count=None, + l0_to_recovery_count=None, + replay_roll_over_count=None, + nak_sent_count=None, + nak_received_count=None, + current_bandwidth_sent=None, + current_bandwidth_received=None, + max_packet_size=None, + lc_perf_other_end_recovery=None, + ), + ecc=MetricEccTotals( + total_correctable_count=None, + total_uncorrectable_count=None, + total_deferred_count=None, + cache_correctable_count=None, + cache_uncorrectable_count=None, + ), + ecc_blocks={}, + fan=MetricFan(speed=None, max=None, rpm=None, usage=None), + voltage_curve=self._empty_voltage_curve(), + perf_level=None, + xgmi_err=None, + energy=None, + mem_usage=MetricMemUsage( + total_vram=None, + used_vram=None, + free_vram=None, + total_visible_vram=None, + used_visible_vram=None, + free_visible_vram=None, + total_gtt=None, + used_gtt=None, + free_gtt=None, + ), + throttle=MetricThrottle(), + ) + + def _get_voltage_curve(self, h) -> MetricVoltageCurve: + amdsmi = self._amdsmi_mod() + raw = self._smi_try(amdsmi.amdsmi_get_gpu_od_volt_info, h, default=None) + if not isinstance(raw, dict): + return self._empty_voltage_curve() + + try: + num_regions = int(raw.get("num_regions", 0) or 0) + except Exception: + num_regions = 0 + if num_regions == 0: + return self._empty_voltage_curve() + + curve = raw.get("curve") or {} + pts = curve.get("vc_points") or raw.get("vc_points") or [] + if not isinstance(pts, list) or len(pts) == 0: + return self._empty_voltage_curve() + + def _pt_get(d: object, *names: str) -> object | None: + if not isinstance(d, dict): + return None + for n in names: + if n in d: + return d.get(n) + lower = {str(k).lower(): v for k, v in d.items()} + for n in names: + v = lower.get(n.lower()) + if v is not None: + return v + return None + + def 
_extract_point(p: object) -> tuple[object | None, object | None]: + clk = _pt_get(p, "clk_value", "frequency", "freq", "clk", "sclk") + volt = _pt_get(p, "volt_value", "voltage", "volt", "mV") + return clk, volt + + p0_clk, p0_volt = _extract_point(pts[0]) if len(pts) >= 1 else (None, None) + p1_clk, p1_volt = _extract_point(pts[1]) if len(pts) >= 2 else (None, None) + p2_clk, p2_volt = _extract_point(pts[2]) if len(pts) >= 3 else (None, None) + + return MetricVoltageCurve( + point_0_frequency=self._vu(p0_clk, "MHz"), + point_0_voltage=self._vu(p0_volt, "mV"), + point_1_frequency=self._vu(p1_clk, "MHz"), + point_1_voltage=self._vu(p1_volt, "mV"), + point_2_frequency=self._vu(p2_clk, "MHz"), + point_2_voltage=self._vu(p2_volt, "mV"), + ) + + def _empty_voltage_curve(self) -> MetricVoltageCurve: + return MetricVoltageCurve( + point_0_frequency=None, + point_0_voltage=None, + point_1_frequency=None, + point_1_voltage=None, + point_2_frequency=None, + point_2_voltage=None, + ) + + def _as_first_plane(self, obj) -> list: + """Take a scalar/list/2D-list and return the first plane as a flat list.""" + if isinstance(obj, list): + if obj and isinstance(obj[0], list): # 2D + return obj[0] + return obj + return [] + + def _th_vu_list_pct(self, obj) -> MetricThrottleVu | None: + """Return MetricThrottleVu with % ValueUnits for the first XCP plane.""" + arr = self._as_first_plane(obj) + if not arr: + return None + return MetricThrottleVu( + xcp_0=[self._vu(v, "%") if v not in (None, "N/A") else "N/A" for v in arr] + ) + + def _th_vu_list_raw(self, obj) -> MetricThrottleVu | None: + """Return MetricThrottleVu with raw ints/strings for the first XCP plane.""" + arr = self._as_first_plane(obj) + if not arr: + return None + return MetricThrottleVu( + xcp_0=[ + (int(v) if isinstance(v, (int, float, str)) and str(v).strip().isdigit() else v) + for v in arr + ] + ) + + def get_throttle(self, h) -> MetricThrottle: + amdsmi = self._amdsmi_mod() + raw = self._smi_try(amdsmi.amdsmi_get_violation_status, h, default=None) + if not isinstance(raw, dict): + return MetricThrottle() + + acc_counter = raw.get("acc_counter") + prochot_acc = raw.get("acc_prochot_thrm") + ppt_acc = raw.get("acc_ppt_pwr") + socket_thrm_acc = raw.get("acc_socket_thrm") + vr_thrm_acc = raw.get("acc_vr_thrm") + hbm_thrm_acc = raw.get("acc_hbm_thrm") + + acc_gfx_pwr = raw.get("acc_gfx_clk_below_host_limit_pwr") + acc_gfx_thm = raw.get("acc_gfx_clk_below_host_limit_thm") + acc_low_util = raw.get("acc_low_utilization") + acc_gfx_total = raw.get("acc_gfx_clk_below_host_limit_total") + + act_prochot = raw.get("active_prochot_thrm") + act_ppt = raw.get("active_ppt_pwr") + act_socket = raw.get("active_socket_thrm") + act_vr = raw.get("active_vr_thrm") + act_hbm = raw.get("active_hbm_thrm") + act_gfx_pwr = raw.get("active_gfx_clk_below_host_limit_pwr") + act_gfx_thm = raw.get("active_gfx_clk_below_host_limit_thm") + act_low_util = raw.get("active_low_utilization") + act_gfx_total = raw.get("active_gfx_clk_below_host_limit_total") + + per_prochot = raw.get("per_prochot_thrm") + per_ppt = raw.get("per_ppt_pwr") + per_socket = raw.get("per_socket_thrm") + per_vr = raw.get("per_vr_thrm") + per_hbm = raw.get("per_hbm_thrm") + per_gfx_pwr = raw.get("per_gfx_clk_below_host_limit_pwr") + per_gfx_thm = raw.get("per_gfx_clk_below_host_limit_thm") + per_low_util = raw.get("per_low_utilization") + per_gfx_total = raw.get("per_gfx_clk_below_host_limit_total") + + return MetricThrottle( + accumulation_counter=self._vu(acc_counter, ""), # unitless counter + 
prochot_accumulated=self._th_vu_list_raw(prochot_acc), + ppt_accumulated=self._th_vu_list_raw(ppt_acc), + socket_thermal_accumulated=self._th_vu_list_raw(socket_thrm_acc), + vr_thermal_accumulated=self._th_vu_list_raw(vr_thrm_acc), + hbm_thermal_accumulated=self._th_vu_list_raw(hbm_thrm_acc), + gfx_clk_below_host_limit_power_accumulated=self._th_vu_list_raw(acc_gfx_pwr), + gfx_clk_below_host_limit_thermal_accumulated=self._th_vu_list_raw(acc_gfx_thm), + low_utilization_accumulated=self._th_vu_list_raw(acc_low_util), + total_gfx_clk_below_host_limit_accumulated=self._th_vu_list_raw(acc_gfx_total), + prochot_violation_status=self._th_vu_list_raw(act_prochot), + ppt_violation_status=self._th_vu_list_raw(act_ppt), + socket_thermal_violation_status=self._th_vu_list_raw(act_socket), + vr_thermal_violation_status=self._th_vu_list_raw(act_vr), + hbm_thermal_violation_status=self._th_vu_list_raw(act_hbm), + gfx_clk_below_host_limit_power_violation_status=self._th_vu_list_raw(act_gfx_pwr), + gfx_clk_below_host_limit_thermal_violation_status=self._th_vu_list_raw(act_gfx_thm), + low_utilization_violation_status=self._th_vu_list_raw(act_low_util), + total_gfx_clk_below_host_limit_violation_status=self._th_vu_list_raw(act_gfx_total), + prochot_violation_activity=self._vu(per_prochot, "%"), + ppt_violation_activity=self._vu(per_ppt, "%"), + socket_thermal_violation_activity=self._vu(per_socket, "%"), + vr_thermal_violation_activity=self._vu(per_vr, "%"), + hbm_thermal_violation_activity=self._vu(per_hbm, "%"), + gfx_clk_below_host_limit_power_violation_activity=self._th_vu_list_pct(per_gfx_pwr), + gfx_clk_below_host_limit_thermal_violation_activity=self._th_vu_list_pct(per_gfx_thm), + low_utilization_violation_activity=self._th_vu_list_pct(per_low_util), + total_gfx_clk_below_host_limit_violation_activity=self._th_vu_list_pct(per_gfx_total), + ) + + def _flatten_2d(self, v: object) -> list[object]: + if isinstance(v, list) and v and isinstance(v[0], list): + out: list[object] = [] + for row in v: + if isinstance(row, list): + out.extend(row) + else: + out.append(row) + return out + return v if isinstance(v, list) else [v] if v not in (None, "N/A") else [] + + def _coerce_throttle_value( + self, v: object, unit: str = "" + ) -> MetricThrottleVu | ValueUnit | None: + """ + Convert ints/floats/strings/lists/2D-lists/dicts into: + - ValueUnit + - MetricThrottleVu(xcp_0=[...]) + - None for N/A/empty + """ + if v in (None, "", "N/A"): + return None + + if isinstance(v, (int, float)): + return ValueUnit(value=v, unit=unit) + if isinstance(v, str): + s = v.strip() + if not s or s.upper() == "N/A": + return None + try: + return ValueUnit(value=int(s, 0), unit=unit) + except Exception: + try: + return ValueUnit(value=float(s), unit=unit) + except Exception: + return MetricThrottleVu(xcp_0=[s]) + + if isinstance(v, list): + flat = self._flatten_2d(v) + return MetricThrottleVu(xcp_0=flat if flat else None) + + if isinstance(v, dict): + if "xcp_0" in v and isinstance(v["xcp_0"], list): + return MetricThrottleVu(xcp_0=self._flatten_2d(v["xcp_0"])) + val = v.get("value") + if isinstance(val, dict): + for maybe_list in val.values(): + if isinstance(maybe_list, list): + return MetricThrottleVu(xcp_0=self._flatten_2d(maybe_list)) + return MetricThrottleVu(xcp_0=[str(v)]) + + return MetricThrottleVu(xcp_0=[str(v)]) + def collect_data( self, args=None, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 6f2d5600..ba6a2eef 100644 --- 
a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -426,6 +426,315 @@ class BadPages(BaseModel): retired: list[PageData] +# Metric Data +class MetricUsage(BaseModel): + gfx_activity: ValueUnit | None + umc_activity: ValueUnit | None + mm_activity: ValueUnit | None + vcn_activity: list[ValueUnit | str | None] + jpeg_activity: list[ValueUnit | str | None] + gfx_busy_inst: dict[str, list[ValueUnit | str | None]] | None + jpeg_busy: dict[str, list[ValueUnit | str | None]] | None + vcn_busy: dict[str, list[ValueUnit | str | None]] | None + na_validator_list = field_validator("vcn_activity", "jpeg_activity", mode="before")( + na_to_none_list + ) + na_validator = field_validator( + "gfx_activity", + "umc_activity", + "mm_activity", + "gfx_busy_inst", + "jpeg_busy", + "vcn_busy", + mode="before", + )(na_to_none) + + +class MetricPower(BaseModel): + socket_power: ValueUnit | None + gfx_voltage: ValueUnit | None + soc_voltage: ValueUnit | None + mem_voltage: ValueUnit | None + throttle_status: str | None + power_management: str | None + na_validator = field_validator( + "socket_power", + "gfx_voltage", + "soc_voltage", + "mem_voltage", + "throttle_status", + "power_management", + mode="before", + )(na_to_none) + + +class MetricClockData(BaseModel): + clk: ValueUnit | None + min_clk: ValueUnit | None + max_clk: ValueUnit | None + clk_locked: int | str | dict | None + deep_sleep: int | str | dict | None + na_validator = field_validator( + "clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before" + )(na_to_none) + + +class MetricTemperature(BaseModel): + edge: ValueUnit | None + hotspot: ValueUnit | None + mem: ValueUnit | None + na_validator = field_validator("edge", "hotspot", "mem", mode="before")(na_to_none) + + +class MetricPcie(BaseModel): + width: int | None + speed: ValueUnit | None + bandwidth: ValueUnit | None + replay_count: int | None + l0_to_recovery_count: int | None + replay_roll_over_count: int | None + nak_sent_count: int | None + nak_received_count: int | None + current_bandwidth_sent: int | None + current_bandwidth_received: int | None + max_packet_size: int | None + lc_perf_other_end_recovery: int | None + na_validator = field_validator( + "width", + "speed", + "bandwidth", + "replay_count", + "l0_to_recovery_count", + "replay_roll_over_count", + "nak_sent_count", + "nak_received_count", + "current_bandwidth_sent", + "current_bandwidth_received", + "max_packet_size", + "lc_perf_other_end_recovery", + mode="before", + )(na_to_none) + + +class MetricEccTotals(BaseModel): + total_correctable_count: int | None + total_uncorrectable_count: int | None + total_deferred_count: int | None + cache_correctable_count: int | None + cache_uncorrectable_count: int | None + na_validator = field_validator( + "total_correctable_count", + "total_uncorrectable_count", + "total_deferred_count", + "cache_correctable_count", + "cache_uncorrectable_count", + mode="before", + )(na_to_none) + + +class MetricErrorCounts(BaseModel): + correctable_count: str | None + uncorrectable_count: str | None + deferred_count: str | None + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class MetricFan(BaseModel): + speed: ValueUnit | None + max: ValueUnit | None + rpm: ValueUnit | None + usage: ValueUnit | None + na_validator = field_validator("speed", "max", "rpm", "usage", mode="before")(na_to_none) + + +class MetricVoltageCurve(BaseModel): + point_0_frequency: 
ValueUnit | None + point_0_voltage: ValueUnit | None + point_1_frequency: ValueUnit | None + point_1_voltage: ValueUnit | None + point_2_frequency: ValueUnit | None + point_2_voltage: ValueUnit | None + + na_validator = field_validator( + "point_0_frequency", + "point_0_voltage", + "point_1_frequency", + "point_1_voltage", + "point_2_frequency", + "point_2_voltage", + mode="before", + )(na_to_none) + + +class MetricEnergy(BaseModel): + total_energy_consumption: ValueUnit | None + na_validator = field_validator("total_energy_consumption", mode="before")(na_to_none) + + +class MetricMemUsage(BaseModel): + total_vram: ValueUnit | None + used_vram: ValueUnit | None + free_vram: ValueUnit | None + total_visible_vram: ValueUnit | None + used_visible_vram: ValueUnit | None + free_visible_vram: ValueUnit | None + total_gtt: ValueUnit | None + used_gtt: ValueUnit | None + free_gtt: ValueUnit | None + na_validator = field_validator( + "total_vram", + "used_vram", + "free_vram", + "total_visible_vram", + "used_visible_vram", + "free_visible_vram", + "total_gtt", + "used_gtt", + "free_gtt", + mode="before", + )(na_to_none) + + +class MetricThrottleVu(BaseModel): + xcp_0: list[ValueUnit | str | None] = None + # Deprecated below + value: dict[str, list[int | str]] | None = Field(deprecated=True, default=None) + unit: str = Field(deprecated=True, default="") + + +class MetricThrottle(AmdSmiBaseModel): + # At some point in time these changed from being int -> ValueUnit + + accumulation_counter: MetricThrottleVu | ValueUnit | None = None + + gfx_clk_below_host_limit_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_power_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_accumulated: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_violation_accumulated: MetricThrottleVu | ValueUnit | None = ( + None + ) + gfx_clk_below_host_limit_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + gfx_clk_below_host_limit_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + + hbm_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + hbm_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_accumulated: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_activity: MetricThrottleVu | ValueUnit | None = None + low_utilization_violation_status: MetricThrottleVu | ValueUnit | None = None + ppt_accumulated: MetricThrottleVu | ValueUnit | None = None + ppt_violation_activity: MetricThrottleVu | ValueUnit | None = None + ppt_violation_status: MetricThrottleVu | ValueUnit | None = None + prochot_accumulated: MetricThrottleVu | ValueUnit | None = None + prochot_violation_activity: MetricThrottleVu | ValueUnit | None = None + prochot_violation_status: MetricThrottleVu | ValueUnit | None = None + socket_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + 
socket_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + socket_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + vr_thermal_accumulated: MetricThrottleVu | ValueUnit | None = None + vr_thermal_violation_activity: MetricThrottleVu | ValueUnit | None = None + vr_thermal_violation_status: MetricThrottleVu | ValueUnit | None = None + + total_gfx_clk_below_host_limit_accumulated: MetricThrottleVu | ValueUnit | None = None + low_utilization_accumulated: MetricThrottleVu | ValueUnit | None = None + total_gfx_clk_below_host_limit_violation_status: MetricThrottleVu | ValueUnit | None = None + total_gfx_clk_below_host_limit_violation_activity: MetricThrottleVu | ValueUnit | None = None + + na_validator = field_validator( + "accumulation_counter", + "gfx_clk_below_host_limit_accumulated", + "gfx_clk_below_host_limit_power_accumulated", + "gfx_clk_below_host_limit_power_violation_activity", + "gfx_clk_below_host_limit_power_violation_status", + "gfx_clk_below_host_limit_violation_activity", + "gfx_clk_below_host_limit_violation_accumulated", + "gfx_clk_below_host_limit_violation_status", + "gfx_clk_below_host_limit_thermal_violation_accumulated", + "gfx_clk_below_host_limit_thermal_violation_activity", + "gfx_clk_below_host_limit_thermal_violation_status", + "gfx_clk_below_host_limit_thermal_accumulated", + "hbm_thermal_accumulated", + "hbm_thermal_violation_activity", + "hbm_thermal_violation_status", + "low_utilization_violation_accumulated", + "low_utilization_violation_activity", + "low_utilization_violation_status", + "ppt_accumulated", + "ppt_violation_activity", + "ppt_violation_status", + "prochot_accumulated", + "prochot_violation_activity", + "prochot_violation_status", + "socket_thermal_accumulated", + "socket_thermal_violation_activity", + "socket_thermal_violation_status", + "vr_thermal_accumulated", + "vr_thermal_violation_activity", + "vr_thermal_violation_status", + "total_gfx_clk_below_host_limit_accumulated", + "low_utilization_accumulated", + "total_gfx_clk_below_host_limit_violation_status", + "total_gfx_clk_below_host_limit_violation_activity", + mode="before", + )(na_to_none) + + +class EccData(BaseModel): + "ECC counts collected per ecc block" + + correctable_count: int | None = 0 + uncorrectable_count: int | None = 0 + deferred_count: int | None = 0 + + na_validator = field_validator( + "correctable_count", "uncorrectable_count", "deferred_count", mode="before" + )(na_to_none) + + +class AmdSmiMetric(BaseModel): + gpu: int + usage: MetricUsage + power: MetricPower + clock: dict[str, MetricClockData] + temperature: MetricTemperature + pcie: MetricPcie + ecc: MetricEccTotals + ecc_blocks: dict[str, EccData] | str + fan: MetricFan + voltage_curve: MetricVoltageCurve | None + perf_level: str | dict | None + xgmi_err: str | dict | None + energy: MetricEnergy | None + mem_usage: MetricMemUsage + throttle: MetricThrottle + + na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) + + @field_validator("ecc_blocks", mode="before") + @classmethod + def validate_ecc_blocks(cls, value: dict[str, EccData] | str) -> dict[str, EccData]: + """Validate the ecc_blocks field.""" + if isinstance(value, str): + # If it's a string, we assume it's "N/A" and return an empty dict + return {} + return value + + @field_validator("energy", mode="before") + @classmethod + def validate_energy(cls, value: Any | None) -> MetricEnergy | None: + """Validate the energy field.""" + if value == "N/A" or value is None: + return None + return 
value + + class AmdSmiDataModel(DataModel): """Data model for amd-smi data. @@ -449,6 +758,7 @@ class AmdSmiDataModel(DataModel): firmware: list[Fw] | None = Field(default_factory=list) bad_pages: list[BadPages] | None = Field(default_factory=list) static: list[AmdSmiStatic] | None = Field(default_factory=list) + metric: list[AmdSmiMetric] | None = Field(default_factory=list) def get_list(self, gpu: int) -> AmdSmiListItem | None: """Get the gpu list item for the given gpu id.""" From a9a4ae31fe5892223b31fdf07de15f4736517672 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 12 Nov 2025 11:58:38 -0600 Subject: [PATCH 3/9] filled in more gaps + typehint + docstring --- .../plugins/inband/amdsmi/amdsmi_collector.py | 358 +++++++++++++++--- 1 file changed, 304 insertions(+), 54 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 291f1949..a29ff1e7 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -68,6 +68,7 @@ StaticClockData, StaticDriver, StaticFrequencyLevels, + StaticLimit, StaticNuma, StaticPolicy, StaticSocPstate, @@ -776,12 +777,23 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: except Exception: vram_size_b = None + # Calculate VRAM max bandwidth if possible + max_bandwidth = None + if vram_bits and kfd.get("memory_max_frequency"): + try: + mem_freq_mhz = float(kfd["memory_max_frequency"]) + # Bandwidth (GB/s) = (bit_width * frequency_MHz) / 8000 # TODO: confirm formula (a DDR data-rate multiplier may be needed) + bandwidth_gbs = (float(vram_bits) * mem_freq_mhz) / 8000.0 + max_bandwidth = self._valueunit(bandwidth_gbs, "GB/s") + except Exception: + pass + vram_model = StaticVram( type=vram_type, vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor), size=self._valueunit(vram_size_b, "B"), bit_width=self._valueunit(vram_bits, "bit"), - max_bandwidth=None, + max_bandwidth=max_bandwidth, ) soc_pstate_model = self._get_soc_pstate(h) @@ -796,7 +808,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: asic=asic_model, bus=bus, vbios=vbios_model, - limit=None, + limit=self._get_limit_info(h), driver=driver_model, board=board_model, soc_pstate=soc_pstate_model, @@ -805,7 +817,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: numa=numa_model, vram=vram_model, cache_info=cache_info_model, - partition=None, + partition=None, # TODO: partition info is not populated here yet + clock=clock_model, ) ) @@ -821,13 +833,13 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: return out def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: - """SOC pstate check + """Get SOC P-state (performance state) policy information for a GPU device. Args: handle (Any): GPU device handle Returns: - Optional[StaticSocPstate]: StaticSocPstate instance or None + Optional[StaticSocPstate]: SOC P-state policy data or None if unavailable """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_soc_pstate", None) @@ -883,13 +895,13 @@ def _get_soc_pstate(self, handle: Any) -> Optional[StaticSocPstate]: return None def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: - """Check XGMI plpd + """Get XGMI Per-Link Power Down (PLPD) policy for a GPU device.
Args: handle (Any): GPU device handle Returns: - Optional[StaticXgmiPlpd]: StaticXgmiPlpd instance or None + Optional[StaticXgmiPlpd]: XGMI PLPD policy data or None if unavailable """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_xgmi_plpd", None) @@ -945,13 +957,13 @@ def _get_xgmi_plpd(self, handle: Any) -> Optional[StaticXgmiPlpd]: return None def _get_cache_info(self, handle: Any) -> list[StaticCacheInfoItem]: - """Check cache info + """Get GPU cache hierarchy information (L1, L2, L3, etc.). Args: handle (Any): GPU device handle Returns: - list[StaticCacheInfoItem]: list of StaticCacheInfoItem instances + list[StaticCacheInfoItem]: List of cache info items for each cache level """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_gpu_cache_info, handle, default=None) @@ -1007,8 +1019,38 @@ def _as_list_str(v: Any) -> list[str]: return out + def _get_limit_info(self, handle: Any) -> Optional[StaticLimit]: + """Get power cap and temperature limit information. + + Args: + handle (Any): GPU device handle + + Returns: + Optional[StaticLimit]: StaticLimit instance or None + """ + amdsmi = self._amdsmi_mod() + fn = getattr(amdsmi, "amdsmi_get_power_cap_info", None) + if not callable(fn): + return None + + data = self._smi_try(fn, handle, default=None) + if not isinstance(data, dict): + return None + + return StaticLimit( + max_power=self._valueunit(data.get("power_cap"), "W"), + min_power=self._valueunit(data.get("min_power_cap"), "W"), + socket_power=self._valueunit(data.get("default_power_cap"), "W"), + slowdown_edge_temperature=self._valueunit(data.get("slowdown_temp"), "C"), + slowdown_hotspot_temperature=self._valueunit(data.get("slowdown_mem_temp"), "C"), + slowdown_vram_temperature=self._valueunit(data.get("slowdown_vram_temp"), "C"), + shutdown_edge_temperature=self._valueunit(data.get("shutdown_temp"), "C"), + shutdown_hotspot_temperature=self._valueunit(data.get("shutdown_mem_temp"), "C"), + shutdown_vram_temperature=self._valueunit(data.get("shutdown_vram_temp"), "C"), + ) + def _get_clock(self, handle: Any) -> Optional[StaticClockData]: - """Get clock info + """Get clock info using amdsmi_get_clock_info or fallback to amdsmi_get_clk_freq Args: handle (Any): GPU device handle @@ -1017,9 +1059,23 @@ def _get_clock(self, handle: Any) -> Optional[StaticClockData]: Optional[StaticClockData]: StaticClockData instance or None """ amdsmi = self._amdsmi_mod() - fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) clk_type = getattr(amdsmi, "AmdSmiClkType", None) - if not callable(fn) or clk_type is None or not hasattr(clk_type, "SYS"): + + if clk_type is None or not hasattr(clk_type, "SYS"): + return None + + # Try amdsmi_get_clock_info API first + clock_info_fn = getattr(amdsmi, "amdsmi_get_clock_info", None) + if callable(clock_info_fn): + data = self._smi_try(clock_info_fn, handle, clk_type.SYS, default=None) + if isinstance(data, dict): + freqs_raw = data.get("clk_freq") or data.get("frequency") + if isinstance(freqs_raw, list) and freqs_raw: + return self._process_clock_data(data, freqs_raw) + + # Fallback to amdsmi_get_clk_freq API + fn = getattr(amdsmi, "amdsmi_get_clk_freq", None) + if not callable(fn): return None data = self._smi_try(fn, handle, clk_type.SYS, default=None) @@ -1030,6 +1086,19 @@ def _get_clock(self, handle: Any) -> Optional[StaticClockData]: if not isinstance(freqs_raw, list) or not freqs_raw: return None + return self._process_clock_data(data, freqs_raw) + + def _process_clock_data(self, data: dict, freqs_raw: list) -> 
Optional[StaticClockData]: + """Process clock frequency data into StaticClockData model. + + Args: + data (dict): Raw clock data from amdsmi API + freqs_raw (list): List of frequency values + + Returns: + Optional[StaticClockData]: StaticClockData instance or None + """ + def _to_mhz(v: object) -> Optional[int]: x = self._to_number(v) if x is None: @@ -1079,11 +1148,10 @@ def _fmt(n: Optional[int]) -> Optional[str]: return None def get_bad_pages(self) -> Optional[list[BadPages]]: - """ - Collect bad page info per GPU and map to BadPages/PageData models. + """Collect bad page info per GPU and map to BadPages/PageData models. Returns: - List[BadPages] (one item per GPU) or None if no devices. + Optional[list[BadPages]]: List of bad pages (one per GPU) or None if no devices """ amdsmi = self._amdsmi_mod() devices = self._get_handles() @@ -1165,6 +1233,14 @@ def get_bad_pages(self) -> Optional[list[BadPages]]: return out def get_metric(self) -> Optional[list[AmdSmiMetric]]: + """Collect runtime metric data from all GPU devices. + + Collects usage, power, temperature, clocks, PCIe, fan, memory, ECC, + throttle, and voltage curve data from amdsmi_get_gpu_metrics_info. + + Returns: + Optional[list[AmdSmiMetric]]: List of metric data per GPU or None if no devices + """ amdsmi = self._amdsmi_mod() devices = self._get_handles() out: list[AmdSmiMetric] = [] @@ -1213,14 +1289,29 @@ def _as_list(v: object) -> list[object]: jpeg_activity=[ self._valueunit(v, "%") for v in _as_list(raw.get("jpeg_activity")) ], - gfx_busy_inst=None, - jpeg_busy=None, - vcn_busy=None, + gfx_busy_inst=None, # Note: not available? + jpeg_busy=None, # Note: not available? + vcn_busy=None, # Note: not available? ) # Power / Energy + # Get power from metrics_info + socket_power_val = self._valueunit(raw.get("average_socket_power"), "W") + + # Fall back to amdsmi_get_power_info if it is available and the metrics value is missing + if socket_power_val is None: + power_info_fn = getattr(amdsmi, "amdsmi_get_power_info", None) + if callable(power_info_fn): + power_data = self._smi_try(power_info_fn, h, default=None) + if isinstance(power_data, dict): + socket_power_val = self._valueunit( + power_data.get("current_socket_power") + or power_data.get("average_socket_power"), + "W", + ) + power = MetricPower( - socket_power=self._valueunit(raw.get("average_socket_power"), "W"), + socket_power=socket_power_val, gfx_voltage=self._valueunit(raw.get("voltage_gfx"), "mV"), soc_voltage=self._valueunit(raw.get("voltage_soc"), "mV"), mem_voltage=self._valueunit(raw.get("voltage_mem"), "mV"), @@ -1250,6 +1341,16 @@ def _as_list(v: object) -> list[object]: float(speed_raw) / 10.0 if isinstance(speed_raw, (int, float)) else None ) + # Get PCIe throughput + throughput_fn = getattr(amdsmi, "amdsmi_get_gpu_pci_throughput", None) + bandwidth_sent = None + bandwidth_received = None + if callable(throughput_fn): + throughput_data = self._smi_try(throughput_fn, h, default=None) + if isinstance(throughput_data, dict): + bandwidth_sent = _to_int_or_none(throughput_data.get("sent")) + bandwidth_received = _to_int_or_none(throughput_data.get("received")) + pcie = MetricPcie( width=_to_int_or_none(raw.get("pcie_link_width")), speed=self._valueunit(speed_gtps, "GT/s"), @@ -1259,39 +1360,88 @@ def _as_list(v: object) -> list[object]: replay_roll_over_count=_to_int_or_none(raw.get("pcie_replay_rover_count_acc")), nak_sent_count=_to_int_or_none(raw.get("pcie_nak_sent_count_acc")), nak_received_count=_to_int_or_none(raw.get("pcie_nak_rcvd_count_acc")), - current_bandwidth_sent=None, -
current_bandwidth_received=None, + current_bandwidth_sent=bandwidth_sent, + current_bandwidth_received=bandwidth_received, max_packet_size=None, lc_perf_other_end_recovery=None, ) - # Clocks - def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: + # Clocks from clock_info API + clock_info_fn = getattr(amdsmi, "amdsmi_get_clock_info", None) + clk_type = getattr(amdsmi, "AmdSmiClkType", None) + clock_ranges = {} + + if callable(clock_info_fn) and clk_type is not None: + for clk_name, clk_enum_name in [ + ("GFX", "GFX"), + ("SOC", "SYS"), + ("UCLK", "MEM"), + ("VCLK0", "VCLK0"), + ("DCLK0", "DCLK0"), + ("VCLK1", "VCLK1"), + ("DCLK1", "DCLK1"), + ]: + clk_enum = getattr(clk_type, clk_enum_name, None) + if clk_enum is not None: + clk_data = self._smi_try(clock_info_fn, h, clk_enum, default=None) + if isinstance(clk_data, dict): + clock_ranges[clk_name] = { + "min": clk_data.get("min_clk"), + "max": clk_data.get("max_clk"), + "sleep": clk_data.get("sleep_clk") + or clk_data.get("deep_sleep_clk"), + } + + def _clk( + cur_key: str, + clk_name: str = "", + raw: dict = raw, + clock_ranges: dict = clock_ranges, + ) -> MetricClockData: + ranges = clock_ranges.get(clk_name, {}) return MetricClockData( clk=self._valueunit(raw.get(cur_key), "MHz"), - min_clk=None, - max_clk=None, + min_clk=self._valueunit(ranges.get("min"), "MHz") if ranges else None, + max_clk=self._valueunit(ranges.get("max"), "MHz") if ranges else None, clk_locked=( raw.get("gfxclk_lock_status") if cur_key == "current_gfxclk" else None ), - deep_sleep=None, + deep_sleep=ranges.get("sleep") if ranges else None, ) clock: dict[str, MetricClockData] = { - "GFX": _clk("current_gfxclk"), - "SOC": _clk("current_socclk"), - "UCLK": _clk("current_uclk"), - "VCLK0": _clk("current_vclk0"), - "DCLK0": _clk("current_dclk0"), - "VCLK1": _clk("current_vclk1"), - "DCLK1": _clk("current_dclk1"), + "GFX": _clk("current_gfxclk", "GFX"), + "SOC": _clk("current_socclk", "SOC"), + "UCLK": _clk("current_uclk", "UCLK"), + "VCLK0": _clk("current_vclk0", "VCLK0"), + "DCLK0": _clk("current_dclk0", "DCLK0"), + "VCLK1": _clk("current_vclk1", "VCLK1"), + "DCLK1": _clk("current_dclk1", "DCLK1"), } # Fan + fan_rpm = self._valueunit(raw.get("current_fan_speed"), "RPM") + + # Get fan speed as percentage + fan_speed_fn = getattr(amdsmi, "amdsmi_get_gpu_fan_speed", None) + fan_speed_pct = None + if callable(fan_speed_fn): + fan_speed_data = self._smi_try(fan_speed_fn, h, 0, default=None) + if isinstance(fan_speed_data, (int, float)): + fan_speed_pct = self._valueunit(fan_speed_data, "%") + + # Get max fan speed + fan_max_fn = getattr(amdsmi, "amdsmi_get_gpu_fan_speed_max", None) + fan_max_rpm = None + if callable(fan_max_fn): + fan_max_data = self._smi_try(fan_max_fn, h, 0, default=None) + if isinstance(fan_max_data, (int, float)): + fan_max_rpm = self._valueunit(fan_max_data, "RPM") + fan = MetricFan( - rpm=self._valueunit(raw.get("current_fan_speed"), "RPM"), - speed=None, - max=None, + rpm=fan_rpm, + speed=fan_speed_pct, + max=fan_max_rpm, usage=None, ) @@ -1310,7 +1460,13 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: mem_enum = getattr(amdsmi, "AmdSmiMemoryType", None) vis_total_vu: Optional[ValueUnit] = None + vis_used_vu: Optional[ValueUnit] = None + vis_free_vu: Optional[ValueUnit] = None gtt_total_vu: Optional[ValueUnit] = None + gtt_used_vu: Optional[ValueUnit] = None + gtt_free_vu: Optional[ValueUnit] = None + + mem_usage_fn = getattr(amdsmi, "amdsmi_get_gpu_memory_usage", None) if mem_enum is not None: if total_vram_vu is None: @@ 
-1320,19 +1476,47 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: if vram_total_alt is not None: total_vram_vu = self._valueunit(vram_total_alt, "B") + # Visible VRAM total and usage vis_total = self._smi_try( amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.VIS_VRAM, default=None ) if vis_total is not None: vis_total_vu = self._valueunit(vis_total, "B") + # Get visible VRAM usage + if callable(mem_usage_fn): + vis_used = self._smi_try( + mem_usage_fn, h, mem_enum.VIS_VRAM, default=None + ) + if vis_used is not None: + vis_used_vu = self._valueunit(vis_used, "B") + # Calculate free + try: + free_val = max(0.0, float(vis_total) - float(vis_used)) + vis_free_vu = self._valueunit(free_val, "B") + except Exception: + pass + + # GTT total and usage gtt_total = self._smi_try( amdsmi.amdsmi_get_gpu_memory_total, h, mem_enum.GTT, default=None ) if gtt_total is not None: gtt_total_vu = self._valueunit(gtt_total, "B") - # Compute free if possible + # Get GTT usage + if callable(mem_usage_fn): + gtt_used = self._smi_try(mem_usage_fn, h, mem_enum.GTT, default=None) + if gtt_used is not None: + gtt_used_vu = self._valueunit(gtt_used, "B") + # Calculate free + try: + free_val = max(0.0, float(gtt_total) - float(gtt_used)) + gtt_free_vu = self._valueunit(free_val, "B") + except Exception: + pass + + # Compute free VRAM if possible if free_vram_vu is None and total_vram_vu is not None and used_vram_vu is not None: try: free_num = max(0.0, float(total_vram_vu.value) - float(used_vram_vu.value)) @@ -1346,11 +1530,11 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: used_vram=used_vram_vu, free_vram=free_vram_vu, total_visible_vram=vis_total_vu, - used_visible_vram=None, - free_visible_vram=None, + used_visible_vram=vis_used_vu, + free_visible_vram=vis_free_vu, total_gtt=gtt_total_vu, - used_gtt=None, - free_gtt=None, + used_gtt=gtt_used_vu, + free_gtt=gtt_free_vu, ) # ECC totals @@ -1409,6 +1593,14 @@ def _clk(cur_key: str, raw: dict = raw) -> MetricClockData: return out def _empty_metric(self, gpu_idx: int) -> AmdSmiMetric: + """Create an empty/default AmdSmiMetric instance when data collection fails. + + Args: + gpu_idx (int): GPU index + + Returns: + AmdSmiMetric: Metric instance with all fields set to None or empty values + """ return AmdSmiMetric( gpu=gpu_idx, usage=MetricUsage( @@ -1472,7 +1664,15 @@ def _empty_metric(self, gpu_idx: int) -> AmdSmiMetric: throttle=MetricThrottle(), ) - def _get_voltage_curve(self, h) -> MetricVoltageCurve: + def _get_voltage_curve(self, h: Any) -> MetricVoltageCurve: + """Get GPU voltage curve (frequency/voltage points) for overdrive settings. + + Args: + h (Any): GPU device handle + + Returns: + MetricVoltageCurve: Voltage curve data with up to 3 frequency/voltage points + """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_gpu_od_volt_info, h, default=None) if not isinstance(raw, dict): @@ -1522,6 +1722,11 @@ def _extract_point(p: object) -> tuple[Optional[object], Optional[object]]: ) def _empty_voltage_curve(self) -> MetricVoltageCurve: + """Create an empty MetricVoltageCurve with all points set to None. 
+ + Returns: + MetricVoltageCurve: Empty voltage curve instance + """ return MetricVoltageCurve( point_0_frequency=None, point_0_voltage=None, @@ -1531,16 +1736,30 @@ def _empty_voltage_curve(self) -> MetricVoltageCurve: point_2_voltage=None, ) - def _as_first_plane(self, obj) -> list: - """Take a scalar/list/2D-list and return the first plane as a flat list.""" + def _as_first_plane(self, obj: object) -> list: + """Take a scalar/list/2D-list and return the first plane as a flat list. + + Args: + obj (object): Scalar, list, or 2D-list to process + + Returns: + list: First plane as a flat list, or empty list if not a list + """ if isinstance(obj, list): if obj and isinstance(obj[0], list): # 2D return obj[0] return obj return [] - def _th_vu_list_pct(self, obj) -> Optional[MetricThrottleVu]: - """Return MetricThrottleVu with % ValueUnits for the first XCP plane.""" + def _th_vu_list_pct(self, obj: object) -> Optional[MetricThrottleVu]: + """Return MetricThrottleVu with percentage ValueUnits for the first XCP plane. + + Args: + obj (object): Object containing throttle data (scalar, list, or 2D-list) + + Returns: + Optional[MetricThrottleVu]: MetricThrottleVu with percentage values or None + """ arr = self._as_first_plane(obj) if not arr: return None @@ -1548,8 +1767,15 @@ def _th_vu_list_pct(self, obj) -> Optional[MetricThrottleVu]: xcp_0=[self._valueunit(v, "%") if v not in (None, "N/A") else "N/A" for v in arr] ) - def _th_vu_list_raw(self, obj) -> Optional[MetricThrottleVu]: - """Return MetricThrottleVu with raw ints/strings for the first XCP plane.""" + def _th_vu_list_raw(self, obj: object) -> Optional[MetricThrottleVu]: + """Return MetricThrottleVu with raw integers/strings for the first XCP plane. + + Args: + obj (object): Object containing throttle data (scalar, list, or 2D-list) + + Returns: + Optional[MetricThrottleVu]: MetricThrottleVu with raw values or None + """ arr = self._as_first_plane(obj) if not arr: return None @@ -1564,7 +1790,15 @@ def _th_vu_list_raw(self, obj) -> Optional[MetricThrottleVu]: ] ) - def get_throttle(self, h) -> MetricThrottle: + def get_throttle(self, h: Any) -> MetricThrottle: + """Get throttle/violation status data for a GPU device. + + Args: + h (Any): GPU device handle + + Returns: + MetricThrottle: Throttle metrics and violation status data + """ amdsmi = self._amdsmi_mod() raw = self._smi_try(amdsmi.amdsmi_get_violation_status, h, default=None) if not isinstance(raw, dict): @@ -1634,6 +1868,14 @@ def get_throttle(self, h) -> MetricThrottle: ) def _flatten_2d(self, v: object) -> list[object]: + """Flatten a 2D list into a 1D list, or normalize scalars/None to lists. + + Args: + v (object): Input value (scalar, list, or 2D-list) + + Returns: + list[object]: Flattened list of objects + """ if isinstance(v, list) and v and isinstance(v[0], list): out: list[object] = [] for row in v: @@ -1647,11 +1889,19 @@ def _flatten_2d(self, v: object) -> list[object]: def _coerce_throttle_value( self, v: object, unit: str = "" ) -> Optional[Union[MetricThrottleVu, ValueUnit]]: - """ - Convert ints/floats/strings/lists/2D-lists/dicts into: - - ValueUnit - - MetricThrottleVu(xcp_0=[...]) - - None for N/A/empty + """Convert various throttle data formats to ValueUnit or MetricThrottleVu. 
+ + Converts integers/floats/strings/lists/2D-lists/dicts into appropriate types: + - ValueUnit for scalar values + - MetricThrottleVu(xcp_0=[...]) for lists/arrays + - None for N/A or empty values + + Args: + v (object): Input throttle value in various formats + unit (str, optional): Unit of measurement. Defaults to empty string. + + Returns: + Optional[Union[MetricThrottleVu, ValueUnit]]: Coerced throttle value or None """ if v in (None, "", "N/A"): return None From 06aa5ebd50c600c3f60a4a02a0a621c04bf292c9 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 13 Nov 2025 09:45:19 -0600 Subject: [PATCH 4/9] fix + enhanced utest + deprecation warning fix --- .../plugins/inband/amdsmi/amdsmidata.py | 2 +- test/unit/plugin/test_amdsmi_collector.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index edab6044..7fda0378 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -64,7 +64,7 @@ class AmdSmiBaseModel(BaseModel): def __init__(self, **data): # Convert Union[int, str, float] -> ValueUnit - for field_name, field_type in self.model_fields.items(): + for field_name, field_type in self.__class__.model_fields.items(): annotation = field_type.annotation target_type, container = find_annotation_in_container(annotation, ValueUnit) if target_type is None: diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 2a34551b..66ab2b6f 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -151,6 +151,35 @@ def amdsmi_get_clk_freq(h, clk_type): m.amdsmi_get_clk_freq = amdsmi_get_clk_freq + m.amdsmi_get_gpu_bad_page_info = lambda h: {"page_list": []} + + m.amdsmi_get_gpu_metrics_info = lambda h: { + "temperature_hotspot": 55, + "temperature_mem": 50, + "average_socket_power": 150, + "current_gfxclk": 1500, + "current_uclk": 1000, + } + + m.amdsmi_get_gpu_od_volt_info = lambda h: { + "curve": {"vc_points": [{"frequency": 1500, "voltage": 850}]} + } + + m.amdsmi_get_gpu_vram_usage = lambda h: { + "vram_used": 1024 * 1024 * 1024, + "vram_total": 64 * 1024 * 1024 * 1024, + } + + m.amdsmi_get_gpu_memory_total = lambda h, mem_type: 64 * 1024 * 1024 * 1024 + + m.amdsmi_get_gpu_total_ecc_count = lambda h: {"correctable_count": 0, "uncorrectable_count": 0} + + m.amdsmi_get_violation_status = lambda h: { + "acc_counter": 0, + "acc_prochot_thrm": 0, + "acc_ppt_pwr": 0, + } + m.amdsmi_get_fw_info = lambda h: { "fw_list": [ {"fw_name": "SMU", "fw_version": "55.33"}, From f7259a854a3b6af88e272989d2731c1ac404e8c6 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 13 Nov 2025 12:16:33 -0600 Subject: [PATCH 5/9] removed deprecated calls --- .../plugins/inband/amdsmi/amdsmi_collector.py | 10 ++-------- .../plugins/inband/amdsmi/amdsmidata.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index bcdcfae3..244b994f 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -1010,13 +1010,13 @@ def _as_list_str(v: Any) -> list[str]: return out def _get_limit_info(self, handle: Any) -> Optional[StaticLimit]: - """Get power cap and temperature limit information. + """Get power cap limit information. 
Args: handle (Any): GPU device handle Returns: - Optional[StaticLimit]: StaticLimit instance or None + Optional[StaticLimit]: StaticLimit instance with power cap data or None """ amdsmi = self._amdsmi_mod() fn = getattr(amdsmi, "amdsmi_get_power_cap_info", None) @@ -1031,12 +1031,6 @@ def _get_limit_info(self, handle: Any) -> Optional[StaticLimit]: max_power=self._valueunit(data.get("power_cap"), "W"), min_power=self._valueunit(data.get("min_power_cap"), "W"), socket_power=self._valueunit(data.get("default_power_cap"), "W"), - slowdown_edge_temperature=self._valueunit(data.get("slowdown_temp"), "C"), - slowdown_hotspot_temperature=self._valueunit(data.get("slowdown_mem_temp"), "C"), - slowdown_vram_temperature=self._valueunit(data.get("slowdown_vram_temp"), "C"), - shutdown_edge_temperature=self._valueunit(data.get("shutdown_temp"), "C"), - shutdown_hotspot_temperature=self._valueunit(data.get("shutdown_mem_temp"), "C"), - shutdown_vram_temperature=self._valueunit(data.get("shutdown_vram_temp"), "C"), ) def _get_clock(self, handle: Any) -> Optional[StaticClockData]: diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 7fda0378..b3aca761 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -277,15 +277,15 @@ class StaticVbios(BaseModel): class StaticLimit(AmdSmiBaseModel): - max_power: Optional[ValueUnit] - min_power: Optional[ValueUnit] - socket_power: Optional[ValueUnit] - slowdown_edge_temperature: Optional[ValueUnit] - slowdown_hotspot_temperature: Optional[ValueUnit] - slowdown_vram_temperature: Optional[ValueUnit] - shutdown_edge_temperature: Optional[ValueUnit] - shutdown_hotspot_temperature: Optional[ValueUnit] - shutdown_vram_temperature: Optional[ValueUnit] + max_power: Optional[ValueUnit] = None + min_power: Optional[ValueUnit] = None + socket_power: Optional[ValueUnit] = None + slowdown_edge_temperature: Optional[ValueUnit] = None + slowdown_hotspot_temperature: Optional[ValueUnit] = None + slowdown_vram_temperature: Optional[ValueUnit] = None + shutdown_edge_temperature: Optional[ValueUnit] = None + shutdown_hotspot_temperature: Optional[ValueUnit] = None + shutdown_vram_temperature: Optional[ValueUnit] = None na_validator = field_validator( "max_power", "min_power", From a0b825ef7cf1e5bd27d59c75ed3cdc34de1e2db4 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 17 Nov 2025 12:37:22 -0600 Subject: [PATCH 6/9] updates on missing calls --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 237 +++++++++++++++++- 1 file changed, 236 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 5a836740..5e16f82e 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -30,7 +30,15 @@ from nodescraper.interfaces import DataAnalyzer from nodescraper.models import TaskResult -from .amdsmidata import AmdSmiDataModel, AmdSmiStatic, Fw, Partition, Processes +from .amdsmidata import ( + AmdSmiDataModel, + AmdSmiMetric, + AmdSmiStatic, + EccData, + Fw, + Partition, + Processes, +) from .analyzer_args import AmdSmiAnalyzerArgs @@ -122,6 +130,223 @@ def check_expected_driver_version( }, ) + def check_amdsmi_metric_pcie( + self, + amdsmi_metric_data: list[AmdSmiMetric], + l0_to_recovery_count_error_threshold: int, + l0_to_recovery_count_warning_threshold: int, + ): + """Check PCIe metrics for link 
errors + + Checks for PCIe link width, speed, replays, recoveries, and NAKs. + Expected width/speeds should come from SKU info. + + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + l0_to_recovery_count_error_threshold (int): Threshold for error events + l0_to_recovery_count_warning_threshold (int): Threshold for warning events + """ + for metric in amdsmi_metric_data: + pcie_data = metric.pcie + gpu = metric.gpu + + if pcie_data.width is not None and pcie_data.width != 16: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} PCIe width is not x16", + priority=EventPriority.ERROR, + data={"gpu": gpu, "pcie_width": pcie_data.width, "expected": 16}, + console_log=True, + ) + + if pcie_data.speed is not None and pcie_data.speed.value is not None: + try: + speed_val = float(pcie_data.speed.value) + if speed_val != 32.0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)", + priority=EventPriority.ERROR, + data={"gpu": gpu, "pcie_speed": speed_val, "expected": 32.0}, + console_log=True, + ) + except (ValueError, TypeError): + pass + + if pcie_data.replay_count is not None and pcie_data.replay_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}", + priority=EventPriority.WARNING, + data={"gpu": gpu, "replay_count": pcie_data.replay_count}, + console_log=True, + ) + + if ( + pcie_data.replay_roll_over_count is not None + and pcie_data.replay_roll_over_count > 0 + ): + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}", + priority=EventPriority.WARNING, + data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count}, + console_log=True, + ) + + if pcie_data.l0_to_recovery_count is not None: + if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries", + priority=EventPriority.ERROR, + data={ + "gpu": gpu, + "l0_to_recovery_count": pcie_data.l0_to_recovery_count, + "error_threshold": l0_to_recovery_count_error_threshold, + }, + console_log=True, + ) + elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "l0_to_recovery_count": pcie_data.l0_to_recovery_count, + "warning_threshold": l0_to_recovery_count_warning_threshold, + }, + console_log=True, + ) + + if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs", + priority=EventPriority.WARNING, + data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count}, + console_log=True, + ) + + if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0: + self._log_event( + category=EventCategory.IO, + description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs", + priority=EventPriority.WARNING, + data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count}, + console_log=True, + ) + + def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]): + """Check ECC totals for all GPUs + + Raises errors for 
uncorrectable errors, warnings for correctable and deferred. + + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + """ + for metric in amdsmi_metric_data: + ecc_totals = metric.ecc + gpu = metric.gpu + + ecc_checks: list[tuple[EventPriority, Optional[int], str]] = [ + ( + EventPriority.WARNING, + ecc_totals.total_correctable_count, + "Total correctable ECC errors", + ), + ( + EventPriority.ERROR, + ecc_totals.total_uncorrectable_count, + "Total uncorrectable ECC errors", + ), + ( + EventPriority.WARNING, + ecc_totals.total_deferred_count, + "Total deferred ECC errors", + ), + ( + EventPriority.WARNING, + ecc_totals.cache_correctable_count, + "Cache correctable ECC errors", + ), + ( + EventPriority.ERROR, + ecc_totals.cache_uncorrectable_count, + "Cache uncorrectable ECC errors", + ), + ] + + for priority, count, desc in ecc_checks: + if count is not None and count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has {desc}: {count}", + priority=priority, + data={"gpu": gpu, "error_count": count, "error_type": desc}, + console_log=True, + ) + + def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]): + """Check ECC counts in all blocks for all GPUs + + Raises errors for uncorrectable errors, warnings for correctable and deferred. + + Args: + amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model + """ + for metric in amdsmi_metric_data: + gpu = metric.gpu + ecc_blocks = metric.ecc_blocks + + # Skip if ecc_blocks is a string (e.g., "N/A") or empty + if isinstance(ecc_blocks, str) or not ecc_blocks: + continue + + for block_name, ecc_data in ecc_blocks.items(): + if not isinstance(ecc_data, EccData): + continue + + if ecc_data.correctable_count is not None and ecc_data.correctable_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has correctable ECC errors in block {block_name}", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "block": block_name, + "correctable_count": ecc_data.correctable_count, + }, + console_log=True, + ) + + if ecc_data.uncorrectable_count is not None and ecc_data.uncorrectable_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}", + priority=EventPriority.ERROR, + data={ + "gpu": gpu, + "block": block_name, + "uncorrectable_count": ecc_data.uncorrectable_count, + }, + console_log=True, + ) + + if ecc_data.deferred_count is not None and ecc_data.deferred_count > 0: + self._log_event( + category=EventCategory.RAS, + description=f"GPU: {gpu} has deferred ECC errors in block {block_name}", + priority=EventPriority.WARNING, + data={ + "gpu": gpu, + "block": block_name, + "deferred_count": ecc_data.deferred_count, + }, + console_log=True, + ) + def expected_gpu_processes( self, processes_data: Optional[list[Processes]], max_num_processes: int ): @@ -427,6 +652,16 @@ def analyze_data( if args is None: args = AmdSmiAnalyzerArgs() + if data.metric is not None and len(data.metric) > 0: + if args.l0_to_recovery_count_error_threshold is not None: + self.check_amdsmi_metric_pcie( + data.metric, + args.l0_to_recovery_count_error_threshold, + args.l0_to_recovery_count_warning_threshold or 1, + ) + self.check_amdsmi_metric_ecc_totals(data.metric) + self.check_amdsmi_metric_ecc(data.metric) + if args.expected_gpu_processes: self.expected_gpu_processes(data.process, args.expected_gpu_processes) From 80b283b52fff7aeb909a8cf430069dc985df9bce Mon Sep 17 00:00:00 2001 From: 
Alexandra Bara Date: Mon, 24 Nov 2025 14:27:08 -0600 Subject: [PATCH 7/9] fix for multi json return --- .../plugins/inband/amdsmi/amdsmi_collector.py | 53 +++++++++++++++---- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 15894a4a..82b02071 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -183,19 +183,50 @@ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: cmd_ret = self._run_amd_smi(cmd) if cmd_ret: try: + # Try to parse as single JSON first return json.loads(cmd_ret) except json.JSONDecodeError as e: - self._log_event( - category=EventCategory.APPLICATION, - description=f"Error parsing command: `{cmd}` json data", - data={ - "cmd": cmd, - "exception": get_exception_traceback(e), - }, - priority=EventPriority.ERROR, - console_log=True, - ) - return None + # try to extract and parse multiple JSON objects + try: + json_objects = [] + decoder = json.JSONDecoder() + idx = 0 + cmd_ret_stripped = cmd_ret.strip() + + while idx < len(cmd_ret_stripped): + while idx < len(cmd_ret_stripped) and cmd_ret_stripped[idx].isspace(): + idx += 1 + + if idx >= len(cmd_ret_stripped): + break + + if cmd_ret_stripped[idx] not in ["{", "["]: + break + + try: + obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx) + json_objects.append(obj) + idx += end_idx + except json.JSONDecodeError: + break + + if json_objects: + return json_objects if len(json_objects) > 1 else json_objects[0] + else: + raise + + except Exception: + self._log_event( + category=EventCategory.APPLICATION, + description=f"Error parsing command: `{cmd}` json data", + data={ + "cmd": cmd, + "exception": get_exception_traceback(e), + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None return None def _to_number(self, v: object) -> Optional[Union[int, float]]: From aa838b582baf4ef7c500745e8e5cfa66bda54361 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 24 Nov 2025 14:41:52 -0600 Subject: [PATCH 8/9] fix for partition --- .../plugins/inband/amdsmi/amdsmi_collector.py | 10 ++- test/unit/plugin/test_amdsmi_collector.py | 81 +++++++++++++++++-- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 82b02071..c8a0eb60 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -206,7 +206,7 @@ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]: try: obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx) json_objects.append(obj) - idx += end_idx + idx = end_idx except json.JSONDecodeError: break @@ -529,7 +529,15 @@ def get_partition(self) -> Optional[Partition]: memparts: list[PartitionMemory] = [] computeparts: list[PartitionCompute] = [] + # Flatten multi-JSON results (partition command returns multiple JSON arrays) + flattened_data = [] for item in partition_data: + if isinstance(item, list): + flattened_data.extend(item) + elif isinstance(item, dict): + flattened_data.append(item) + + for item in flattened_data: if not isinstance(item, dict): continue diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 6783c407..4eeb76d3 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ 
b/test/unit/plugin/test_amdsmi_collector.py @@ -87,11 +87,21 @@ def mock_run_sut_cmd(cmd: str) -> MagicMock: ) if "partition --json" in cmd: - return make_cmd_result( + json_output = ( make_json_response( [{"gpu": 0, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] ) + + "\n" + + make_json_response( + [{"gpu": 1, "memory_partition": "NPS1", "compute_partition": "CPX_DISABLED"}] + ) + + "\n" + + make_json_response( + [{"gpu_id": "N/A", "profile_index": "N/A", "partition_id": "0"}] + ) + + "\n\nLegend:\n * = Current mode" ) + return make_cmd_result(json_output) if "firmware --json" in cmd: return make_cmd_result( @@ -241,9 +251,8 @@ def test_collect_data(collector): assert data.process is not None and len(data.process) == 1 assert len(data.process[0].process_list) == 2 - # partition assert data.partition is not None - assert len(data.partition.memory_partition) == 1 + assert len(data.partition.memory_partition) >= 1 assert data.partition.memory_partition[0].partition_type == "NPS1" # firmware @@ -286,12 +295,12 @@ def test_get_process(collector): def test_get_partition(collector): - """Test partition parsing""" + """Test partition parsing with multi-JSON output""" p = collector.get_partition() assert p is not None - assert len(p.memory_partition) == 1 and len(p.compute_partition) == 1 + # The mock now returns realistic multi-JSON output + assert len(p.memory_partition) >= 1 assert p.memory_partition[0].partition_type == "NPS1" - assert p.compute_partition[0].partition_type == "CPX_DISABLED" def test_get_firmware(collector): @@ -369,7 +378,7 @@ def mock_bad_json(cmd: str) -> MagicMock: result, data = c.collect_data() assert data is not None assert data.version is None - assert len(result.events) > 0 # Should have error events + assert len(result.events) > 0 def test_command_error(conn_mock, system_info, monkeypatch): @@ -392,3 +401,61 @@ def mock_cmd_error(cmd: str) -> MagicMock: assert data.version is None assert data.gpu_list == [] assert len(result.events) > 0 # Should have error events + + +def test_multi_json_parsing(conn_mock, system_info, monkeypatch): + """Test parsing of multiple JSON objects with trailing text""" + + def mock_multi_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "test --json" in cmd: + multi_json = ( + '[{"data": 1}]\n' + '[{"data": 2}]\n' + '[{"data": 3}]\n' + "\n\nLegend:\n * = Current mode\n" + ) + return make_cmd_result(multi_json) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_multi_json) + + result = c._run_amd_smi_dict("test") + + assert result is not None + assert isinstance(result, list) + assert len(result) == 3 + assert result[0] == [{"data": 1}] + assert result[1] == [{"data": 2}] + assert result[2] == [{"data": 3}] + + +def test_single_json_parsing(conn_mock, system_info, monkeypatch): + """Test that single JSON parsing still works""" + + def mock_single_json(cmd: str) -> MagicMock: + if "which amd-smi" in cmd: + return make_cmd_result("/usr/bin/amd-smi") + if "version --json" in cmd: + return make_cmd_result(make_json_response([{"tool": "amdsmi", "version": "1.0"}])) + return make_cmd_result("") + + c = AmdSmiCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + monkeypatch.setattr(c, "_run_sut_cmd", mock_single_json) + + result = 
c._run_amd_smi_dict("version") + + assert result is not None + assert isinstance(result, list) + assert len(result) == 1 + assert result[0]["tool"] == "amdsmi" From d4d85193e304ba6d4c30dcd248a2459045aaaed7 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 4 Dec 2025 09:20:38 -0600 Subject: [PATCH 9/9] cleanup --- test/unit/plugin/test_amdsmi_collector.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/unit/plugin/test_amdsmi_collector.py b/test/unit/plugin/test_amdsmi_collector.py index 4eeb76d3..c8ba08c6 100644 --- a/test/unit/plugin/test_amdsmi_collector.py +++ b/test/unit/plugin/test_amdsmi_collector.py @@ -298,7 +298,6 @@ def test_get_partition(collector): """Test partition parsing with multi-JSON output""" p = collector.get_partition() assert p is not None - # The mock now returns realistic multi-JSON output assert len(p.memory_partition) >= 1 assert p.memory_partition[0].partition_type == "NPS1" @@ -400,7 +399,7 @@ def mock_cmd_error(cmd: str) -> MagicMock: assert data is not None assert data.version is None assert data.gpu_list == [] - assert len(result.events) > 0 # Should have error events + assert len(result.events) > 0 def test_multi_json_parsing(conn_mock, system_info, monkeypatch):