NVIDIA · rwgk · Feb 3, 2026 · Feb 3, 2026 · Feb 3, 2026
diff --git a/cuda_bindings/tests/nvml/test_device.py b/cuda_bindings/tests/nvml/test_device.py
@@ -38,7 +38,8 @@ def test_clk_mon_status_t():
 
 def test_current_clock_freqs(all_devices):
     for device in all_devices:
-        clk_freqs = nvml.device_get_current_clock_freqs(device)
+        with unsupported_before(device, None):
+            clk_freqs = nvml.device_get_current_clock_freqs(device)
         assert isinstance(clk_freqs, str)
 
 
@@ -87,7 +88,8 @@ def test_device_get_pdi(all_devices):
 
 def test_device_get_performance_modes(all_devices):
     for device in all_devices:
-        modes = nvml.device_get_performance_modes(device)
+        with unsupported_before(device, None):
+            modes = nvml.device_get_performance_modes(device)
         assert isinstance(modes, str)
 
 
@@ -133,7 +135,10 @@ def test_nvlink_low_power_threshold(all_devices):
     for device in all_devices:
         # Docs say supported on HOPPER or newer
         with unsupported_before(device, None):
-            nvml.device_set_nvlink_device_low_power_threshold(device, 0)
+            try:
+                nvml.device_set_nvlink_device_low_power_threshold(device, 0)
+            except nvml.NoPermissionError:
+                pytest.skip("No permission to set NVLink low power threshold")
 
 
 def test_get_power_management_limit(all_devices):

diff --git a/cuda_bindings/tests/nvml/test_gpu.py b/cuda_bindings/tests/nvml/test_gpu.py
@@ -20,7 +20,8 @@ def test_gpu_get_module_id(nvml_init):
         if util.is_vgpu(device):
             continue
 
-        module_id = nvml.device_get_module_id(device)
+        with unsupported_before(device, None):
+            module_id = nvml.device_get_module_id(device)
         assert isinstance(module_id, int)
 
 

diff --git a/cuda_bindings/tests/nvml/test_pynvml.py b/cuda_bindings/tests/nvml/test_pynvml.py
@@ -148,12 +148,12 @@ def test_device_get_power_usage(ngpus, handles):
 
 def test_device_get_total_energy_consumption(ngpus, handles):
     for i in range(ngpus):
-        with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
+        with unsupported_before(handles[i], None):
             energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
 
         for j in range(10):  # idle for 150 ms
             time.sleep(0.015)  # and check for increase every 15 ms
-            with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
+            with unsupported_before(handles[i], None):
                 energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
             assert energy_mjoules2 >= energy_mjoules1
             if energy_mjoules2 > energy_mjoules1:
@@ -169,7 +169,8 @@ def test_device_get_total_energy_consumption(ngpus, handles):
 
 def test_device_get_memory_info(ngpus, handles):
     for i in range(ngpus):
-        meminfo = nvml.device_get_memory_info_v2(handles[i])
+        with unsupported_before(handles[i], None):
+            meminfo = nvml.device_get_memory_info_v2(handles[i])
         assert (meminfo.used <= meminfo.total) and (meminfo.free <= meminfo.total)
 
 
@@ -243,10 +244,11 @@ def test_device_get_utilization_rates(ngpus, handles):
 
 def test_device_get_pcie_throughput(ngpus, handles):
     for i in range(ngpus):
-        with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
+        with unsupported_before(handles[i], None):
             tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
         assert tx_bytes_tp >= 0
-        rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
+        with unsupported_before(handles[i], None):
+            rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
         assert rx_bytes_tp >= 0
 
         # with pytest.raises(nvml.InvalidArgumentError):

diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py
@@ -75,7 +75,7 @@ def test_device_architecture():
 
 def test_device_bar1_memory():
     for device in system.Device.get_all_devices():
-        with unsupported_before(device, DeviceArch.KEPLER):
+        with unsupported_before(device, None):
             bar1_memory_info = device.bar1_memory_info
         free, total, used = (
             bar1_memory_info.free,
@@ -136,7 +136,8 @@ def test_device_cuda_compute_capability():
 
 def test_device_memory():
     for device in system.Device.get_all_devices():
-        memory_info = device.memory_info
+        with unsupported_before(device, None):
+            memory_info = device.memory_info
         free, total, used, reserved = memory_info.free, memory_info.total, memory_info.used, memory_info.reserved
 
         assert isinstance(memory_info, system.MemoryInfo)
@@ -212,7 +213,8 @@ def test_device_pci_info():
         assert isinstance(pci_info.get_current_pcie_link_width(), int)
         assert 0 <= pci_info.get_current_pcie_link_width() <= 0xFF
 
-        assert isinstance(pci_info.get_pcie_throughput(system.PcieUtilCounter.PCIE_UTIL_TX_BYTES), int)
+        with unsupported_before(device, None):
+            assert isinstance(pci_info.get_pcie_throughput(system.PcieUtilCounter.PCIE_UTIL_TX_BYTES), int)
 
         assert isinstance(pci_info.get_pcie_replay_counter(), int)
 
@@ -421,7 +423,8 @@ def test_index():
 
 def test_module_id():
     for device in system.Device.get_all_devices():
-        module_id = device.module_id
+        with unsupported_before(device, None):
+            module_id = device.module_id
         assert isinstance(module_id, int)
         assert module_id >= 0
 
@@ -509,7 +512,8 @@ def test_get_inforom_version():
         with unsupported_before(device, "HAS_INFOROM"):
             inforom = device.inforom
 
-        inforom_image_version = inforom.image_version
+        with unsupported_before(device, "HAS_INFOROM"):
+            inforom_image_version = inforom.image_version
         assert isinstance(inforom_image_version, str)
         assert len(inforom_image_version) > 0
 
@@ -558,7 +562,7 @@ def test_clock():
             # These are ordered from oldest API to newest API so we test as much
             # as we can on each hardware architecture.
 
-            with unsupported_before(device, "FERMI"):
+            with unsupported_before(device, None):
                 pstate = device.performance_state
 
             min_, max_ = clock.get_min_max_clock_of_pstate_mhz(pstate)
@@ -600,10 +604,12 @@ def test_clock():
 
 def test_clock_event_reasons():
     for device in system.Device.get_all_devices():
-        reasons = device.get_current_clock_event_reasons()
+        with unsupported_before(device, None):
+            reasons = device.get_current_clock_event_reasons()
         assert all(isinstance(reason, system.ClocksEventReasons) for reason in reasons)
 
-        reasons = device.get_supported_clock_event_reasons()
+        with unsupported_before(device, None):
+            reasons = device.get_supported_clock_event_reasons()
         assert all(isinstance(reason, system.ClocksEventReasons) for reason in reasons)
 
 
@@ -706,7 +712,8 @@ def test_temperature():
 
 def test_pstates():
     for device in system.Device.get_all_devices():
-        pstate = device.performance_state
+        with unsupported_before(device, None):
+            pstate = device.performance_state
         assert isinstance(pstate, system.Pstates)
 
         pstates = device.get_supported_pstates()

diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
@@ -150,6 +150,7 @@ def test_launch_invalid_values(init_cuda):
         launch(StreamWrapper(stream), config, ker)
 
     launch(stream, config, ker)
+    stream.sync()  # TODO(#1539)
 
 
 # Parametrize: (python_type, cpp_type, init_value)