
Commit 6cd2cae

Add unittests for new added features
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Parent: 5b364aa

8 files changed: 1,092 additions & 9 deletions

cpp/tensorrt_llm/thop/fusedAddRMSNormQuant.cpp
Lines changed: 6 additions & 6 deletions
@@ -52,9 +52,9 @@ namespace torch_ext
 //
 // NOTE: This kernel requires SM90 (Hopper) or SM100 (Blackwell) GPU architecture.
 // NOTE: Hidden dimension N must be >= 2048 and <= 16384.
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> fused_add_rms_norm_quant(at::Tensor const& input,
-    at::Tensor const& residual, at::Tensor const& gamma, std::optional<at::Tensor> const& sf_scale, bool use_rms_norm,
-    double eps, bool output_hp_norm)
+std::tuple<at::Tensor, at::Tensor, at::Tensor, std::optional<at::Tensor>> fused_add_rms_norm_quant(
+    at::Tensor const& input, at::Tensor const& residual, at::Tensor const& gamma,
+    std::optional<at::Tensor> const& sf_scale, bool use_rms_norm, double eps, bool output_hp_norm)
 {
     CHECK_TH_CUDA(input);
     CHECK_CONTIGUOUS(input);
@@ -118,7 +118,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> fused_add_rms_norm_quant
     int64_t const sfSizePadded = tensorrt_llm::computeSwizzledLayoutSFSize(m_padded, n / sfVecSize);
     at::Tensor sf_out_padded = at::detail::empty_cuda({sfSizePadded}, SF_DTYPE, input.device(), std::nullopt);
     at::Tensor sf_out = (m_padded == m) ? sf_out_padded : sf_out_padded.narrow(0, 0, sfSize);
-    at::Tensor high_precision_normed_output;
+    std::optional<at::Tensor> high_precision_normed_output = std::nullopt;
     if (output_hp_norm)
     {
         at::Tensor hp_normed_output_padded
@@ -163,7 +163,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> fused_add_rms_norm_quant
     param.gamma = reinterpret_cast<T const*>(gamma.data_ptr()); \
     param.beta = nullptr; \
     param.high_precision_normed_output \
-        = output_hp_norm ? reinterpret_cast<T*>(high_precision_normed_output.data_ptr()) : nullptr; \
+        = output_hp_norm ? reinterpret_cast<T*>(high_precision_normed_output.value().data_ptr()) : nullptr; \
     param.m = static_cast<int>(m); \
     param.n = static_cast<int>(n); \
     param.layernorm_eps = static_cast<float>(eps); \
@@ -204,7 +204,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m)
     m.def(
         "fused_add_rms_norm_quant(Tensor input, Tensor residual, Tensor gamma, "
         "Tensor? sf_scale, bool use_rms_norm=True, float eps=1e-6, bool output_hp_norm=False) -> (Tensor, Tensor, "
-        "Tensor, Tensor)");
+        "Tensor, Tensor?)");
 }
 
 TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
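
With this change the op's fourth output is optional (Tensor? in the schema), so Python callers must handle a possible None. A minimal caller sketch, assuming a TensorRT-LLM build that registers this op and an SM90/SM100 GPU; shapes, dtypes, and the sf_scale value below are illustrative only:

    import torch

    m, n = 128, 4096  # n must satisfy 2048 <= n <= 16384 per the NOTE above
    x = torch.randn(m, n, dtype=torch.bfloat16, device="cuda")
    residual = torch.randn_like(x)
    gamma = torch.randn(n, dtype=torch.bfloat16, device="cuda")
    sf_scale = torch.ones(1, dtype=torch.float32, device="cuda")  # illustrative

    # output_hp_norm=True: the fourth result is a real tensor
    fp4, new_residual, sf, hp = torch.ops.trtllm.fused_add_rms_norm_quant(
        x, residual, gamma, sf_scale, use_rms_norm=True, eps=1e-6, output_hp_norm=True
    )
    assert hp is not None

    # output_hp_norm=False (the default): the fourth result is None
    *_, hp = torch.ops.trtllm.fused_add_rms_norm_quant(
        x, residual, gamma, sf_scale, use_rms_norm=True, eps=1e-6, output_hp_norm=False
    )
    assert hp is None
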

tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py
Lines changed: 7 additions & 2 deletions
@@ -1003,7 +1003,9 @@ def _(
     sf_scale: Optional[torch.Tensor],
     use_rms_norm: bool = True,
     eps: float = 1e-5,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    output_hp_norm: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor,
+           Optional[torch.Tensor]]:
     m, n = input.shape
     # normed_output_fp4: [M, N/8] as int32 (8 FP4 values packed per int32)
     normed_output_fp4 = input.new_empty((m, n // 8), dtype=torch.int32)
@@ -1013,7 +1015,10 @@ def _(
     sf_vec_size = 16
     sf_size = ((m + 127) // 128) * 128 * ((n // sf_vec_size + 3) // 4) * 4
     sf_out = input.new_empty((sf_size, ), dtype=torch.uint8)
-    return normed_output_fp4, output, sf_out
+    # high_precision_normed_output: [M, N] optional, only when output_hp_norm=True
+    hp_output = input.new_empty(
+        (m, n), dtype=input.dtype) if output_hp_norm else None
+    return normed_output_fp4, output, sf_out, hp_output
 
 
 @torch.library.register_fake("trtllm::fused_relu2_quantize")
 def _(
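
The fake op mirrors the kernel's swizzled scale-factor layout: rows are padded to a multiple of 128 and the n / sf_vec_size scale columns to a multiple of 4. A quick worked check of that arithmetic (the m and n values here are arbitrary examples):

    m, n, sf_vec_size = 300, 4096, 16
    rows_padded = ((m + 127) // 128) * 128           # 300 -> 384
    cols_padded = ((n // sf_vec_size + 3) // 4) * 4  # 256 -> 256 (already aligned)
    assert rows_padded * cols_padded == 98304        # matches sf_size above
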

tensorrt_llm/tools/layer_wise_benchmarks/runner.py
Lines changed: 8 additions & 1 deletion
@@ -451,7 +451,14 @@ def forward(position_ids, hidden_states, attn_metadata, residual, **kwargs):
                 position_ids, hidden_states, attn_metadata, residual, **kwargs
             )
         else:
-            hidden_states = layer(position_ids, hidden_states, attn_metadata, **kwargs)
+            result = layer(
+                position_ids, hidden_states, attn_metadata, residual=residual, **kwargs
+            )
+            # Some layers (e.g., NemotronH) return (hidden_states, residual) tuple
+            if isinstance(result, tuple):
+                hidden_states, residual = result
+            else:
+                hidden_states = result
         return hidden_states, residual
 
     model.forward = forward
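
The tuple handling above can also be read as a small normalization helper; a sketch of the same pattern, with a hypothetical helper name:

    def unpack_layer_output(result, residual):
        # Layers such as NemotronH return (hidden_states, residual);
        # plain layers return hidden_states alone and residual passes through.
        if isinstance(result, tuple):
            return result
        return result, residual
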
Lines changed: 247 additions & 0 deletions
@@ -0,0 +1,247 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch
import torch.nn.functional as F

from tensorrt_llm._torch.modules.mamba import PAD_SLOT_ID


def mamba_conv1d_ref(x, past_conv_state, conv_weight, conv_bias, apply_silu):
    """
    Reference implementation for causal conv1d.

    Arguments:
        x: [batch_size, dim, seq_len]
        past_conv_state: [batch_size, dim, dconv-1]
        conv_weight: [dim, 1, dconv]
        conv_bias: [dim]
    Output:
        y: [batch_size, dim, seq_len]
        present_conv_state: [batch_size, dim, dconv-1]
    """
    assert x.dim() == 3
    assert past_conv_state.dim() == 3
    assert conv_weight.dim() == 3
    assert conv_bias.dim() == 1
    batch_size, dim, seq_len = x.shape
    assert past_conv_state.shape[0] == batch_size
    assert past_conv_state.shape[1] == dim
    dconv = past_conv_state.shape[2] + 1
    assert conv_weight.shape[0] == dim
    assert conv_weight.shape[1] == 1
    assert conv_weight.shape[2] == dconv

    padded_x = torch.cat([past_conv_state, x], dim=2)
    present_conv_state = padded_x[:, :, -(dconv - 1):]
    x_conv = F.conv1d(padded_x, conv_weight, bias=conv_bias, groups=dim)

    y = F.silu(x_conv) if apply_silu else x_conv
    return y, present_conv_state


def trtllm_causal_conv1d_available():
    """Check if trtllm.causal_conv1d_fwd is available."""
    return hasattr(torch.ops, "trtllm") and hasattr(torch.ops.trtllm, "causal_conv1d_fwd")


skip_unsupported = pytest.mark.skipif(
    not torch.cuda.is_available() or not trtllm_causal_conv1d_available(),
    reason="Requires CUDA and trtllm.causal_conv1d_fwd op",
)


@skip_unsupported
class TestCausalConv1d:
    """Tests for the causal_conv1d CUDA kernel."""

    @pytest.mark.parametrize("dtype", ["float16", "bfloat16", "float32"])
    @pytest.mark.parametrize("apply_silu", [True, False])
    @pytest.mark.parametrize("dim", [256, 512, 1024, 2048])
    def test_basic_correctness(self, dtype, apply_silu, dim):
        """Test basic correctness against reference implementation."""
        torch.manual_seed(42)
        device = "cuda"
        torch_dtype = getattr(torch, dtype)

        batch_size = 4
        seq_len = 32
        dconv = 4
        std_dev = 0.5
        x = torch.randn(batch_size, dim, seq_len, dtype=torch_dtype, device=device)
        x = x * std_dev
        conv_state = torch.zeros(batch_size, dim, dconv - 1, dtype=torch_dtype, device=device)
        conv_weight = torch.randn(dim, 1, dconv, dtype=torch_dtype, device=device)
        conv_bias = torch.randn(dim, dtype=torch_dtype, device=device)
        x_kernel = x.clone()
        conv_state_kernel = conv_state.clone()

        conv_weight_input = conv_weight.squeeze(1).contiguous()
        torch.ops.trtllm.causal_conv1d_fwd(
            x_kernel,
            conv_weight_input,
            conv_bias,
            conv_state_kernel,
            None,  # query_start_loc
            None,  # cache_indices
            None,  # has_initial_state
            apply_silu,
            PAD_SLOT_ID,
        )
        out_ref, conv_state_ref = mamba_conv1d_ref(
            x, conv_state, conv_weight, conv_bias, apply_silu
        )

        torch.testing.assert_close(x_kernel, out_ref, rtol=1e-2, atol=1e-2)
        torch.testing.assert_close(conv_state_kernel, conv_state_ref, rtol=1e-2, atol=1e-2)

    @pytest.mark.parametrize("batch_size", [1, 2, 4, 8, 16])
    def test_various_batch_sizes(self, batch_size):
        """Test with various batch sizes."""
        torch.manual_seed(42)
        device = "cuda"
        dtype = torch.bfloat16
        dim = 1024
        seq_len = 64
        dconv = 4
        apply_silu = True

        x = torch.randn(batch_size, dim, seq_len, dtype=dtype, device=device) * 0.5
        conv_state = torch.zeros(batch_size, dim, dconv - 1, dtype=dtype, device=device)
        conv_weight = torch.randn(dim, 1, dconv, dtype=dtype, device=device)
        conv_bias = torch.randn(dim, dtype=dtype, device=device)
        x_kernel = x.clone()
        conv_state_kernel = conv_state.clone()

        conv_weight_input = conv_weight.squeeze(1).contiguous()
        torch.ops.trtllm.causal_conv1d_fwd(
            x_kernel,
            conv_weight_input,
            conv_bias,
            conv_state_kernel,
            None,
            None,
            None,
            apply_silu,
            PAD_SLOT_ID,
        )
        out_ref, conv_state_ref = mamba_conv1d_ref(
            x, conv_state, conv_weight, conv_bias, apply_silu
        )

        torch.testing.assert_close(x_kernel, out_ref, rtol=1e-2, atol=1e-1)
        torch.testing.assert_close(conv_state_kernel, conv_state_ref, rtol=1e-2, atol=1e-1)

    @pytest.mark.parametrize("dconv", [2, 3, 4])
    def test_various_kernel_widths(self, dconv):
        """Test with different convolution kernel widths."""
        torch.manual_seed(42)
        device = "cuda"
        dtype = torch.bfloat16

        batch_size = 4
        dim = 1024
        seq_len = 64
        apply_silu = True
        x = torch.randn(batch_size, dim, seq_len, dtype=dtype, device=device) * 0.5
        conv_state = torch.zeros(batch_size, dim, dconv - 1, dtype=dtype, device=device)
        conv_weight = torch.randn(dim, 1, dconv, dtype=dtype, device=device)
        conv_bias = torch.randn(dim, dtype=dtype, device=device)
        x_kernel = x.clone()
        conv_state_kernel = conv_state.clone()

        conv_weight_input = conv_weight.squeeze(1).contiguous()
        torch.ops.trtllm.causal_conv1d_fwd(
            x_kernel,
            conv_weight_input,
            conv_bias,
            conv_state_kernel,
            None,
            None,
            None,
            apply_silu,
            PAD_SLOT_ID,
        )
        out_ref, conv_state_ref = mamba_conv1d_ref(
            x, conv_state, conv_weight, conv_bias, apply_silu
        )

        torch.testing.assert_close(x_kernel, out_ref, rtol=1e-2, atol=1e-1)
        torch.testing.assert_close(conv_state_kernel, conv_state_ref, rtol=1e-2, atol=1e-1)

    def test_with_initial_state(self):
        """Test with non-zero initial conv state."""
        torch.manual_seed(42)
        device = "cuda"
        dtype = torch.bfloat16

        batch_size = 4
        dim = 1024
        seq_len = 32
        dconv = 4
        apply_silu = True

        x = torch.randn(batch_size, dim, seq_len, dtype=dtype, device=device) * 0.5
        # Non-zero initial state
        conv_state = torch.randn(batch_size, dim, dconv - 1, dtype=dtype, device=device)
        conv_state = conv_state * 0.5
        conv_weight = torch.randn(dim, 1, dconv, dtype=dtype, device=device)
        conv_bias = torch.randn(dim, dtype=dtype, device=device)
        conv_state_kernel = conv_state.clone()
        # Need to tell the kernel about initial state
        has_initial_state = torch.ones(batch_size, dtype=torch.bool, device=device)
        query_start_loc = torch.tensor(
            [0] + [seq_len * (i + 1) for i in range(batch_size)],
            dtype=torch.int32,
            device=device,
        )
        # Reshape for varlen format
        x_varlen = x.transpose(1, 2).reshape(-1, dim).T.contiguous()

        conv_weight_input = conv_weight.squeeze(1).contiguous()
        torch.ops.trtllm.causal_conv1d_fwd(
            x_varlen,
            conv_weight_input,
            conv_bias,
            conv_state_kernel,
            query_start_loc,
            None,  # cache_indices
            has_initial_state,
            apply_silu,
            PAD_SLOT_ID,
        )

        out_ref_list = []
        conv_state_ref_list = []
        for b in range(batch_size):
            out_b, state_b = mamba_conv1d_ref(
                x[b : b + 1],
                conv_state[b : b + 1],
                conv_weight,
                conv_bias,
                apply_silu,
            )
            out_ref_list.append(out_b)
            conv_state_ref_list.append(state_b)
        out_ref = torch.cat(out_ref_list, dim=0)
        conv_state_ref = torch.cat(conv_state_ref_list, dim=0)
        x_kernel_reshaped = (
            x_varlen.T.reshape(batch_size, seq_len, dim).transpose(1, 2).contiguous()
        )

        torch.testing.assert_close(x_kernel_reshaped, out_ref, rtol=1e-2, atol=1e-1)
        torch.testing.assert_close(conv_state_kernel, conv_state_ref, rtol=1e-2, atol=1e-1)
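
test_with_initial_state feeds the kernel the varlen layout: tokens from all sequences flattened into a [dim, total_tokens] tensor, with query_start_loc giving each sequence's start offset. A standalone sketch of that round trip (shapes illustrative, CPU-only, no kernel involved):

    import torch

    B, D, L = 4, 8, 32
    x = torch.randn(B, D, L)
    # Batched [B, D, L] -> varlen [D, B*L], as in the test above
    x_varlen = x.transpose(1, 2).reshape(-1, D).T.contiguous()
    # query_start_loc = [0, L, 2L, ...] marks per-sequence token offsets
    query_start_loc = torch.tensor([0] + [L * (i + 1) for i in range(B)])
    # Varlen [D, B*L] -> batched [B, D, L]; the round trip is lossless
    x_back = x_varlen.T.reshape(B, L, D).transpose(1, 2).contiguous()
    assert torch.equal(x, x_back)
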
