pytorch · rascani · May 18, 2026
@@ -434,8 +434,8 @@ def quantized_linear_meta(
 def quantized_linear_impl(
     input: torch.Tensor,
     weights: torch.Tensor,
-    bias: torch.Tensor,
-    kernel_sum: torch.Tensor,
+    bias: torch.Tensor | None,
+    kernel_sum: torch.Tensor | None,
     input_offset: int,
     filter_offset: int,
     output_offset: int,
@@ -448,10 +448,11 @@ def quantized_linear_impl(
     Functional variant - creates output tensor and calls out variant
     """
 
-    # Leaving both implementations for debugging purposes.
-    compute_using_kernel_sum = True
-
-    if compute_using_kernel_sum:
+    # Mirror CMSIS-NN's arm_fully_connected_s8 contract: the MVE path reads
+    # kernel_sum (ctx.buf) and ignores bias; the DSP and scalar paths read
+    # bias and ignore kernel_sum. The AOT pass emits kernel_sum only for MVE
+    # targets (bias may still be None for an unbiased Linear), so dispatch on it.
+    if kernel_sum is not None:
         weights_int32 = weights.to(torch.int32)
 
         input_int32 = input.to(torch.int32)

@@ -1,3 +1,3 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 # Copyright 2025-2026 Arm Limited and/or its affiliates.
@@ -7,10 +7,12 @@
 
 import executorch.backends.cortex_m.ops.operators  # noqa
 
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import torch
 import torch.fx
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
+from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
 
 from executorch.backends.transforms.utils import (
     create_constant_placeholder,
@@ -20,6 +22,7 @@
 
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import ExportedProgram
 from torch.export.graph_signature import InputKind
 from torch.fx.passes.infra.pass_manager import PassResult
 
@@ -33,21 +36,35 @@
     by call_operator.
     """
 
-    def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
-        """
-        Computes the precomputed kernel sum term (bias optional)
-            a * sum_j(wij + b) + ci
-
-        for i = (1, ..., n), where j indexes the input activations.
+    def __init__(
+        self,
+        exported_program: ExportedProgram,
+        target_config: CortexMTargetConfig | None = None,
+    ) -> None:
+        super().__init__(exported_program)
+        # Default mirrors CortexMPassManager: MVE-capable M55 (the previous
+        # behavior for any caller that constructs the pass without a config).
+        self._target_config = target_config or CortexMTargetConfig(cpu=CortexM.M55)
+
+    @property
+    def target_config(self) -> CortexMTargetConfig:
+        return self._target_config
+
+    def _compute_kernel_sum(self, weights, bias_int32, neg_input_zp, neg_weight_zp):
+        """Precompute the MVE kernel_sum term: a * sum_j(wij + b) + ci
+
+        Computed per output i (j indexes the inputs), with `a = -input_zp` and
+        `b = -weight_zp` per CMSIS-NN convention; the `neg_*_zp` parameter names
+        keep that sign explicit. `ci` is the bias; pass None for an unbiased Linear.
         """
         weights_transposed = weights.T
         weights_int32 = weights_transposed.to(torch.int32)
-        offset_weights = weights_int32 + weight_offset
+        offset_weights = weights_int32 + neg_weight_zp
         kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32)
-        kernel_sum_offset = kernel_sum * input_offset
+        kernel_sum_offset = kernel_sum * neg_input_zp
 
-        if bias is not None:
-            kernel_sum_offset += bias
+        if bias_int32 is not None:
+            kernel_sum_offset += bias_int32
 
         return kernel_sum_offset
 
@@ -96,37 +113,58 @@
         output_min = node.meta["output_qparams"][0].qmin
         output_max = node.meta["output_qparams"][0].qmax
 
+        # CMSIS-NN's FC path treats weights as per-tensor symmetric (single
+        # `filter_offset`, single multiplier/shift). The non-zero-weight-zp
+        # paths in `arm_nn_vec_mat_mult_t_s8.c` exist but are untested in this
+        # backend — fail loudly if the quantizer ever produces asymmetric
+        # weights so we don't silently land on that codepath.
+        if weight_zp != 0:
+            raise NotImplementedError(
+                f"cortex_m::quantized_linear assumes symmetric weight "
+                f"quantization (weight_zp == 0); got weight_zp={weight_zp}"
+            )
+
         quantized_multiplier, quantized_shift = quantize_multiplier_aot(
             (input_scale * weight_scale) / output_scale
         )
 
-        # TODO: Add support for configuring the backend to support other extensions.
-        # Kernel sum is only used in the CMSIS-NN implementation for the MVE extension,
-        # so this should be optional.
+        # CMSIS-NN's MVE `arm_fully_connected_s8` path reads a precomputed
+        # kernel_sum (input_offset×sum(weight) + bias) from ctx.buf and
+        # ignores the bias argument. The DSP and scalar paths do the opposite
+        # — they read the bias argument at runtime and ignore ctx.buf
+        # (see arm_nn_vec_mat_mult_t_s8.c). Pick the right input format here
+        # based on the target ISA so the runtime gets exactly what it expects.
         weights = node.args[1]
         weights_tensor = get_param_tensor(self.exported_program, weights)
+        bias_node = node.args[2] if len(node.args) > 2 else None
         bias_tensor = (
-            get_param_tensor(self.exported_program, node.args[2])
-            if len(node.args) > 2
+            get_param_tensor(self.exported_program, bias_node)
+            if bias_node is not None
             else None
         )
-        kernel_sum_tensor = self._compute_kernel_sum(
-            weights_tensor, bias_tensor, -input_zp, -weight_zp
-        )
-        with node.graph.inserting_after(weights):
-            kernel_sum = create_constant_placeholder(
-                self.exported_program,
-                node.graph,
-                node.name + "_kernel_sum",
-                InputKind.PARAMETER,
-                kernel_sum_tensor,
+
+        if self.target_config.backend == cmsis_nn.Backend.MVE:
+            kernel_sum_tensor = self._compute_kernel_sum(
+                weights_tensor, bias_tensor, -input_zp, -weight_zp
             )
+            with node.graph.inserting_after(weights):
+                kernel_sum_arg = create_constant_placeholder(
+                    self.exported_program,
+                    node.graph,
+                    node.name + "_kernel_sum",
+                    InputKind.PARAMETER,
+                    kernel_sum_tensor,
+                )
+            bias_arg = None
+        else:
+            kernel_sum_arg = None
+            bias_arg = bias_node
 
         args = (
             node.args[0],
             weights,
-            None,
-            kernel_sum,
+            bias_arg,
+            kernel_sum_arg,
             -input_zp,
             -weight_zp,
             output_zp,

@@ -0,0 +1,173 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Regression test for the cortex_m::quantized_linear bias/kernel_sum bug.
+
+CMSIS-NN's `arm_fully_connected_s8` has three runtime paths, gated by
+compile-time ARM_MATH_MVEI / ARM_MATH_DSP:
+
+* MVE: reads ctx.buf (precomputed kernel_sum that includes bias plus
+  input_offset x sum(weight)), ignores the bias argument.
+* DSP / scalar: read the bias argument directly, ignore ctx.buf.
+
+`ConvertToCortexMPass._get_linear_replacement` selects which input format
+to emit based on `CortexMTargetConfig.backend`. Before the fix, the pass
+unconditionally emitted kernel_sum + None-bias, which on a non-MVE build
+silently dropped both the bias and the input-offset term. The bug only
+showed up when those terms dominated the int32 accumulator, i.e. on
+small-magnitude inputs.
+
+Coverage:
+
+* `test_dialect_small_magnitude_linear` runs each ISA through the Python
+  op impl and checks that bias=True and bias=False variants both match
+  the float reference to within the quantized tolerance (`qtol=1`).
+* `test_aot_graph_shape_small_magnitude_linear` inspects the post-pass
+  graph and asserts the bias/kernel_sum arg positions match the ISA
+  convention -- this is the direct regression check.
+* `test_implementation_small_magnitude_linear` runs the bias=True case
+  through the default (M55, MVE) build path so the impl test exercises
+  the kernel_sum codepath in simulation.
+"""
+
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
+from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir.dialects._ops import ops as exir_ops
+
+torch.manual_seed(0)
+
+
+class _SmallMagnitudeLinear(nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 4,
+    }
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self, bias: bool = True):
+        super().__init__()
+        self.fc = nn.Linear(512, 10, bias=bias)
+
+    def forward(self, x):
+        return self.fc(x)
+
+
+class _SmallMagnitudeLinearNoBias(_SmallMagnitudeLinear):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    def __init__(self):
+        super().__init__(bias=False)
+
+
+def _make_input():
+    return torch.rand(1, 512) * 0.002
+
+
+_calibration_samples = [(_make_input(),) for _ in range(8)]
+
+
+@dataclass(frozen=True)
+class _IsaVariant:
+    case: McuTestCase
+    target_config: CortexMTargetConfig
+    uses_kernel_sum: bool
+
+
+def _variant(model_cls, cpu: CortexM, uses_kernel_sum: bool) -> _IsaVariant:
+    return _IsaVariant(
+        case=McuTestCase(
+            model=model_cls().eval(),
+            example_inputs=lambda: (_make_input(),),
+        ),
+        target_config=CortexMTargetConfig(cpu=cpu),
+        uses_kernel_sum=uses_kernel_sum,
+    )
+
+
+# bias=True covers the regression directly (the bug dropped the bias
+# term); bias=False covers the complementary case, where the input-offset
+# term is the only one the non-MVE paths were missing.
+test_variants = {
+    "mve_bias": _variant(_SmallMagnitudeLinear, CortexM.M55, uses_kernel_sum=True),
+    "dsp_bias": _variant(_SmallMagnitudeLinear, CortexM.M4, uses_kernel_sum=False),
+    "scalar_bias": _variant(_SmallMagnitudeLinear, CortexM.M0PLUS, uses_kernel_sum=False),
+    "mve_nobias": _variant(_SmallMagnitudeLinearNoBias, CortexM.M55, uses_kernel_sum=True),
+    "dsp_nobias": _variant(_SmallMagnitudeLinearNoBias, CortexM.M4, uses_kernel_sum=False),
+    "scalar_nobias": _variant(
+        _SmallMagnitudeLinearNoBias, CortexM.M0PLUS, uses_kernel_sum=False
+    ),
+}
+
+
+@parametrize("variant", test_variants)
+def test_dialect_small_magnitude_linear(variant: _IsaVariant):
+    tester = CortexMTester(
+        variant.case.model,
+        variant.case.get_example_inputs(),
+        target_config=variant.target_config,
+    )
+    tester.test_dialect(
+        ops_before_transforms=variant.case.model.ops_before_transforms,
+        ops_after_transforms=variant.case.model.ops_after_transforms,
+        qtol=1,
+        calibration_samples=_calibration_samples,
+    )
+
+
+@parametrize("variant", test_variants)
+def test_aot_graph_shape_small_magnitude_linear(variant: _IsaVariant):
+    """Assert the post-pass node args match the ISA's CMSIS-NN convention."""
+    tester = CortexMTester(
+        variant.case.model,
+        variant.case.get_example_inputs(),
+        target_config=variant.target_config,
+    )
+    tester.quantize(None)
+    tester.export()
+    tester.to_edge()
+    tester.run_passes()
+
+    module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
+    linear_target = exir_ops.edge.cortex_m.quantized_linear.default
+    linear_nodes = [
+        n for n in module.graph.nodes if n.op == "call_function" and n.target == linear_target
+    ]
+    assert len(linear_nodes) == 1, f"expected one quantized_linear node, got {len(linear_nodes)}"
+    bias_arg, kernel_sum_arg = linear_nodes[0].args[2], linear_nodes[0].args[3]
+
+    if variant.uses_kernel_sum:
+        assert bias_arg is None, "MVE path must not pass bias (CMSIS-NN ignores it)"
+        assert kernel_sum_arg is not None, "MVE path requires precomputed kernel_sum"
+    else:
+        assert kernel_sum_arg is None, "non-MVE path must not pass kernel_sum"
+        # bias is allowed to be None only if the source nn.Linear had bias=False.
+        expects_bias = variant.case.model.fc.bias is not None
+        if expects_bias:
+            assert bias_arg is not None, "non-MVE path with bias must forward bias to CMSIS-NN"
+
+
+def test_implementation_small_magnitude_linear():
+    """Exercise the MVE kernel_sum codepath via the default M55 simulator build."""
+    case = McuTestCase(
+        model=_SmallMagnitudeLinear().eval(),
+        example_inputs=lambda: (_make_input(),),
+    )
+    tester = CortexMTester(case.model, case.get_example_inputs())
+    tester.test_implementation(qtol=1, calibration_samples=_calibration_samples)