From ccb8b3bd6fac47e7dfc3f7de10e2916001be260a Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Mon, 18 May 2026 12:25:25 -0700
Subject: [PATCH] Cortex-M backend: dispatch quantized_linear AOT layout on
 target ISA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CMSIS-NN's `arm_fully_connected_s8` has three runtime paths, gated by
compile-time `ARM_MATH_MVEI` / `ARM_MATH_DSP`. The paths take the bias
and the `input_offset × sum(weight)` offset term from two different
inputs, under incompatible conventions:

* MVE (Helium): reads `ctx.buf` as a precomputed kernel_sum that must
  already include `input_offset × sum(weight)` and the bias
  contribution. The `bias` argument is `(void)bias;` — ignored.
* DSP / scalar (Armv7E-M, Armv7-M, Armv6-M, Armv8-M Baseline): read
  the `bias` argument directly and fold the input_offset contribution
  at runtime. `ctx.buf` (kernel_sum) is `(void)kernel_sum;` — ignored.

`ConvertToCortexMPass._get_linear_replacement` previously emitted only
the MVE shape (kernel_sum populated, bias=None). On any non-MVE build
the DSP/scalar path started the int32 accumulator at 0 instead of at
`bias + input_offset × sum(weight)`, dropping both the bias and the
offset contribution. The accumulator wound up much smaller than
intended, requantization collapsed it to the output zero point, and
every classifier with a deep, narrow tail produced essentially
uniform near-zero outputs on Cortex-M0/0+/M3/M4/M7/M23/M33 builds —
exactly where `CortexMTargetConfig.backend != cmsis_nn.Backend.MVE`.

Use the target-ISA plumbing added by the CortexMTargetConfig PR
(#19470) to dispatch the right input shape at AOT time: on MVE
targets emit kernel_sum with bias folded in (bias=None); on DSP and
scalar targets emit the raw int32 bias directly (kernel_sum=None).
The CMSIS-NN runtime then receives exactly the input it expects — no
redundant copy of the bias in the .pte, no silent miscompute on
target mismatch (the runtime errors loudly if ctx.buf is None on an
MVE build, instead of producing garbage).

Update `quantized_linear_impl` in `operators.py` to mirror the same
contract: dispatch off whichever of kernel_sum / bias is non-None.
`target_config` reaches the pass automatically via
`CortexMPassManager`'s signature injection into the pass's `__init__`.

Add `backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py`
as a regression test. A tiny `nn.Linear(512, 10)` on uniform[0, 0.002]
input is the minimal reproducer for the small-magnitude regime where
the missing offset terms dominate. The dialect test parametrizes over
MVE/DSP/scalar target configs; the implementation test runs the
bias=True case through the default (M55, MVE) build path.

Identified by bisecting STResNet Pico's int8-output collapse on
Corstone-300. The same collapse explains the historical MV2 / MV3
"deep classifier PTQ flakiness" xfails — both classifiers have
small-magnitude inputs to their final quantized_linear and likely
hit the same bug on any non-MVE deployment target.

Authored with Claude.
---
 backends/cortex_m/ops/operators.py            |  13 +-
 .../passes/convert_to_cortex_m_pass.py        |  92 +++++++---
 .../test_quantized_linear_small_magnitude.py  | 173 ++++++++++++++++++
 3 files changed, 245 insertions(+), 33 deletions(-)
 create mode 100644 backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py

diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index 2c35ed8730b..f617a960189 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -434,8 +434,8 @@ def quantized_linear_meta(
 def quantized_linear_impl(
     input: torch.Tensor,
     weights: torch.Tensor,
-    bias: torch.Tensor,
-    kernel_sum: torch.Tensor,
+    bias: torch.Tensor | None,
+    kernel_sum: torch.Tensor | None,
     input_offset: int,
     filter_offset: int,
     output_offset: int,
@@ -448,10 +448,11 @@ def quantized_linear_impl(
     Functional variant - creates output tensor and calls out variant
     """
 
-    # Leaving both implementations for debugging purposes.
-    compute_using_kernel_sum = True
-
-    if compute_using_kernel_sum:
+    # Mirror CMSIS-NN's arm_fully_connected_s8 contract: the MVE path reads
+    # kernel_sum (ctx.buf) and ignores bias; the DSP and scalar paths read
+    # bias and ignore kernel_sum. The AOT pass populates exactly one of them
+    # based on the target ISA, so dispatch off which one is present.
+    if kernel_sum is not None:
         weights_int32 = weights.to(torch.int32)
 
         input_int32 = input.to(torch.int32)
diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
index 418f6cd63ff..e8b9583889f 100644
--- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py
+++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py
@@ -7,10 +7,12 @@
 
 import executorch.backends.cortex_m.ops.operators  # noqa
 
+import cmsis_nn  # type: ignore[import-not-found, import-untyped]
 import torch
 import torch.fx
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
 from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
+from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
 
 from executorch.backends.transforms.utils import (
     create_constant_placeholder,
@@ -20,6 +22,7 @@
 
 from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
 from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export import ExportedProgram
 from torch.export.graph_signature import InputKind
 from torch.fx.passes.infra.pass_manager import PassResult
 
@@ -33,21 +36,35 @@ class ConvertToCortexMPass(XNNPACKPass):
     by call_operator.
     """
 
-    def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
-        """
-        Computes the precomputed kernel sum term (bias optional)
-            a * sum_j(wij + b) + ci
-
-        for i = (1, ..., n), where j indexes the input activations.
+    def __init__(
+        self,
+        exported_program: ExportedProgram,
+        target_config: CortexMTargetConfig | None = None,
+    ) -> None:
+        super().__init__(exported_program)
+        # Default mirrors CortexMPassManager: MVE-capable M55 (the previous
+        # behavior for any caller that constructs the pass without a config).
+        self._target_config = target_config or CortexMTargetConfig(cpu=CortexM.M55)
+
+    @property
+    def target_config(self) -> CortexMTargetConfig:
+        return self._target_config
+
+    def _compute_kernel_sum(self, weights, bias_int32, neg_input_zp, neg_weight_zp):
+        """Precompute the MVE kernel_sum term:  a * sum_j(wij + b) + ci
+
+        Where `a = -input_zp` and `b = -weight_zp` per CMSIS-NN convention.
+        Parameter names use the `neg_*_zp` form to keep that sign explicit at
+        every call site. Bias is optional; pass None for an unbiased Linear.
         """
         weights_transposed = weights.T
         weights_int32 = weights_transposed.to(torch.int32)
-        offset_weights = weights_int32 + weight_offset
+        offset_weights = weights_int32 + neg_weight_zp
         kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32)
-        kernel_sum_offset = kernel_sum * input_offset
+        kernel_sum_offset = kernel_sum * neg_input_zp
 
-        if bias is not None:
-            kernel_sum_offset += bias
+        if bias_int32 is not None:
+            kernel_sum_offset += bias_int32
 
         return kernel_sum_offset
 
@@ -96,37 +113,58 @@ def _get_linear_replacement(self, node):
         output_min = node.meta["output_qparams"][0].qmin
         output_max = node.meta["output_qparams"][0].qmax
 
+        # CMSIS-NN's FC path treats weights as per-tensor symmetric (single
+        # `filter_offset`, single multiplier/shift). The non-zero-weight-zp
+        # paths in `arm_nn_vec_mat_mult_t_s8.c` exist but are untested in this
+        # backend — fail loudly if the quantizer ever produces asymmetric
+        # weights so we don't silently land on that codepath.
+        if weight_zp != 0:
+            raise NotImplementedError(
+                f"cortex_m::quantized_linear assumes symmetric weight "
+                f"quantization (weight_zp == 0); got weight_zp={weight_zp}"
+            )
+
         quantized_multiplier, quantized_shift = quantize_multiplier_aot(
             (input_scale * weight_scale) / output_scale
         )
 
-        # TODO: Add support for configuring the backend to support other extensions.
-        # Kernel sum is only used in the CMSIS-NN implementation for the MVE extension,
-        # so this should be optional.
+        # CMSIS-NN's MVE `arm_fully_connected_s8` path reads a precomputed
+        # kernel_sum (input_offset×sum(weight) + bias) from ctx.buf and
+        # ignores the bias argument. The DSP and scalar paths do the opposite
+        # — they read the bias argument at runtime and ignore ctx.buf
+        # (see arm_nn_vec_mat_mult_t_s8.c). Pick the right input format here
+        # based on the target ISA so the runtime gets exactly what it expects.
         weights = node.args[1]
         weights_tensor = get_param_tensor(self.exported_program, weights)
+        bias_node = node.args[2] if len(node.args) > 2 else None
         bias_tensor = (
-            get_param_tensor(self.exported_program, node.args[2])
-            if len(node.args) > 2
+            get_param_tensor(self.exported_program, bias_node)
+            if bias_node is not None
             else None
         )
-        kernel_sum_tensor = self._compute_kernel_sum(
-            weights_tensor, bias_tensor, -input_zp, -weight_zp
-        )
-        with node.graph.inserting_after(weights):
-            kernel_sum = create_constant_placeholder(
-                self.exported_program,
-                node.graph,
-                node.name + "_kernel_sum",
-                InputKind.PARAMETER,
-                kernel_sum_tensor,
+
+        if self.target_config.backend == cmsis_nn.Backend.MVE:
+            kernel_sum_tensor = self._compute_kernel_sum(
+                weights_tensor, bias_tensor, -input_zp, -weight_zp
             )
+            with node.graph.inserting_after(weights):
+                kernel_sum_arg = create_constant_placeholder(
+                    self.exported_program,
+                    node.graph,
+                    node.name + "_kernel_sum",
+                    InputKind.PARAMETER,
+                    kernel_sum_tensor,
+                )
+            bias_arg = None
+        else:
+            kernel_sum_arg = None
+            bias_arg = bias_node
 
         args = (
             node.args[0],
             weights,
-            None,
-            kernel_sum,
+            bias_arg,
+            kernel_sum_arg,
             -input_zp,
             -weight_zp,
             output_zp,
diff --git a/backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py b/backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py
new file mode 100644
index 00000000000..b07f9963e66
--- /dev/null
+++ b/backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py
@@ -0,0 +1,173 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+"""Regression test for the cortex_m::quantized_linear bias/kernel_sum bug.
+
+CMSIS-NN's `arm_fully_connected_s8` has three runtime paths, gated by
+compile-time ARM_MATH_MVEI / ARM_MATH_DSP:
+
+* MVE: reads ctx.buf (precomputed kernel_sum that includes bias plus
+  input_offset x sum(weight)), ignores the bias argument.
+* DSP / scalar: read the bias argument directly, ignore ctx.buf.
+
+`ConvertToCortexMPass._get_linear_replacement` selects which input format
+to emit based on `CortexMTargetConfig.backend`. Before the fix, the pass
+unconditionally emitted kernel_sum + None-bias, which on a non-MVE build
+silently dropped both the bias and the input-offset term. The bug only
+showed up when those terms dominated the int32 accumulator, i.e. on
+small-magnitude inputs.
+
+Coverage:
+
+* `test_dialect_small_magnitude_linear` runs each ISA through the Python
+  op impl and checks that bias=True and bias=False variants both round
+  to the same int8 outputs as the float reference.
+* `test_aot_graph_shape_small_magnitude_linear` inspects the post-pass
+  graph and asserts the bias/kernel_sum arg positions match the ISA
+  convention -- this is the direct regression check.
+* `test_implementation_small_magnitude_linear` runs the bias=True case
+  through the default (M55, MVE) build path so the impl test exercises
+  the kernel_sum codepath in simulation.
+"""
+
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
+from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir.dialects._ops import ops as exir_ops
+
+torch.manual_seed(0)
+
+
+class _SmallMagnitudeLinear(nn.Module):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 4,
+    }
+    ops_after_transforms = {
+        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+    }
+
+    def __init__(self, bias: bool = True):
+        super().__init__()
+        self.fc = nn.Linear(512, 10, bias=bias)
+
+    def forward(self, x):
+        return self.fc(x)
+
+
+class _SmallMagnitudeLinearNoBias(_SmallMagnitudeLinear):
+    ops_before_transforms = {
+        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
+    }
+
+    def __init__(self):
+        super().__init__(bias=False)
+
+
+def _make_input():
+    return torch.rand(1, 512) * 0.002
+
+
+_calibration_samples = [(_make_input(),) for _ in range(8)]
+
+
+@dataclass(frozen=True)
+class _IsaVariant:
+    case: McuTestCase
+    target_config: CortexMTargetConfig
+    uses_kernel_sum: bool
+
+
+def _variant(model_cls, cpu: CortexM, uses_kernel_sum: bool) -> _IsaVariant:
+    return _IsaVariant(
+        case=McuTestCase(
+            model=model_cls().eval(),
+            example_inputs=lambda: (_make_input(),),
+        ),
+        target_config=CortexMTargetConfig(cpu=cpu),
+        uses_kernel_sum=uses_kernel_sum,
+    )
+
+
+# bias=True covers the regression directly (the bug dropped the bias
+# term); bias=False covers the symmetric case where only the
+# input-offset term is missing on the non-MVE paths.
+test_variants = {
+    "mve_bias": _variant(_SmallMagnitudeLinear, CortexM.M55, uses_kernel_sum=True),
+    "dsp_bias": _variant(_SmallMagnitudeLinear, CortexM.M4, uses_kernel_sum=False),
+    "scalar_bias": _variant(_SmallMagnitudeLinear, CortexM.M0PLUS, uses_kernel_sum=False),
+    "mve_nobias": _variant(_SmallMagnitudeLinearNoBias, CortexM.M55, uses_kernel_sum=True),
+    "dsp_nobias": _variant(_SmallMagnitudeLinearNoBias, CortexM.M4, uses_kernel_sum=False),
+    "scalar_nobias": _variant(
+        _SmallMagnitudeLinearNoBias, CortexM.M0PLUS, uses_kernel_sum=False
+    ),
+}
+
+
+@parametrize("variant", test_variants)
+def test_dialect_small_magnitude_linear(variant: _IsaVariant):
+    tester = CortexMTester(
+        variant.case.model,
+        variant.case.get_example_inputs(),
+        target_config=variant.target_config,
+    )
+    tester.test_dialect(
+        ops_before_transforms=variant.case.model.ops_before_transforms,
+        ops_after_transforms=variant.case.model.ops_after_transforms,
+        qtol=1,
+        calibration_samples=_calibration_samples,
+    )
+
+
+@parametrize("variant", test_variants)
+def test_aot_graph_shape_small_magnitude_linear(variant: _IsaVariant):
+    """Assert the post-pass node args match the ISA's CMSIS-NN convention."""
+    tester = CortexMTester(
+        variant.case.model,
+        variant.case.get_example_inputs(),
+        target_config=variant.target_config,
+    )
+    tester.quantize(None)
+    tester.export()
+    tester.to_edge()
+    tester.run_passes()
+
+    module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
+    linear_target = exir_ops.edge.cortex_m.quantized_linear.default
+    linear_nodes = [
+        n for n in module.graph.nodes if n.op == "call_function" and n.target == linear_target
+    ]
+    assert len(linear_nodes) == 1, f"expected one quantized_linear node, got {len(linear_nodes)}"
+    bias_arg, kernel_sum_arg = linear_nodes[0].args[2], linear_nodes[0].args[3]
+
+    if variant.uses_kernel_sum:
+        assert bias_arg is None, "MVE path must not pass bias (CMSIS-NN ignores it)"
+        assert kernel_sum_arg is not None, "MVE path requires precomputed kernel_sum"
+    else:
+        assert kernel_sum_arg is None, "non-MVE path must not pass kernel_sum"
+        # bias is allowed to be None only if the source nn.Linear had bias=False.
+        expects_bias = variant.case.model.fc.bias is not None
+        if expects_bias:
+            assert bias_arg is not None, "non-MVE path with bias must forward bias to CMSIS-NN"
+
+
+def test_implementation_small_magnitude_linear():
+    """Exercise the MVE kernel_sum codepath via the default M55 simulator build."""
+    case = McuTestCase(
+        model=_SmallMagnitudeLinear().eval(),
+        example_inputs=lambda: (_make_input(),),
+    )
+    tester = CortexMTester(case.model, case.get_example_inputs())
+    tester.test_implementation(qtol=1, calibration_samples=_calibration_samples)