Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions backends/cortex_m/ops/operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,8 @@ def quantized_linear_meta(
def quantized_linear_impl(
input: torch.Tensor,
weights: torch.Tensor,
bias: torch.Tensor,
kernel_sum: torch.Tensor,
bias: torch.Tensor | None,
kernel_sum: torch.Tensor | None,
input_offset: int,
filter_offset: int,
output_offset: int,
Expand All @@ -448,10 +448,11 @@ def quantized_linear_impl(
Functional variant - creates output tensor and calls out variant
"""

# Leaving both implementations for debugging purposes.
compute_using_kernel_sum = True

if compute_using_kernel_sum:
# Mirror CMSIS-NN's arm_fully_connected_s8 contract: the MVE path reads
# kernel_sum (ctx.buf) and ignores bias; the DSP and scalar paths read
# bias and ignore kernel_sum. The AOT pass populates exactly one of them
# based on the target ISA, so dispatch off which one is present.
if kernel_sum is not None:
weights_int32 = weights.to(torch.int32)

input_int32 = input.to(torch.int32)
Expand Down
92 changes: 65 additions & 27 deletions backends/cortex_m/passes/convert_to_cortex_m_pass.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# Copyright 2025-2026 Arm Limited and/or its affiliates.
Expand All @@ -7,10 +7,12 @@

import executorch.backends.cortex_m.ops.operators # noqa

import cmsis_nn # type: ignore[import-not-found, import-untyped]
import torch
import torch.fx
from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot
from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig

from executorch.backends.transforms.utils import (
create_constant_placeholder,
Expand All @@ -20,6 +22,7 @@

from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass
from executorch.exir.dialects._ops import ops as exir_ops
from torch.export import ExportedProgram
from torch.export.graph_signature import InputKind
from torch.fx.passes.infra.pass_manager import PassResult

Expand All @@ -33,21 +36,35 @@
by call_operator.
"""

def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset):
"""
Computes the precomputed kernel sum term (bias optional)
a * sum_j(wij + b) + ci

for i = (1, ..., n), where j indexes the input activations.
def __init__(
    self,
    exported_program: ExportedProgram,
    target_config: CortexMTargetConfig | None = None,
) -> None:
    """Create the pass for ``exported_program``.

    Args:
        exported_program: Program forwarded to the base-class constructor.
        target_config: Target description exposed through the
            ``target_config`` property. When ``None``, an M55 configuration
            is created.
    """
    super().__init__(exported_program)
    # Default mirrors CortexMPassManager: MVE-capable M55 (the previous
    # behavior for any caller that constructs the pass without a config).
    # Compare against None explicitly so that a caller-supplied config is
    # never swapped for the default just because it evaluates as falsy.
    if target_config is None:
        target_config = CortexMTargetConfig(cpu=CortexM.M55)
    self._target_config = target_config

@property
def target_config(self) -> CortexMTargetConfig:
    """Return the target configuration stored on this pass instance."""
    return self._target_config

def _compute_kernel_sum(self, weights, bias_int32, neg_input_zp, neg_weight_zp):
"""Precompute the MVE kernel_sum term: a * sum_j(wij + b) + ci

Where `a = -input_zp` and `b = -weight_zp` per CMSIS-NN convention.
Parameter names use the `neg_*_zp` form to keep that sign explicit at
every call site. Bias is optional; pass None for an unbiased Linear.
"""
weights_transposed = weights.T
weights_int32 = weights_transposed.to(torch.int32)
offset_weights = weights_int32 + weight_offset
offset_weights = weights_int32 + neg_weight_zp
kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32)
kernel_sum_offset = kernel_sum * input_offset
kernel_sum_offset = kernel_sum * neg_input_zp

if bias is not None:
kernel_sum_offset += bias
if bias_int32 is not None:
kernel_sum_offset += bias_int32

return kernel_sum_offset

Expand Down Expand Up @@ -96,37 +113,58 @@
output_min = node.meta["output_qparams"][0].qmin
output_max = node.meta["output_qparams"][0].qmax

# CMSIS-NN's FC path treats weights as per-tensor symmetric (single
# `filter_offset`, single multiplier/shift). The non-zero-weight-zp
# paths in `arm_nn_vec_mat_mult_t_s8.c` exist but are untested in this
# backend — fail loudly if the quantizer ever produces asymmetric
# weights so we don't silently land on that codepath.
if weight_zp != 0:
raise NotImplementedError(
f"cortex_m::quantized_linear assumes symmetric weight "
f"quantization (weight_zp == 0); got weight_zp={weight_zp}"
)

quantized_multiplier, quantized_shift = quantize_multiplier_aot(
(input_scale * weight_scale) / output_scale
)

# TODO: Add support for configuring the backend to support other extensions.
# Kernel sum is only used in the CMSIS-NN implementation for the MVE extension,
# so this should be optional.
# CMSIS-NN's MVE `arm_fully_connected_s8` path reads a precomputed
# kernel_sum (input_offset×sum(weight) + bias) from ctx.buf and
# ignores the bias argument. The DSP and scalar paths do the opposite
# — they read the bias argument at runtime and ignore ctx.buf
# (see arm_nn_vec_mat_mult_t_s8.c). Pick the right input format here
# based on the target ISA so the runtime gets exactly what it expects.
weights = node.args[1]
weights_tensor = get_param_tensor(self.exported_program, weights)
bias_node = node.args[2] if len(node.args) > 2 else None
bias_tensor = (
get_param_tensor(self.exported_program, node.args[2])
if len(node.args) > 2
get_param_tensor(self.exported_program, bias_node)
if bias_node is not None
else None
)
kernel_sum_tensor = self._compute_kernel_sum(
weights_tensor, bias_tensor, -input_zp, -weight_zp
)
with node.graph.inserting_after(weights):
kernel_sum = create_constant_placeholder(
self.exported_program,
node.graph,
node.name + "_kernel_sum",
InputKind.PARAMETER,
kernel_sum_tensor,

if self.target_config.backend == cmsis_nn.Backend.MVE:
kernel_sum_tensor = self._compute_kernel_sum(
weights_tensor, bias_tensor, -input_zp, -weight_zp
)
with node.graph.inserting_after(weights):
kernel_sum_arg = create_constant_placeholder(
self.exported_program,
node.graph,
node.name + "_kernel_sum",
InputKind.PARAMETER,
kernel_sum_tensor,
)
bias_arg = None
else:
kernel_sum_arg = None
bias_arg = bias_node

args = (
node.args[0],
weights,
None,
kernel_sum,
bias_arg,
kernel_sum_arg,
-input_zp,
-weight_zp,
output_zp,
Expand Down
173 changes: 173 additions & 0 deletions backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Regression test for the cortex_m::quantized_linear bias/kernel_sum bug.

CMSIS-NN's `arm_fully_connected_s8` has three runtime paths, gated by
compile-time ARM_MATH_MVEI / ARM_MATH_DSP:

* MVE: reads ctx.buf (precomputed kernel_sum that includes bias plus
input_offset x sum(weight)), ignores the bias argument.
* DSP / scalar: read the bias argument directly, ignore ctx.buf.

`ConvertToCortexMPass._get_linear_replacement` selects which input format
to emit based on `CortexMTargetConfig.backend`. Before the fix, the pass
unconditionally emitted kernel_sum + None-bias, which on a non-MVE build
silently dropped both the bias and the input-offset term. The bug only
showed up when those terms dominated the int32 accumulator, i.e. on
small-magnitude inputs.

Coverage:

* `test_dialect_small_magnitude_linear` runs each ISA through the Python
op impl and checks that bias=True and bias=False variants both round
to the same int8 outputs as the float reference.
* `test_aot_graph_shape_small_magnitude_linear` inspects the post-pass
graph and asserts the bias/kernel_sum arg positions match the ISA
convention -- this is the direct regression check.
* `test_implementation_small_magnitude_linear` runs the bias=True case
through the default (M55, MVE) build path so the impl test exercises
the kernel_sum codepath in simulation.
"""

from dataclasses import dataclass

import torch
import torch.nn as nn
from executorch.backends.arm.test.common import parametrize
from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig
from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase
from executorch.backends.test.harness.stages import StageType
from executorch.exir.dialects._ops import ops as exir_ops

# Seed torch's global RNG at import time so the random draws made while this
# module loads (model weights, calibration samples) are reproducible.
torch.manual_seed(0)


class _SmallMagnitudeLinear(nn.Module):
    """Model under test: a single 512 -> 10 fully connected layer.

    The two class-level tables are the operator counts that the tests in this
    file pass to ``CortexMTester.test_dialect``.
    """

    # Operator counts passed as ``ops_before_transforms``.
    ops_before_transforms = {
        "executorch_exir_dialects_edge__ops_aten_linear_default": 1,
        "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 4,
    }
    # Operator counts passed as ``ops_after_transforms``.
    ops_after_transforms = {
        "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1,
        "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
        "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
    }

    def __init__(self, bias: bool = True):
        """Build the layer; ``bias`` is forwarded to ``nn.Linear``."""
        super().__init__()
        self.fc = nn.Linear(in_features=512, out_features=10, bias=bias)

    def forward(self, x):
        """Apply the fully connected layer to ``x``."""
        projected = self.fc(x)
        return projected


class _SmallMagnitudeLinearNoBias(_SmallMagnitudeLinear):
    """Bias-free variant of ``_SmallMagnitudeLinear``.

    Only the expected pre-transform dequantize count differs from the parent
    (3 instead of 4); ``ops_after_transforms`` is inherited unchanged.
    """

    # Same table as the parent with the dequantize count overridden.
    ops_before_transforms = {
        **_SmallMagnitudeLinear.ops_before_transforms,
        "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3,
    }

    def __init__(self):
        """Build the parent model with its bias disabled."""
        super().__init__(bias=False)


def _make_input():
    """Return a fresh (1, 512) tensor of ``torch.rand`` samples scaled by 0.002."""
    unit_samples = torch.rand(1, 512)
    return unit_samples * 0.002


# Eight single-input tuples drawn from `_make_input` once at import time; the
# tests in this file pass the list as `calibration_samples`.
_calibration_samples = [(_make_input(),) for _ in range(8)]


@dataclass(frozen=True)
class _IsaVariant:
    """One parametrized configuration: a model test case plus its target."""

    # Model and example-input factory used to build the tester.
    case: McuTestCase
    # Target configuration passed to `CortexMTester`.
    target_config: CortexMTargetConfig
    # True when the converted linear node is expected to carry a kernel_sum
    # argument (and no bias); False when it is expected to carry no kernel_sum.
    uses_kernel_sum: bool


def _variant(model_cls, cpu: CortexM, uses_kernel_sum: bool) -> _IsaVariant:
    """Bundle a freshly built eval-mode model with a target config for ``cpu``.

    Args:
        model_cls: Zero-argument callable that returns the module under test.
        cpu: Core used to build the ``CortexMTargetConfig``.
        uses_kernel_sum: Expected argument layout, stored on the variant as-is.
    """
    test_case = McuTestCase(
        model=model_cls().eval(),
        example_inputs=lambda: (_make_input(),),
    )
    config = CortexMTargetConfig(cpu=cpu)
    return _IsaVariant(
        case=test_case,
        target_config=config,
        uses_kernel_sum=uses_kernel_sum,
    )


# The bias=True rows cover the regression directly (the bug dropped the bias
# term); the bias=False rows cover the case where only the input-offset term
# is missing on the non-MVE paths. Rows are built top to bottom, so models are
# instantiated in the order listed.
test_variants = {
    name: _variant(model_cls, cpu, uses_kernel_sum=uses_kernel_sum)
    for name, model_cls, cpu, uses_kernel_sum in (
        ("mve_bias", _SmallMagnitudeLinear, CortexM.M55, True),
        ("dsp_bias", _SmallMagnitudeLinear, CortexM.M4, False),
        ("scalar_bias", _SmallMagnitudeLinear, CortexM.M0PLUS, False),
        ("mve_nobias", _SmallMagnitudeLinearNoBias, CortexM.M55, True),
        ("dsp_nobias", _SmallMagnitudeLinearNoBias, CortexM.M4, False),
        ("scalar_nobias", _SmallMagnitudeLinearNoBias, CortexM.M0PLUS, False),
    )
}


@parametrize("variant", test_variants)
def test_dialect_small_magnitude_linear(variant: _IsaVariant):
    """Run the dialect check for one variant using the model's op-count tables."""
    case = variant.case
    model = case.model
    tester = CortexMTester(
        model,
        case.get_example_inputs(),
        target_config=variant.target_config,
    )
    expected_ops = {
        "ops_before_transforms": model.ops_before_transforms,
        "ops_after_transforms": model.ops_after_transforms,
    }
    tester.test_dialect(
        qtol=1,
        calibration_samples=_calibration_samples,
        **expected_ops,
    )


@parametrize("variant", test_variants)
def test_aot_graph_shape_small_magnitude_linear(variant: _IsaVariant):
    """Assert the post-pass node args match the ISA's CMSIS-NN convention.

    Runs the tester through quantize, export, to_edge and run_passes, locates
    the single ``cortex_m.quantized_linear`` node, and checks positional args
    2 (bias) and 3 (kernel_sum) against ``variant.uses_kernel_sum``.
    """
    tester = CortexMTester(
        variant.case.model,
        variant.case.get_example_inputs(),
        target_config=variant.target_config,
    )
    tester.quantize(None)
    tester.export()
    tester.to_edge()
    tester.run_passes()

    module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module()
    linear_target = exir_ops.edge.cortex_m.quantized_linear.default
    linear_nodes = [
        n
        for n in module.graph.nodes
        if n.op == "call_function" and n.target == linear_target
    ]
    assert (
        len(linear_nodes) == 1
    ), f"expected one quantized_linear node, got {len(linear_nodes)}"
    bias_arg, kernel_sum_arg = linear_nodes[0].args[2], linear_nodes[0].args[3]

    if variant.uses_kernel_sum:
        assert bias_arg is None, "MVE path must not pass bias (CMSIS-NN ignores it)"
        assert kernel_sum_arg is not None, "MVE path requires precomputed kernel_sum"
    else:
        assert kernel_sum_arg is None, "non-MVE path must not pass kernel_sum"
        # bias is allowed to be None only if the source nn.Linear had bias=False.
        # Attribute access on nn.Module is typed `Tensor | Module`, so narrow
        # `fc` to nn.Linear before reading `.bias`; reading it directly fails
        # mypy with `union-attr`.
        fc = variant.case.model.fc
        assert isinstance(
            fc, nn.Linear
        ), f"expected model.fc to be nn.Linear, got {type(fc).__name__}"
        expects_bias = fc.bias is not None
        if expects_bias:
            assert (
                bias_arg is not None
            ), "non-MVE path with bias must forward bias to CMSIS-NN"


def test_implementation_small_magnitude_linear():
    """Exercise the MVE kernel_sum codepath via the default M55 simulator build."""
    model = _SmallMagnitudeLinear().eval()
    case = McuTestCase(model=model, example_inputs=lambda: (_make_input(),))
    # No target_config is passed, so the tester's default target is used.
    tester = CortexMTester(case.model, case.get_example_inputs())
    tester.test_implementation(
        qtol=1,
        calibration_samples=_calibration_samples,
    )
Loading