diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index 2c35ed8730b..f617a960189 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -434,8 +434,8 @@ def quantized_linear_meta( def quantized_linear_impl( input: torch.Tensor, weights: torch.Tensor, - bias: torch.Tensor, - kernel_sum: torch.Tensor, + bias: torch.Tensor | None, + kernel_sum: torch.Tensor | None, input_offset: int, filter_offset: int, output_offset: int, @@ -448,10 +448,11 @@ def quantized_linear_impl( Functional variant - creates output tensor and calls out variant """ - # Leaving both implementations for debugging purposes. - compute_using_kernel_sum = True - - if compute_using_kernel_sum: + # Mirror CMSIS-NN's arm_fully_connected_s8 contract: the MVE path reads + # kernel_sum (ctx.buf) and ignores bias; the DSP and scalar paths read + # bias and ignore kernel_sum. The AOT pass populates at most one of them + # based on the target ISA, so dispatch off whether kernel_sum is present. 
+ if kernel_sum is not None: weights_int32 = weights.to(torch.int32) input_int32 = input.to(torch.int32) diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index 418f6cd63ff..e8b9583889f 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -7,10 +7,12 @@ import executorch.backends.cortex_m.ops.operators # noqa +import cmsis_nn # type: ignore[import-not-found, import-untyped] import torch import torch.fx from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot +from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig from executorch.backends.transforms.utils import ( create_constant_placeholder, @@ -20,6 +22,7 @@ from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.exir.dialects._ops import ops as exir_ops +from torch.export import ExportedProgram from torch.export.graph_signature import InputKind from torch.fx.passes.infra.pass_manager import PassResult @@ -33,21 +36,35 @@ class ConvertToCortexMPass(XNNPACKPass): by call_operator. """ - def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset): - """ - Computes the precomputed kernel sum term (bias optional) - a * sum_j(wij + b) + ci - - for i = (1, ..., n), where j indexes the input activations. + def __init__( + self, + exported_program: ExportedProgram, + target_config: CortexMTargetConfig | None = None, + ) -> None: + super().__init__(exported_program) + # Default mirrors CortexMPassManager: MVE-capable M55 (the previous + # behavior for any caller that constructs the pass without a config). 
+ self._target_config = target_config or CortexMTargetConfig(cpu=CortexM.M55) + + @property + def target_config(self) -> CortexMTargetConfig: + return self._target_config + + def _compute_kernel_sum(self, weights, bias_int32, neg_input_zp, neg_weight_zp): + """Precompute the MVE kernel_sum term: a * sum_j(wij + b) + ci + + Where `a = -input_zp` and `b = -weight_zp` per CMSIS-NN convention. + Parameter names use the `neg_*_zp` form to keep that sign explicit at + every call site. Bias is optional; pass None for an unbiased Linear. """ weights_transposed = weights.T weights_int32 = weights_transposed.to(torch.int32) - offset_weights = weights_int32 + weight_offset + offset_weights = weights_int32 + neg_weight_zp kernel_sum = torch.sum(offset_weights, dim=0, keepdim=True, dtype=torch.int32) - kernel_sum_offset = kernel_sum * input_offset + kernel_sum_offset = kernel_sum * neg_input_zp - if bias is not None: - kernel_sum_offset += bias + if bias_int32 is not None: + kernel_sum_offset += bias_int32 return kernel_sum_offset @@ -96,37 +113,58 @@ def _get_linear_replacement(self, node): output_min = node.meta["output_qparams"][0].qmin output_max = node.meta["output_qparams"][0].qmax + # CMSIS-NN's FC path treats weights as per-tensor symmetric (single + # `filter_offset`, single multiplier/shift). The non-zero-weight-zp + # paths in `arm_nn_vec_mat_mult_t_s8.c` exist but are untested in this + # backend — fail loudly if the quantizer ever produces asymmetric + # weights so we don't silently land on that codepath. + if weight_zp != 0: + raise NotImplementedError( + f"cortex_m::quantized_linear assumes symmetric weight " + f"quantization (weight_zp == 0); got weight_zp={weight_zp}" + ) + quantized_multiplier, quantized_shift = quantize_multiplier_aot( (input_scale * weight_scale) / output_scale ) - # TODO: Add support for configuring the backend to support other extensions. 
- # Kernel sum is only used in the CMSIS-NN implementation for the MVE extension, - # so this should be optional. + # CMSIS-NN's MVE `arm_fully_connected_s8` path reads a precomputed + # kernel_sum (input_offset×sum(weight) + bias) from ctx.buf and + # ignores the bias argument. The DSP and scalar paths do the opposite + # — they read the bias argument at runtime and ignore ctx.buf + # (see arm_nn_vec_mat_mult_t_s8.c). Pick the right input format here + # based on the target ISA so the runtime gets exactly what it expects. weights = node.args[1] weights_tensor = get_param_tensor(self.exported_program, weights) + bias_node = node.args[2] if len(node.args) > 2 else None bias_tensor = ( - get_param_tensor(self.exported_program, node.args[2]) - if len(node.args) > 2 + get_param_tensor(self.exported_program, bias_node) + if bias_node is not None else None ) - kernel_sum_tensor = self._compute_kernel_sum( - weights_tensor, bias_tensor, -input_zp, -weight_zp - ) - with node.graph.inserting_after(weights): - kernel_sum = create_constant_placeholder( - self.exported_program, - node.graph, - node.name + "_kernel_sum", - InputKind.PARAMETER, - kernel_sum_tensor, + + if self.target_config.backend == cmsis_nn.Backend.MVE: + kernel_sum_tensor = self._compute_kernel_sum( + weights_tensor, bias_tensor, -input_zp, -weight_zp ) + with node.graph.inserting_after(weights): + kernel_sum_arg = create_constant_placeholder( + self.exported_program, + node.graph, + node.name + "_kernel_sum", + InputKind.PARAMETER, + kernel_sum_tensor, + ) + bias_arg = None + else: + kernel_sum_arg = None + bias_arg = bias_node args = ( node.args[0], weights, - None, - kernel_sum, + bias_arg, + kernel_sum_arg, -input_zp, -weight_zp, output_zp, diff --git a/backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py b/backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py new file mode 100644 index 00000000000..b07f9963e66 --- /dev/null +++ 
b/backends/cortex_m/test/misc/test_quantized_linear_small_magnitude.py @@ -0,0 +1,173 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +"""Regression test for the cortex_m::quantized_linear bias/kernel_sum bug. + +CMSIS-NN's `arm_fully_connected_s8` has three runtime paths, gated by +compile-time ARM_MATH_MVEI / ARM_MATH_DSP: + +* MVE: reads ctx.buf (precomputed kernel_sum that includes bias plus + input_offset x sum(weight)), ignores the bias argument. +* DSP / scalar: read the bias argument directly, ignore ctx.buf. + +`ConvertToCortexMPass._get_linear_replacement` selects which input format +to emit based on `CortexMTargetConfig.backend`. Before the fix, the pass +unconditionally emitted kernel_sum + None-bias, which on a non-MVE build +silently dropped both the bias and the input-offset term. The bug only +showed up when those terms dominated the int32 accumulator, i.e. on +small-magnitude inputs. + +Coverage: + +* `test_dialect_small_magnitude_linear` runs each ISA through the Python + op impl and checks that bias=True and bias=False variants both round + to the same int8 outputs as the float reference. +* `test_aot_graph_shape_small_magnitude_linear` inspects the post-pass + graph and asserts the bias/kernel_sum arg positions match the ISA + convention -- this is the direct regression check. +* `test_implementation_small_magnitude_linear` runs the bias=True case + through the default (M55, MVE) build path so the impl test exercises + the kernel_sum codepath in simulation. 
+""" + +from dataclasses import dataclass + +import torch +import torch.nn as nn +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.target_config import CortexM, CortexMTargetConfig +from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase +from executorch.backends.test.harness.stages import StageType +from executorch.exir.dialects._ops import ops as exir_ops + +torch.manual_seed(0) + + +class _SmallMagnitudeLinear(nn.Module): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_linear_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 4, + } + ops_after_transforms = { + "executorch_exir_dialects_edge__ops_cortex_m_quantized_linear_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1, + } + + def __init__(self, bias: bool = True): + super().__init__() + self.fc = nn.Linear(512, 10, bias=bias) + + def forward(self, x): + return self.fc(x) + + +class _SmallMagnitudeLinearNoBias(_SmallMagnitudeLinear): + ops_before_transforms = { + "executorch_exir_dialects_edge__ops_aten_linear_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 3, + } + + def __init__(self): + super().__init__(bias=False) + + +def _make_input(): + return torch.rand(1, 512) * 0.002 + + +_calibration_samples = [(_make_input(),) for _ in range(8)] + + +@dataclass(frozen=True) +class _IsaVariant: + case: McuTestCase + target_config: CortexMTargetConfig + uses_kernel_sum: bool + + +def _variant(model_cls, cpu: CortexM, uses_kernel_sum: bool) -> _IsaVariant: + return _IsaVariant( + case=McuTestCase( + 
model=model_cls().eval(), + example_inputs=lambda: (_make_input(),), + ), + target_config=CortexMTargetConfig(cpu=cpu), + uses_kernel_sum=uses_kernel_sum, + ) + + +# bias=True covers the regression directly (the bug dropped the bias +# term); bias=False covers the symmetric case where only the +# input-offset term is missing on the non-MVE paths. +test_variants = { + "mve_bias": _variant(_SmallMagnitudeLinear, CortexM.M55, uses_kernel_sum=True), + "dsp_bias": _variant(_SmallMagnitudeLinear, CortexM.M4, uses_kernel_sum=False), + "scalar_bias": _variant(_SmallMagnitudeLinear, CortexM.M0PLUS, uses_kernel_sum=False), + "mve_nobias": _variant(_SmallMagnitudeLinearNoBias, CortexM.M55, uses_kernel_sum=True), + "dsp_nobias": _variant(_SmallMagnitudeLinearNoBias, CortexM.M4, uses_kernel_sum=False), + "scalar_nobias": _variant( + _SmallMagnitudeLinearNoBias, CortexM.M0PLUS, uses_kernel_sum=False + ), +} + + +@parametrize("variant", test_variants) +def test_dialect_small_magnitude_linear(variant: _IsaVariant): + tester = CortexMTester( + variant.case.model, + variant.case.get_example_inputs(), + target_config=variant.target_config, + ) + tester.test_dialect( + ops_before_transforms=variant.case.model.ops_before_transforms, + ops_after_transforms=variant.case.model.ops_after_transforms, + qtol=1, + calibration_samples=_calibration_samples, + ) + + +@parametrize("variant", test_variants) +def test_aot_graph_shape_small_magnitude_linear(variant: _IsaVariant): + """Assert the post-pass node args match the ISA's CMSIS-NN convention.""" + tester = CortexMTester( + variant.case.model, + variant.case.get_example_inputs(), + target_config=variant.target_config, + ) + tester.quantize(None) + tester.export() + tester.to_edge() + tester.run_passes() + + module = tester.get_artifact(StageType.RUN_PASSES).exported_program().module() + linear_target = exir_ops.edge.cortex_m.quantized_linear.default + linear_nodes = [ + n for n in module.graph.nodes if n.op == "call_function" and n.target == 
linear_target + ] + assert len(linear_nodes) == 1, f"expected one quantized_linear node, got {len(linear_nodes)}" + bias_arg, kernel_sum_arg = linear_nodes[0].args[2], linear_nodes[0].args[3] + + if variant.uses_kernel_sum: + assert bias_arg is None, "MVE path must not pass bias (CMSIS-NN ignores it)" + assert kernel_sum_arg is not None, "MVE path requires precomputed kernel_sum" + else: + assert kernel_sum_arg is None, "non-MVE path must not pass kernel_sum" + # bias is allowed to be None only if the source nn.Linear had bias=False. + expects_bias = variant.case.model.fc.bias is not None + if expects_bias: + assert bias_arg is not None, "non-MVE path with bias must forward bias to CMSIS-NN" + + +def test_implementation_small_magnitude_linear(): + """Exercise the MVE kernel_sum codepath via the default M55 simulator build.""" + case = McuTestCase( + model=_SmallMagnitudeLinear().eval(), + example_inputs=lambda: (_make_input(),), + ) + tester = CortexMTester(case.model, case.get_example_inputs()) + tester.test_implementation(qtol=1, calibration_samples=_calibration_samples)