Optimize BN254 ecmul with field endomorphism

rodiazet · chfast · chfast · commit fd5bd44b2cb2 · 2025-12-18T09:52:56.000+01:00
Add `ecc::decompose()` procedure to split ECC scalar into two smaller
ones. Use the decomposition to speed up BN254 scalar multiplication.

Co-authored-by: Paweł Bylica &lt;pawel@hepcolgum.band&gt;
diff --git a/lib/evmone_precompiles/bn254.cpp b/lib/evmone_precompiles/bn254.cpp
@@ -18,7 +18,17 @@ bool validate(const AffinePoint& pt) noexcept
 
 AffinePoint mul(const AffinePoint& pt, const uint256& c) noexcept
 {
-    const auto pr = ecc::mul(pt, c);
+    if (pt == 0)
+        return pt;
+
+    if (c == 0)
+        return {};
+
+    const auto [k1, k2] = ecc::decompose<Curve>(c);
+
+    const auto q = AffinePoint{Curve::BETA * pt.x, !k2.sign ? pt.y : -pt.y};
+    const auto p = AffinePoint{pt.x, !k1.sign ? pt.y : -pt.y};
+    const auto pr = msm(k1.value, p, k2.value, q);
     return ecc::to_affine(pr);
 }
 }  // namespace evmmax::bn254
diff --git a/lib/evmone_precompiles/bn254.hpp b/lib/evmone_precompiles/bn254.hpp
@@ -31,6 +31,23 @@ struct Curve
 
     static constexpr auto A = 0;
     static constexpr auto B = ecc::FieldElement<Curve>(3);
+
+    /// Endomorphism parameters. See ecc::decompose().
+    /// @{
+    /// λ
+    static constexpr auto LAMBDA = 0xb3c4d79d41a917585bfc41088d8daaa78b17ea66b99c90dd_u256;
+    /// β
+    static constexpr ecc::FieldElement<Curve> BETA{
+        0x59e26bcea0d48bacd4f263f1acdb5c4f5763473177fffffe_u256};
+    /// x₁
+    static constexpr auto X1 = 0x6f4d8248eeb859fd95b806bca6f338ee_u256;
+    /// -y₁
+    static constexpr auto MINUS_Y1 = 0x6f4d8248eeb859fbf83e9682e87cfd45_u256;
+    /// x₂
+    static constexpr auto X2 = 0x6f4d8248eeb859fc8211bbeb7d4f1128_u256;
+    /// y₂
+    static constexpr auto Y2 = 0x6f4d8248eeb859fd0be4e1541221250b_u256;
+    /// @}
 };
 
 using AffinePoint = ecc::AffinePoint<Curve>;
diff --git a/lib/evmone_precompiles/ecc.hpp b/lib/evmone_precompiles/ecc.hpp
@@ -513,4 +513,116 @@ ProjPoint<Curve> msm(const typename Curve::uint_type& u, const AffinePoint<Curve
     return r;
 }
 
+template <typename UIntT>
+struct SignedScalar
+{
+    bool sign = false;  // The sign of the scalar: false = positive, true = negative.
+    UIntT value;
+};
+
+
+/// Verifies k ≡ k₁ + k₂·λ (mod N) and checks that k₁ and k₂ are "short" scalars.
+template <typename Curve>
+[[maybe_unused, nodiscard]] bool verify_scalar_decomposition(const typename Curve::uint_type& k,
+    const SignedScalar<typename Curve::uint_type>& k1,
+    const SignedScalar<typename Curve::uint_type>& k2) noexcept
+{
+    // Verify k ≡ k₁ + k₂·λ (mod N).
+    {
+        static constexpr ModArith N{Curve::ORDER};
+        auto r_k1 = N.to_mont(k1.value);
+        if (k1.sign)
+            r_k1 = N.sub(0, r_k1);
+        auto r_k2 = N.to_mont(k2.value);
+        if (k2.sign)
+            r_k2 = N.sub(0, r_k2);
+
+        const auto r_k = N.to_mont(k);
+
+        const auto right = N.add(r_k1, N.mul(r_k2, N.to_mont(Curve::LAMBDA)));
+        if (r_k != right)
+            return false;
+    }
+
+    // Verify for u = (k₁, k₂) that ‖u‖ <= max(‖v₁‖, ‖v₂‖).
+    {
+        static constexpr auto V1_NORM_SQUARED =
+            Curve::X1 * Curve::X1 + Curve::MINUS_Y1 * Curve::MINUS_Y1;
+        static constexpr auto V2_NORM_SQUARED = Curve::X2 * Curve::X2 + Curve::Y2 * Curve::Y2;
+        static constexpr auto MAX_NORM_SQUARED = std::max(V1_NORM_SQUARED, V2_NORM_SQUARED);
+        const auto u_norm_squared = k1.value * k1.value + k2.value * k2.value;
+        return u_norm_squared <= MAX_NORM_SQUARED;
+    }
+}
+
+/// Decomposes a scalar k into "short" scalars k₁ and k₂ such that k₁ + k₂·λ ≡ k (mod N).
+///
+/// This decomposition allows more efficient scalar multiplication by using the multi-scalar
+/// multiplication (MSM) and the GLV endomorphism.
+/// The endomorphism ϕ: E → E defined as (x,y) → (βx,y) with eigenvalue λ allows computing
+/// [λ](x,y) = (βx,y) with only one multiplication in 𝔽ₚ instead of a full scalar multiplication.
+///
+/// Moreover, to compute the short scalars k₁ and k₂, we need linearly independent short vectors
+/// (v₁=(x₁,y₁), v₂=(x₂,y₂)) such that f(v₁) = f(v₂) = 0,
+/// where f: ℤ×ℤ → ℤₙ is defined as (x,y) → (x + y·λ), where λ² + λ ≡ -1 mod N.
+///
+/// See https://www.iacr.org/archive/crypto2001/21390189.pdf for details.
+///
+/// The Curve type must provide the endomorphism parameters: LAMBDA, BETA, X1, MINUS_Y1, X2, Y2.
+template <typename Curve>
+std::array<SignedScalar<typename Curve::uint_type>, 2> decompose(
+    const typename Curve::uint_type& k) noexcept
+{
+    using UIntT = Curve::uint_type;
+
+    // Validate the provided setup parameters.
+    // λ² + λ ≡ -1 mod n
+    static_assert((umul(Curve::LAMBDA, Curve::LAMBDA) + Curve::LAMBDA + 1) % Curve::ORDER == 0);
+    // f: (x, y) → (x + λy) mod N
+    // f(v₁) = 0
+    static_assert(
+        (Curve::X1 + umul(Curve::ORDER - Curve::MINUS_Y1, Curve::LAMBDA)) % Curve::ORDER == 0);
+    // f(v₂) = 0
+    static_assert((Curve::X2 + umul(Curve::Y2, Curve::LAMBDA)) % Curve::ORDER == 0);
+
+    static constexpr auto round_div = [](const auto& a) noexcept {
+        // DET is the (v₁, v₂) matrix determinant.
+        static constexpr auto WIDE_DET =
+            umul(Curve::X1, Curve::Y2) + umul(Curve::X2, Curve::MINUS_Y1);
+        static_assert(WIDE_DET <= std::numeric_limits<UIntT>::max());
+        static constexpr auto DET = static_cast<UIntT>(WIDE_DET);
+        static constexpr auto HALF_DET = DET / 2;
+
+        const auto [wide_q, r] = udivrem(a, DET);
+        // Division reduces the quotient enough to fit into a single uint.
+        // This can be shown at compile-time by inspecting the DET and Y2/-Y1 values.
+        assert(wide_q < std::numeric_limits<UIntT>::max());
+        const auto q = static_cast<UIntT>(wide_q);
+        return q + (r > HALF_DET);  // Round to nearest.
+    };
+
+    // Solve a system of two equations using Cramer method.
+    // ⎡X1 X2⎤ * ⎡b1⎤ = ⎡k⎤
+    // ⎣Y1 Y2⎦   ⎣b2⎦   ⎣0⎦
+    // and then approximate to the nearest integers:
+    // b1 = ⌊ Y2·k ÷ DET⌉
+    // b2 = ⌊-Y1·k ÷ DET⌉
+    const auto b1 = round_div(umul(k, Curve::Y2));
+    const auto b2 = round_div(umul(k, Curve::MINUS_Y1));
+
+    // k1 = k - (x1*b1 + x2*b2)
+    const auto x1b1_x2b2 = umul(b1, Curve::X1) + umul(b2, Curve::X2);
+    const auto [wide_k1, k1_is_neg] = subc(decltype(x1b1_x2b2){k}, x1b1_x2b2);
+    const auto k1_abs = k1_is_neg ? -static_cast<UIntT>(wide_k1) : static_cast<UIntT>(wide_k1);
+
+    // k2 = 0 - (y1*b1 + y2*b2)
+    const auto [wide_k2, k2_is_neg] = subc(umul(b1, Curve::MINUS_Y1), umul(b2, Curve::Y2));
+    const auto k2_abs = k2_is_neg ? -static_cast<UIntT>(wide_k2) : static_cast<UIntT>(wide_k2);
+
+    const SignedScalar k1{k1_is_neg, k1_abs};
+    const SignedScalar k2{k2_is_neg, k2_abs};
+    assert(verify_scalar_decomposition<Curve>(k, k1, k2));
+    return {k1, k2};
+}
+
 }  // namespace evmmax::ecc
diff --git a/test/unittests/evmmax_bn254_mul_test.cpp b/test/unittests/evmmax_bn254_mul_test.cpp
@@ -9,6 +9,192 @@
 using namespace evmmax::bn254;
 using namespace evmone::test;
 
+TEST(evmmax, bn254_decompose)
+{
+    struct TestCase
+    {
+        uint256 k;
+        bool sign1 = false;
+        uint256 k1;
+        bool sign2 = false;
+        uint256 k2;
+    };
+
+    static const std::vector<TestCase> TEST_CASES = {
+        {0, false, 0, false, 0},
+        {1, false, 1, false, 0},
+        {
+            // For k = LAMBDA, the optimal decomposition is (0, 1)
+            // but the algorithm returns some other solution.
+            Curve::LAMBDA,
+            false,
+            0x89d3256894d213e3,
+            true,
+            Curve::X2 - 1,
+        },
+        {
+            Curve::ORDER - Curve::LAMBDA,
+            false,
+            0,
+            true,
+            1,
+        },
+        {
+            2 * Curve::ORDER,  // DET
+            false,
+            0,
+            false,
+            0,
+        },
+        {
+            2 * Curve::ORDER - 1,  // DET-1
+            true,
+            1,
+            false,
+            0,
+        },
+        {
+            2 * Curve::ORDER + 1,  // DET+1
+            false,
+            1,
+            false,
+            0,
+        },
+        {
+            Curve::ORDER,  // DET/2
+            false,
+            Curve::Y2,
+            false,
+            0x89d3256894d213e3,
+        },
+        {
+            Curve::ORDER - 1,  // DET/2-1
+            false,
+            Curve::Y2 - 1,
+            false,
+            0x89d3256894d213e3,
+        },
+        {
+            Curve::ORDER + 1,  // DET/2+1
+            true,
+            Curve::Y2 - 1,
+            true,
+            0x89d3256894d213e3,
+        },
+        {
+            0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff_u256,
+            true,
+            0x272d9e49b8c8ca4335756fc61411a7a3_u128,
+            false,
+            0x3f296ebc4b455178a6a2b71572d476d6_u128,
+        },
+        {
+            0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff_u256 % Curve::ORDER,
+            true,
+            0x272d9e49b8c8ca42aba24a5d7f3f93c0_u128,
+            true,
+            0x3024138ca3730883db6f04d60a7a9a52_u128,
+        },
+        {
+            0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff_u256 %
+                Curve::FIELD_PRIME,
+            true,
+            0x272d9e49b8c8ca3dd335f9b043dce0ca_u128,
+            false,
+            0x3f296ebc4b45517b57c272205aeeda45_u128,
+        },
+        {
+            Curve::X1,
+            false,
+            0,
+            false,
+            Curve::MINUS_Y1,
+        },
+        {
+            Curve::X2,
+            false,
+            Curve::X2,
+            false,
+            0,
+        },
+        {
+            Curve::MINUS_Y1,
+            false,
+            Curve::MINUS_Y1,
+            false,
+            0,
+        },
+        {
+            Curve::Y2,
+            true,
+            0x89d3256894d213e3,
+            false,
+            Curve::MINUS_Y1,
+        },
+        // Fuzzer finds:
+        {
+            0x30644e72e131a029b85045b68181585d2833e84879b9709143e1fd91c6ea5404_u256,
+            true,
+            0x6f4d8248eeb859fd0be4d9563b36d108_u128,
+            true,
+            0x89d3256894d213e3,
+        },
+        {
+            0x30644e72e131a029b85045b68181585d9781875b000000000000000000000000_u256,
+            false,
+            0x1cc9978e3571b0392917fddedaf4_u128,
+            true,
+            0x89d3256894d213e3,
+        },
+        {
+            0x00b3c4d79d41a917585bfc41088d8daaa78b17e6af48a03bbfd25e8cd0364141_u256,
+            true,
+            0x3b770fc551d2da1732fc9bebf_u128,
+            false,
+            0x100000000000000,
+        },
+        {
+            0x30644e72e131a02a6c151d53c32a6fb58431295107473e18805b7c56ba7d94de_u256,
+            false,
+            0x10000000022dfb1619c5c10e10400_u128,
+            false,
+            1,
+        },
+        {
+            0xfffffffffc2f0000000000000000000000000000000000000000000000000000_u256,
+            true,
+            0x4b3beb451625cff3c66c0effee355638_u128,
+            true,
+            0x11e8b394e6bd3d13de2f7f7d38887a8c_u128,
+        },
+        {
+            0x000000000000000059e26bcea0d48bac65a4e1a8be2302524b7e65dd65dedb2a_u256,
+            false,
+            0x36,
+            true,
+            0x44e992b44a6909f1,
+        },
+    };
+
+    static constexpr auto decompose = evmmax::ecc::decompose<Curve>;
+
+    for (const auto& t : TEST_CASES)
+    {
+        const auto& [first, second] = decompose(t.k);
+        const auto& [sign1, k1] = first;
+        const auto& [sign2, k2] = second;
+
+        SCOPED_TRACE(hex(t.k));
+
+        EXPECT_EQ(sign1, t.sign1);
+        EXPECT_EQ(k1, t.k1) << hex(k1) << " != " << hex(t.k1);
+        EXPECT_EQ(sign2, t.sign2);
+        EXPECT_EQ(k2, t.k2) << hex(k2) << " != " << hex(t.k2);
+
+        EXPECT_TRUE(verify_scalar_decomposition<Curve>(t.k, first, second));
+    }
+}
+
 namespace
 {
 struct TestCase