From 29d5a2530e5cec7f752ae17d2141f486bcf96442 Mon Sep 17 00:00:00 2001 From: He-Pin Date: Sun, 31 May 2026 17:36:21 +0800 Subject: [PATCH] perf: allocation-free Val.Str extractor (~25% faster match) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivation: `Val.Str.unapply` returned `Some((pos, str))`, so every `case Val.Str(p, s)` match (115 sites across the evaluator, stdlib, and materializer) went through an `Option` + `Tuple2` layer. Even though JVM C2 escape analysis scalar-replaces those short-lived objects in tight loops (so heap allocation is already ~0), the extra Option/Tuple indirection still costs instructions per match. Modification: - Rewrite `Str.unapply` as an allocation-free name-based extractor returning a value class `StrExtract(self: Str)` (`isEmpty`/`get`), with `_1`/`_2` accessors on `Str`. The `AnyVal` result is consumed by the match desugaring without allocation, and the `Str` type test keeps the match refutable so the `AsciiSafeStr` subclass is matched exactly as before. All 115 call sites are unchanged. `StrExtract`/`unapply` are `private[sjsonnet]`. - Add `StrMatchBenchmark`, a JMH micro that isolates the extractor in a tight loop (mixing `AsciiSafeStr`) as a regression guard. Result: Isolated micro (1024 matches/op, -f4, 60 samples): 440.7 ± 2.8 ns/op -> 331.9 ± 2.9 ns/op, a reproducible ~25% (1.33x) speedup; both baseline and new allocate ~0 B/op (EA already removed the heap object — the win is instruction count). End-to-end (MainBenchmark) is within noise since Val.Str matching is a small fraction of total parse+eval+materialize work. Compiles on Scala 3.3.7 / 2.13.18 / 2.12.21; full JVM test suite green; zero behavior change. --- .../sjsonnet/bench/StrMatchBenchmark.scala | 59 +++++++++++++++++++ sjsonnet/src/sjsonnet/Val.scala | 24 +++++++- 2 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 bench/src/sjsonnet/bench/StrMatchBenchmark.scala diff --git a/bench/src/sjsonnet/bench/StrMatchBenchmark.scala b/bench/src/sjsonnet/bench/StrMatchBenchmark.scala new file mode 100644 index 00000000..e64974b9 --- /dev/null +++ b/bench/src/sjsonnet/bench/StrMatchBenchmark.scala @@ -0,0 +1,59 @@ +package sjsonnet.bench + +import org.openjdk.jmh.annotations.* +import org.openjdk.jmh.infra.* +import sjsonnet.{Position, Val} + +import java.util.concurrent.TimeUnit + +/** + * Micro-benchmark isolating the `case Val.Str(pos, s)` extractor cost — the operation changed by + * the zero-allocation `Str.unapply` rewrite (value-class extractor vs the old + * `Some[(Position, String)]`). + * + * The end-to-end [[MainBenchmark]] dilutes this to noise; this loop does nothing but match Val.Str + * (mixing the `AsciiSafeStr` subclass) and consume the bound `pos`/`str`, so any per-match + * difference shows up directly in ns/op and gc.alloc.rate.norm. + * + * Run: ./mill bench.runJmh ".*StrMatchBenchmark.*" -f 4 -wi 10 -i 15 -r 2 -w 1 -prof gc + */ +@BenchmarkMode(Array(Mode.AverageTime)) +@Fork(4) +@Threads(1) +@Warmup(iterations = 10, time = 1) +@Measurement(iterations = 15, time = 2) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Benchmark) +class StrMatchBenchmark { + + @Param(Array("1024")) + var n: Int = _ + + private var vals: Array[Val] = _ + + @Setup + def setup(): Unit = { + val pos = new Position(null, 0) + vals = Array.tabulate[Val](n) { i => + // Alternate plain Str and the AsciiSafeStr subclass so both flow through the same extractor. + if ((i & 1) == 0) Val.Str(pos, "value_field_" + i) + else Val.Str.asciiSafe(pos, "ascii_field_" + i) + } + } + + @Benchmark + def matchStr(bh: Blackhole): Unit = { + val arr = vals + var i = 0 + while (i < arr.length) { + arr(i) match { + case Val.Str(p, s) => + bh.consume(p) + bh.consume(s) + case other => + bh.consume(other) + } + i += 1 + } + } +} diff --git a/sjsonnet/src/sjsonnet/Val.scala b/sjsonnet/src/sjsonnet/Val.scala index c2b9e379..3acc8ffe 100644 --- a/sjsonnet/src/sjsonnet/Val.scala +++ b/sjsonnet/src/sjsonnet/Val.scala @@ -357,6 +357,11 @@ object Val { override def asString: String = str + // Product-extractor accessors backing `case Val.Str(pos, s)` (see Str.unapply). Reading these + // off the scrutinee avoids the per-match Some+Tuple2 allocation the old extractor incurred. + private[sjsonnet] def _1: Position = pos + private[sjsonnet] def _2: String = str + /** * Iterative rope flattening — stack-safe for arbitrarily deep trees. For a left-leaning rope of * depth N (typical from repeated foldl concat), the ArrayDeque holds at most 2 elements. @@ -422,8 +427,23 @@ object Val { /** Create a leaf string node marked as ASCII-safe (no JSON escaping needed). */ def asciiSafe(pos: Position, s: String): Str = new AsciiSafeStr(pos, s) - /** Backward-compatible extractor: `case Val.Str(pos, s) =>` still works. */ - def unapply(s: Str): Option[(Position, String)] = Some((s.pos, s.str)) + /** + * Backward-compatible extractor: `case Val.Str(pos, s) =>` still works, now allocation-free. + * + * The result is a value class ([[StrExtract]]) implementing the name-based extractor protocol + * (`isEmpty`/`get`). Because it is an `AnyVal`, the match desugaring consumes it without + * allocating — replacing the `Some` + `Tuple2` the old `Option[(Position, String)]` extractor + * allocated on every match. These matches are extremely hot across the evaluator, stdlib, and + * materializer. The `Str` type test before extraction keeps the match refutable, so + * `AsciiSafeStr` (the only subclass) is matched exactly as before. + */ + private[sjsonnet] def unapply(s: Str): StrExtract = new StrExtract(s) + + /** Allocation-free extractor result for [[Str.unapply]]; `get` exposes `_1`/`_2`. */ + private[sjsonnet] final class StrExtract(val self: Str) extends AnyVal { + def isEmpty: Boolean = self == null + def get: Str = self + } /** * O(1) rope concatenation. Falls back to eager concat for small flat strings to avoid rope node