Skip to content

Commit 06034dc

Browse files
committed
optimize wasm impl
1 parent 1747710 commit 06034dc

File tree

7 files changed: +119 additions, -93 deletions

benchmark/decode-string.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ import { getWasmError, utf8DecodeWasm } from "../src/utils/utf8-wasm.ts";
55
// @ts-ignore
66
import Benchmark from "benchmark";
77

8+
// Describe each benchmarked decoder implementation
9+
console.log("utf8DecodeJs - pure JS implementation");
10+
console.log("utf8DecodeTD - TextDecoder implementation");
11+
console.log("utf8DecodeWasm - WebAssembly implementation");
12+
813
// Show wasm status
914
console.log("=".repeat(60));
1015
console.log("WebAssembly Status:");
@@ -42,7 +47,7 @@ for (const baseStr of ["A", "あ", "🌏"]) {
4247
}
4348
});
4449

45-
suite.add("utf8DecodeTD (TextDecoder)", () => {
50+
suite.add("utf8DecodeTD", () => {
4651
if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
4752
throw new Error("wrong result!");
4853
}

benchmark/encode-string.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ import { getWasmError, utf8EncodeWasm } from "../src/utils/utf8-wasm.ts";
55
// @ts-ignore
66
import Benchmark from "benchmark";
77

8+
// Describe each benchmarked encoder implementation
9+
console.log("utf8EncodeJs - pure JS implementation");
10+
console.log("utf8EncodeTE - TextEncoder implementation");
11+
console.log("utf8EncodeWasm - WebAssembly implementation");
12+
813
// Show wasm status
914
console.log("=".repeat(60));
1015
console.log("WebAssembly Status:");
@@ -39,7 +44,7 @@ for (const baseStr of ["A", "あ", "🌏"]) {
3944
utf8EncodeJs(str, buffer, 0);
4045
});
4146

42-
suite.add("utf8EncodeTE (TextEncoder)", () => {
47+
suite.add("utf8EncodeTE", () => {
4348
utf8EncodeTE(str, buffer, 0);
4449
});
4550

src/utils/utf8-wasm-binary.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
// Auto-generated by wasm/build.sh - do not edit manually
22
// Source: wasm/utf8.wat
33

4-
export const wasmBinary = "AGFzbQEAAAABHgVgAW8Bf2ACb38Bf2ABfwFkb2ACb28BZG9gAX8BfwJrBA53YXNtOmpzLXN0cmluZwZsZW5ndGgAAA53YXNtOmpzLXN0cmluZwpjaGFyQ29kZUF0AAEOd2FzbTpqcy1zdHJpbmcMZnJvbUNoYXJDb2RlAAIOd2FzbTpqcy1zdHJpbmcGY29uY2F0AAMDBAMAAQQFAwEAAQYIAX8AQYCAAgsHOAQGbWVtb3J5AgAJdXRmOENvdW50AAQKdXRmOEVuY29kZQAFEnV0ZjhEZWNvZGVUb01lbW9yeQAGCqoGA3UBBH8gABAAIQICQANAIAEgAk8NASAAIAEQASEEIARBgAFJBEAgA0EBaiEDBSAEQYAQSQRAIANBAmohAwUgBEGAsANPIARB/7cDTXEEQCADQQRqIQMgAUEBaiEBBSADQQNqIQMLCwsgAUEBaiEBDAALAAsgAwu9AgEFfyAAEAAhAyABIQQCQANAIAIgA08NASAAIAIQASEFIAVBgAFJBEAgBCAFOgAAIARBAWohBAUgBUGAEEkEQCAEIAVBBnZBwAFyOgAAIARBAWogBUE/cUGAAXI6AAAgBEECaiEEBSAFQYCwA08gBUH/twNNcQRAIAJBAWohAiAAIAIQASEGIAVBgLADa0EKdCAGQYC4A2tqQYCABGohBSAEIAVBEnZB8AFyOgAAIARBAWogBUEMdkE/cUGAAXI6AAAgBEECaiAFQQZ2QT9xQYABcjoAACAEQQNqIAVBP3FBgAFyOgAAIARBBGohBAUgBCAFQQx2QeABcjoAACAEQQFqIAVBBnZBP3FBgAFyOgAAIARBAmogBUE/cUGAAXI6AAAgBEEDaiEECwsLIAJBAWohAgwACwALIAQgAWsL8gIBCH9BACEBIAAhAiMAIQMCQANAIAEgAk8NASABLQAAIQQgBEGAAXFFBEAgAyAEOwEAIANBAmohAyABQQFqIQEMAQsgBEHgAXFBwAFGBEAgAUEBai0AACEFIARBH3FBBnQgBUE/cXIhCCADIAg7AQAgA0ECaiEDIAFBAmohAQwBCyAEQfABcUHgAUYEQCABQQFqLQAAIQUgAUECai0AACEGIARBD3FBDHQgBUE/cUEGdHIgBkE/cXIhCCADIAg7AQAgA0ECaiEDIAFBA2ohAQwBCyAEQfgBcUHwAUYEQCABQQFqLQAAIQUgAUECai0AACEGIAFBA2otAAAhByAEQQdxQRJ0IAVBP3FBDHRyIAZBP3FBBnRyIAdBP3FyIQggCEGAgARrIQggAyAIQQp2QYCwA3I7AQAgA0ECaiEDIAMgCEH/B3FBgLgDcjsBACADQQJqIQMgAUEEaiEBDAELIAFBAWohAQwACwALIAMjAGtBAXYL";
4+
export const wasmBinary = "AGFzbQEAAAABNQhedwFgAW8Bf2ADb2QAfwF/YANkAH9/AWRvYAJvfwF/YAJ/ZAABf2ABfwFkAGADZAB/fwFvAl8DDndhc206anMtc3RyaW5nBmxlbmd0aAABDndhc206anMtc3RyaW5nEWludG9DaGFyQ29kZUFycmF5AAIOd2FzbTpqcy1zdHJpbmcRZnJvbUNoYXJDb2RlQXJyYXkAAwMGBQEEBQYHBQMBAAEHVAYGbWVtb3J5AgAJdXRmOENvdW50AAMKdXRmOEVuY29kZQAEEXV0ZjhEZWNvZGVUb0FycmF5AAUKYWxsb2NBcnJheQAGDWFycmF5VG9TdHJpbmcABwr7BgWUAQIEfwFkACAAEAAhASABRQRAQQAPC0EAIAH7BgAhBSAAIAVBABABGgJAA0AgAiABTw0BIAUgAvsNACEEIARBgAFJBEAgA0EBaiEDBSAEQYAQSQRAIANBAmohAwUgBEGAsANPIARB/7cDTXEEQCADQQRqIQMgAkEBaiECBSADQQNqIQMLCwsgAkEBaiECDAALAAsgAwvdAgIFfwFkACAAEAAhAiABIQQgAkUEQEEADwtBACAC+wYAIQcgACAHQQAQARoCQANAIAMgAk8NASAHIAP7DQAhBSAFQYABSQRAIAQgBToAACAEQQFqIQQFIAVBgBBJBEAgBCAFQQZ2QcABcjoAACAEQQFqIAVBP3FBgAFyOgAAIARBAmohBAUgBUGAsANPIAVB/7cDTXEEQCADQQFqIQMgByAD+w0AIQYgBUGAsANrQQp0IAZBgLgDa2pBgIAEaiEFIAQgBUESdkHwAXI6AAAgBEEBaiAFQQx2QT9xQYABcjoAACAEQQJqIAVBBnZBP3FBgAFyOgAAIARBA2ogBUE/cUGAAXI6AAAgBEEEaiEEBSAEIAVBDHZB4AFyOgAAIARBAWogBUEGdkE/cUGAAXI6AAAgBEECaiAFQT9xQYABcjoAACAEQQNqIQQLCwsgA0EBaiEDDAALAAsgBCABawvuAgEIfyAAIQMCQANAIAIgA08NASACLQAAIQUgBUGAAXFFBEAgASAEIAX7DgAgBEEBaiEEIAJBAWohAgwBCyAFQeABcUHAAUYEQCACQQFqLQAAIQYgBUEfcUEGdCAGQT9xciEJIAEgBCAJ+w4AIARBAWohBCACQQJqIQIMAQsgBUHwAXFB4AFGBEAgAkEBai0AACEGIAJBAmotAAAhByAFQQ9xQQx0IAZBP3FBBnRyIAdBP3FyIQkgASAEIAn7DgAgBEEBaiEEIAJBA2ohAgwBCyAFQfgBcUHwAUYEQCACQQFqLQAAIQYgAkECai0AACEHIAJBA2otAAAhCCAFQQdxQRJ0IAZBP3FBDHRyIAdBP3FBBnRyIAhBP3FyIQkgCUGAgARrIQkgASAEIAlBCnZBgLADcvsOACAEQQFqIQQgASAEIAlB/wdxQYC4A3L7DgAgBEEBaiEEIAJBBGohAgwBCyACQQFqIQIMAAsACyAECwkAQQAgAPsGAAsKACAAIAEgAhACCw==";

src/utils/utf8-wasm.ts

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
/**
2-
* WebAssembly-based UTF-8 string processing using js-string-builtins.
2+
* WebAssembly-based UTF-8 string processing using js-string-builtins with GC arrays.
33
*
44
* Environment variables:
55
* - MSGPACK_WASM=force: Force wasm mode, throw error if wasm fails to load
66
* - MSGPACK_WASM=never: Disable wasm, always use pure JS
77
*
8-
* Three-tier fallback:
9-
* 1. Native js-string-builtins (Chrome 130+, Firefox 134+)
10-
* 2. Wasm + polyfill (older browsers with WebAssembly)
11-
* 3. Pure JS (no WebAssembly support)
8+
* This implementation uses WASM GC arrays with intoCharCodeArray/fromCharCodeArray
9+
* for efficient bulk string operations instead of character-by-character processing.
1210
*/
1311

1412
import { wasmBinary } from "./utf8-wasm-binary.ts";
@@ -39,19 +37,18 @@ function getWasmMode(): "force" | "never" | "auto" {
3937

4038
const WASM_MODE = getWasmMode();
4139

40+
// GC array type (opaque reference)
41+
type I16Array = object;
42+
4243
interface WasmExports {
4344
memory: WebAssembly.Memory;
4445
utf8Count(str: string): number;
4546
utf8Encode(str: string, offset: number): number;
46-
utf8DecodeToMemory(length: number): number;
47+
utf8DecodeToArray(length: number, arr: I16Array): number;
48+
allocArray(size: number): I16Array;
49+
arrayToString(arr: I16Array, start: number, end: number): string;
4750
}
4851

49-
// Memory layout constants (must match WAT file)
50-
const UTF16_OFFSET = 32768; // 32KB offset for UTF-16 output
51-
52-
// Shared TextDecoder for UTF-16LE decoding
53-
const utf16Decoder = new TextDecoder("utf-16le");
54-
5552
let wasmInstance: WasmExports | null = null;
5653
let wasmInitError: Error | null = null;
5754

@@ -68,17 +65,6 @@ function base64ToBytes(base64: string): Uint8Array {
6865
return new Uint8Array(Buffer.from(base64, "base64"));
6966
}
7067

71-
// Polyfill for js-string-builtins (used when native builtins unavailable)
72-
const jsStringPolyfill = {
73-
// eslint-disable-next-line @typescript-eslint/naming-convention
74-
"wasm:js-string": {
75-
length: (s: string) => s.length,
76-
charCodeAt: (s: string, i: number) => s.charCodeAt(i),
77-
fromCharCode: (code: number) => String.fromCharCode(code),
78-
concat: (a: string, b: string) => a + b,
79-
},
80-
};
81-
8268
function tryInitWasm(): void {
8369
if (wasmInstance !== null || wasmInitError !== null) {
8470
return; // Already initialized or failed
@@ -96,14 +82,9 @@ function tryInitWasm(): void {
9682

9783
const bytes = base64ToBytes(wasmBinary);
9884

99-
// Try with builtins option (native support)
100-
// If builtins not supported, option is ignored and polyfill is used
101-
102-
85+
// Requires js-string builtins support (Node.js 24+ / Chrome 130+ / Firefox 134+)
10386
const module: WebAssembly.Module = new (WebAssembly.Module as any)(bytes, { builtins: ["js-string"] });
104-
105-
106-
const instance = new (WebAssembly.Instance)(module, jsStringPolyfill);
87+
const instance = new WebAssembly.Instance(module);
10788
wasmInstance = instance.exports as unknown as WasmExports;
10889
} catch (e) {
10990
wasmInitError = e instanceof Error ? e : new Error(String(e));
@@ -121,7 +102,7 @@ tryInitWasm();
121102
* Whether wasm is available and initialized.
122103
*/
123104
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
124-
export const WASM_AVAILABLE = (wasmInstance !== null);
105+
export const WASM_AVAILABLE = wasmInstance !== null;
125106

126107
/**
127108
* Get the wasm initialization error, if any.
@@ -156,16 +137,20 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
156137
throw new Error("wasm not initialized");
157138
}
158139

140+
// Estimate max byte length without a full pass over the string.
141+
// Each UTF-16 code unit can produce at most 3 UTF-8 bytes (BMP chars).
142+
// Surrogate pairs (2 code units) produce 4 bytes, so 3 bytes/code unit is safe.
143+
const maxByteLength = str.length * 3;
144+
159145
// Ensure wasm memory is large enough
160-
const byteLength = wasmInstance.utf8Count(str);
161-
const requiredPages = Math.ceil((outputOffset + byteLength) / 65536);
146+
const requiredPages = Math.ceil(maxByteLength / 65536);
162147
const currentPages = wasmInstance.memory.buffer.byteLength / 65536;
163148

164149
if (requiredPages > currentPages) {
165150
wasmInstance.memory.grow(requiredPages - currentPages);
166151
}
167152

168-
// Encode to wasm memory
153+
// Encode to wasm memory (uses intoCharCodeArray for bulk char extraction)
169154
const bytesWritten = wasmInstance.utf8Encode(str, 0);
170155

171156
// Copy from wasm memory to output buffer
@@ -177,32 +162,36 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
177162

178163
/**
179164
* Decode UTF-8 bytes to string.
165+
* Uses GC arrays with fromCharCodeArray for efficient string creation.
180166
*/
181167
export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
182168
if (wasmInstance === null) {
183169
throw new Error("wasm not initialized");
184170
}
185171

186-
// Ensure wasm memory is large enough
187-
// Need space for UTF-8 input (0 to byteLength) and UTF-16 output (UTF16_OFFSET onwards)
188-
// Max UTF-16 output is 2 bytes per code unit, and max expansion is 2x (for ASCII)
189-
const utf16MaxBytes = byteLength * 2;
190-
const requiredBytes = UTF16_OFFSET + utf16MaxBytes;
191-
const requiredPages = Math.ceil(requiredBytes / 65536);
172+
// Handle empty input
173+
if (byteLength === 0) {
174+
return "";
175+
}
176+
177+
// Ensure wasm memory is large enough for UTF-8 input
178+
const requiredPages = Math.ceil(byteLength / 65536);
192179
const currentPages = wasmInstance.memory.buffer.byteLength / 65536;
193180

194181
if (requiredPages > currentPages) {
195182
wasmInstance.memory.grow(requiredPages - currentPages);
196183
}
197184

198-
// Copy bytes to wasm memory at offset 0
185+
// Copy UTF-8 bytes to wasm linear memory at offset 0
199186
const wasmBytes = new Uint8Array(wasmInstance.memory.buffer, 0, byteLength);
200187
wasmBytes.set(bytes.subarray(inputOffset, inputOffset + byteLength));
201188

202-
// Decode UTF-8 to UTF-16 in wasm memory, get number of code units
203-
const codeUnits = wasmInstance.utf8DecodeToMemory(byteLength);
189+
// Allocate GC array for UTF-16 output: at most one code unit per input byte (worst case is pure ASCII)
190+
const arr = wasmInstance.allocArray(byteLength);
191+
192+
// Decode UTF-8 to UTF-16 in GC array
193+
const codeUnits = wasmInstance.utf8DecodeToArray(byteLength, arr);
204194

205-
// Read UTF-16 code units from wasm memory and decode to string
206-
const utf16Bytes = new Uint8Array(wasmInstance.memory.buffer, UTF16_OFFSET, codeUnits * 2);
207-
return utf16Decoder.decode(utf16Bytes);
195+
// Create string directly from GC array using fromCharCodeArray
196+
return wasmInstance.arrayToString(arr, 0, codeUnits);
208197
}

test/utf8-wasm.test.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ describe("utf8-wasm", () => {
2020
assert.ok(exports !== null);
2121
assert.ok(typeof exports!.utf8Count === "function");
2222
assert.ok(typeof exports!.utf8Encode === "function");
23-
assert.ok(typeof exports!.utf8DecodeToMemory === "function");
23+
assert.ok(typeof exports!.utf8DecodeToArray === "function");
24+
assert.ok(typeof exports!.allocArray === "function");
25+
assert.ok(typeof exports!.arrayToString === "function");
2426
assert.ok(exports!.memory instanceof WebAssembly.Memory);
2527
} else {
2628
assert.strictEqual(exports, null);

wasm/build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ set -e
77
cd "$(dirname "$0")"
88

99
echo "Compiling utf8.wat -> utf8.wasm..."
10-
wasm-as utf8.wat -o utf8.wasm --enable-reference-types --enable-gc
10+
wasm-as utf8.wat -o utf8.wasm --enable-reference-types --enable-gc --enable-strings
1111

1212
echo "Generating base64 TypeScript module..."
1313
cat > ../src/utils/utf8-wasm-binary.ts << 'HEADER'

0 commit comments

Comments
 (0)