11/**
2- * WebAssembly-based UTF-8 string processing using js-string-builtins.
2+ * WebAssembly-based UTF-8 string processing using js-string-builtins with GC arrays.
33 *
44 * Environment variables:
55 * - MSGPACK_WASM=force: Force wasm mode, throw error if wasm fails to load
66 * - MSGPACK_WASM=never: Disable wasm, always use pure JS
77 *
8- * Three-tier fallback:
9- * 1. Native js-string-builtins (Chrome 130+, Firefox 134+)
10- * 2. Wasm + polyfill (older browsers with WebAssembly)
11- * 3. Pure JS (no WebAssembly support)
8+ * This implementation uses WASM GC arrays with intoCharCodeArray/fromCharCodeArray
9+ * for efficient bulk string operations instead of character-by-character processing.
1210 */
1311
1412import { wasmBinary } from "./utf8-wasm-binary.ts" ;
@@ -39,19 +37,18 @@ function getWasmMode(): "force" | "never" | "auto" {
3937
4038const WASM_MODE = getWasmMode ( ) ;
4139
40+ // GC array type (opaque reference)
41+ type I16Array = object ;
42+
4243interface WasmExports {
4344 memory : WebAssembly . Memory ;
4445 utf8Count ( str : string ) : number ;
4546 utf8Encode ( str : string , offset : number ) : number ;
46- utf8DecodeToMemory ( length : number ) : number ;
47+ utf8DecodeToArray ( length : number , arr : I16Array ) : number ;
48+ allocArray ( size : number ) : I16Array ;
49+ arrayToString ( arr : I16Array , start : number , end : number ) : string ;
4750}
4851
49- // Memory layout constants (must match WAT file)
50- const UTF16_OFFSET = 32768 ; // 32KB offset for UTF-16 output
51-
52- // Shared TextDecoder for UTF-16LE decoding
53- const utf16Decoder = new TextDecoder ( "utf-16le" ) ;
54-
5552let wasmInstance : WasmExports | null = null ;
5653let wasmInitError : Error | null = null ;
5754
@@ -68,17 +65,6 @@ function base64ToBytes(base64: string): Uint8Array {
6865 return new Uint8Array ( Buffer . from ( base64 , "base64" ) ) ;
6966}
7067
71- // Polyfill for js-string-builtins (used when native builtins unavailable)
72- const jsStringPolyfill = {
73- // eslint-disable-next-line @typescript-eslint/naming-convention
74- "wasm:js-string" : {
75- length : ( s : string ) => s . length ,
76- charCodeAt : ( s : string , i : number ) => s . charCodeAt ( i ) ,
77- fromCharCode : ( code : number ) => String . fromCharCode ( code ) ,
78- concat : ( a : string , b : string ) => a + b ,
79- } ,
80- } ;
81-
8268function tryInitWasm ( ) : void {
8369 if ( wasmInstance !== null || wasmInitError !== null ) {
8470 return ; // Already initialized or failed
@@ -96,14 +82,9 @@ function tryInitWasm(): void {
9682
9783 const bytes = base64ToBytes ( wasmBinary ) ;
9884
99- // Try with builtins option (native support)
100- // If builtins not supported, option is ignored and polyfill is used
101-
102-
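85+ // Requires native js-string builtins support (Node.js 24+ / Chrome 130+ / Firefox 134+); no polyfill is supplied, so any compile/instantiate failure is recorded in wasmInitError by the catch below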
10386 const module : WebAssembly . Module = new ( WebAssembly . Module as any ) ( bytes , { builtins : [ "js-string" ] } ) ;
104-
105-
106- const instance = new ( WebAssembly . Instance ) ( module , jsStringPolyfill ) ;
87+ const instance = new WebAssembly . Instance ( module ) ;
10788 wasmInstance = instance . exports as unknown as WasmExports ;
10889 } catch ( e ) {
10990 wasmInitError = e instanceof Error ? e : new Error ( String ( e ) ) ;
@@ -121,7 +102,7 @@ tryInitWasm();
121102 * Whether wasm is available and initialized.
122103 */
123104// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
124- export const WASM_AVAILABLE = ( wasmInstance !== null ) ;
105+ export const WASM_AVAILABLE = wasmInstance !== null ;
125106
126107/**
127108 * Get the wasm initialization error, if any.
@@ -156,16 +137,20 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
156137 throw new Error ( "wasm not initialized" ) ;
157138 }
158139
140+ // Estimate max byte length without a full pass over the string.
141+ // Each UTF-16 code unit can produce at most 3 UTF-8 bytes (BMP chars).
142+ // Surrogate pairs (2 code units) produce 4 bytes, so 3 bytes/code unit is safe.
143+ const maxByteLength = str . length * 3 ;
144+
159145 // Ensure wasm memory is large enough
160- const byteLength = wasmInstance . utf8Count ( str ) ;
161- const requiredPages = Math . ceil ( ( outputOffset + byteLength ) / 65536 ) ;
146+ const requiredPages = Math . ceil ( maxByteLength / 65536 ) ;
162147 const currentPages = wasmInstance . memory . buffer . byteLength / 65536 ;
163148
164149 if ( requiredPages > currentPages ) {
165150 wasmInstance . memory . grow ( requiredPages - currentPages ) ;
166151 }
167152
168- // Encode to wasm memory
153+ // Encode to wasm memory (uses intoCharCodeArray for bulk char extraction)
169154 const bytesWritten = wasmInstance . utf8Encode ( str , 0 ) ;
170155
171156 // Copy from wasm memory to output buffer
@@ -177,32 +162,36 @@ export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: nu
177162
178163/**
179164 * Decode UTF-8 bytes to string.
165+ * Uses GC arrays with fromCharCodeArray for efficient string creation.
180166 */
181167export function utf8DecodeWasm ( bytes : Uint8Array , inputOffset : number , byteLength : number ) : string {
182168 if ( wasmInstance === null ) {
183169 throw new Error ( "wasm not initialized" ) ;
184170 }
185171
186- // Ensure wasm memory is large enough
187- // Need space for UTF-8 input (0 to byteLength) and UTF-16 output (UTF16_OFFSET onwards)
188- // Max UTF-16 output is 2 bytes per code unit, and max expansion is 2x (for ASCII)
189- const utf16MaxBytes = byteLength * 2 ;
190- const requiredBytes = UTF16_OFFSET + utf16MaxBytes ;
191- const requiredPages = Math . ceil ( requiredBytes / 65536 ) ;
172+ // Handle empty input
173+ if ( byteLength === 0 ) {
174+ return "" ;
175+ }
176+
177+ // Ensure wasm memory is large enough for UTF-8 input
178+ const requiredPages = Math . ceil ( byteLength / 65536 ) ;
192179 const currentPages = wasmInstance . memory . buffer . byteLength / 65536 ;
193180
194181 if ( requiredPages > currentPages ) {
195182 wasmInstance . memory . grow ( requiredPages - currentPages ) ;
196183 }
197184
198- // Copy bytes to wasm memory at offset 0
185+ // Copy UTF-8 bytes to wasm linear memory at offset 0
199186 const wasmBytes = new Uint8Array ( wasmInstance . memory . buffer , 0 , byteLength ) ;
200187 wasmBytes . set ( bytes . subarray ( inputOffset , inputOffset + byteLength ) ) ;
201188
202- // Decode UTF-8 to UTF-16 in wasm memory, get number of code units
203- const codeUnits = wasmInstance . utf8DecodeToMemory ( byteLength ) ;
189+ // Allocate GC array for UTF-16 output (each UTF-8 byte yields at most one UTF-16 code unit; all-ASCII input is the worst case, so byteLength elements always suffice)
190+ const arr = wasmInstance . allocArray ( byteLength ) ;
191+
192+ // Decode UTF-8 to UTF-16 in GC array
193+ const codeUnits = wasmInstance . utf8DecodeToArray ( byteLength , arr ) ;
204194
205- // Read UTF-16 code units from wasm memory and decode to string
206- const utf16Bytes = new Uint8Array ( wasmInstance . memory . buffer , UTF16_OFFSET , codeUnits * 2 ) ;
207- return utf16Decoder . decode ( utf16Bytes ) ;
195+ // Create string directly from GC array using fromCharCodeArray
196+ return wasmInstance . arrayToString ( arr , 0 , codeUnits ) ;
208197}
0 commit comments