Skip to content

Commit 1747710

Browse files
committed
optimize utf8decode-wasm
1 parent bda1735 commit 1747710

File tree

4 files changed

+53
-46
lines changed

4 files changed

+53
-46
lines changed

src/utils/utf8-wasm-binary.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
// Auto-generated by wasm/build.sh - do not edit manually
22
// Source: wasm/utf8.wat
33

4-
export const wasmBinary = "AGFzbQEAAAABHwVgAW8Bf2ACb38Bf2ABfwFkb2ACb28BZG9gAn9/AW8CawQOd2FzbTpqcy1zdHJpbmcGbGVuZ3RoAAAOd2FzbTpqcy1zdHJpbmcKY2hhckNvZGVBdAABDndhc206anMtc3RyaW5nDGZyb21DaGFyQ29kZQACDndhc206anMtc3RyaW5nBmNvbmNhdAADAwQDAAEEBQMBAAEHMAQGbWVtb3J5AgAJdXRmOENvdW50AAQKdXRmOEVuY29kZQAFCnV0ZjhEZWNvZGUABgqXBgN1AQR/IAAQACECAkADQCABIAJPDQEgACABEAEhBCAEQYABSQRAIANBAWohAwUgBEGAEEkEQCADQQJqIQMFIARBgLADTyAEQf+3A01xBEAgA0EEaiEDIAFBAWohAQUgA0EDaiEDCwsLIAFBAWohAQwACwALIAMLvQIBBX8gABAAIQMgASEEAkADQCACIANPDQEgACACEAEhBSAFQYABSQRAIAQgBToAACAEQQFqIQQFIAVBgBBJBEAgBCAFQQZ2QcABcjoAACAEQQFqIAVBP3FBgAFyOgAAIARBAmohBAUgBUGAsANPIAVB/7cDTXEEQCACQQFqIQIgACACEAEhBiAFQYCwA2tBCnQgBkGAuANrakGAgARqIQUgBCAFQRJ2QfABcjoAACAEQQFqIAVBDHZBP3FBgAFyOgAAIARBAmogBUEGdkE/cUGAAXI6AAAgBEEDaiAFQT9xQYABcjoAACAEQQRqIQQFIAQgBUEMdkHgAXI6AAAgBEEBaiAFQQZ2QT9xQYABcjoAACAEQQJqIAVBP3FBgAFyOgAAIARBA2ohBAsLCyACQQFqIQIMAAsACyAEIAFrC98CAgd/AW8gACECIAAgAWohA0EAEAIhCQJAA0AgAiADTw0BIAItAAAhBCAEQYABcUUEQCAJIAQQAhADIQkgAkEBaiECDAELIARB4AFxQcABRgRAIAJBAWotAAAhBSAEQR9xQQZ0IAVBP3FyIQggCSAIEAIQAyEJIAJBAmohAgwBCyAEQfABcUHgAUYEQCACQQFqLQAAIQUgAkECai0AACEGIARBD3FBDHQgBUE/cUEGdHIgBkE/cXIhCCAJIAgQAhADIQkgAkEDaiECDAELIARB+AFxQfABRgRAIAJBAWotAAAhBSACQQJqLQAAIQYgAkEDai0AACEHIARBB3FBEnQgBUE/cUEMdHIgBkE/cUEGdHIgB0E/cXIhCCAIQYCABGshCCAJIAhBCnZBgLADchACEAMhCSAJIAhB/wdxQYC4A3IQAhADIQkgAkEEaiECDAELIAJBAWohAgwACwALIAkL";
4+
export const wasmBinary = "AGFzbQEAAAABHgVgAW8Bf2ACb38Bf2ABfwFkb2ACb28BZG9gAX8BfwJrBA53YXNtOmpzLXN0cmluZwZsZW5ndGgAAA53YXNtOmpzLXN0cmluZwpjaGFyQ29kZUF0AAEOd2FzbTpqcy1zdHJpbmcMZnJvbUNoYXJDb2RlAAIOd2FzbTpqcy1zdHJpbmcGY29uY2F0AAMDBAMAAQQFAwEAAQYIAX8AQYCAAgsHOAQGbWVtb3J5AgAJdXRmOENvdW50AAQKdXRmOEVuY29kZQAFEnV0ZjhEZWNvZGVUb01lbW9yeQAGCqoGA3UBBH8gABAAIQICQANAIAEgAk8NASAAIAEQASEEIARBgAFJBEAgA0EBaiEDBSAEQYAQSQRAIANBAmohAwUgBEGAsANPIARB/7cDTXEEQCADQQRqIQMgAUEBaiEBBSADQQNqIQMLCwsgAUEBaiEBDAALAAsgAwu9AgEFfyAAEAAhAyABIQQCQANAIAIgA08NASAAIAIQASEFIAVBgAFJBEAgBCAFOgAAIARBAWohBAUgBUGAEEkEQCAEIAVBBnZBwAFyOgAAIARBAWogBUE/cUGAAXI6AAAgBEECaiEEBSAFQYCwA08gBUH/twNNcQRAIAJBAWohAiAAIAIQASEGIAVBgLADa0EKdCAGQYC4A2tqQYCABGohBSAEIAVBEnZB8AFyOgAAIARBAWogBUEMdkE/cUGAAXI6AAAgBEECaiAFQQZ2QT9xQYABcjoAACAEQQNqIAVBP3FBgAFyOgAAIARBBGohBAUgBCAFQQx2QeABcjoAACAEQQFqIAVBBnZBP3FBgAFyOgAAIARBAmogBUE/cUGAAXI6AAAgBEEDaiEECwsLIAJBAWohAgwACwALIAQgAWsL8gIBCH9BACEBIAAhAiMAIQMCQANAIAEgAk8NASABLQAAIQQgBEGAAXFFBEAgAyAEOwEAIANBAmohAyABQQFqIQEMAQsgBEHgAXFBwAFGBEAgAUEBai0AACEFIARBH3FBBnQgBUE/cXIhCCADIAg7AQAgA0ECaiEDIAFBAmohAQwBCyAEQfABcUHgAUYEQCABQQFqLQAAIQUgAUECai0AACEGIARBD3FBDHQgBUE/cUEGdHIgBkE/cXIhCCADIAg7AQAgA0ECaiEDIAFBA2ohAQwBCyAEQfgBcUHwAUYEQCABQQFqLQAAIQUgAUECai0AACEGIAFBA2otAAAhByAEQQdxQRJ0IAVBP3FBDHRyIAZBP3FBBnRyIAdBP3FyIQggCEGAgARrIQggAyAIQQp2QYCwA3I7AQAgA0ECaiEDIAMgCEH/B3FBgLgDcjsBACADQQJqIQMgAUEEaiEBDAELIAFBAWohAQwACwALIAMjAGtBAXYL";

src/utils/utf8-wasm.ts

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,15 @@ interface WasmExports {
4343
memory: WebAssembly.Memory;
4444
utf8Count(str: string): number;
4545
utf8Encode(str: string, offset: number): number;
46-
utf8Decode(offset: number, length: number): string;
46+
utf8DecodeToMemory(length: number): number;
4747
}
4848

49+
// Memory layout constants (must match the $utf16_offset global in wasm/utf8.wat)
50+
const UTF16_OFFSET = 32768; // 32KB offset for UTF-16 output
51+
52+
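// Shared TextDecoder for UTF-16LE decoding. NOTE(review): with default options it strips a leading U+FEFF (BOM) and replaces lone surrogates with U+FFFD - confirm this is acceptable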
53+
const utf16Decoder = new TextDecoder("utf-16le");
54+
4955
let wasmInstance: WasmExports | null = null;
5056
let wasmInitError: Error | null = null;
5157

@@ -178,20 +184,25 @@ export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLengt
178184
}
179185

180186
// Ensure wasm memory is large enough
181-
const requiredPages = Math.ceil(byteLength / 65536);
187+
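// Need space for UTF-8 input (bytes 0 to byteLength) and UTF-16 output (UTF16_OFFSET onwards). NOTE(review): these regions overlap when byteLength > UTF16_OFFSET, so output writes could clobber unread input - confirm callers cap byteLength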
188+
// Each UTF-16 code unit takes 2 bytes; the worst case is ASCII, where every input byte becomes one code unit, so the output is at most 2x the input size
189+
const utf16MaxBytes = byteLength * 2;
190+
const requiredBytes = UTF16_OFFSET + utf16MaxBytes;
191+
const requiredPages = Math.ceil(requiredBytes / 65536);
182192
const currentPages = wasmInstance.memory.buffer.byteLength / 65536;
183193

184194
if (requiredPages > currentPages) {
185195
wasmInstance.memory.grow(requiredPages - currentPages);
186196
}
187197

188-
// Copy bytes to wasm memory
198+
// Copy bytes to wasm memory at offset 0
189199
const wasmBytes = new Uint8Array(wasmInstance.memory.buffer, 0, byteLength);
190200
wasmBytes.set(bytes.subarray(inputOffset, inputOffset + byteLength));
191201

192-
// Decode from wasm memory
193-
const result = wasmInstance.utf8Decode(0, byteLength);
202+
// Decode UTF-8 to UTF-16 in wasm memory, get number of code units
203+
const codeUnits = wasmInstance.utf8DecodeToMemory(byteLength);
194204

195-
// Remove leading NUL character (artifact of wasm implementation)
196-
return result.length > 0 && result.charCodeAt(0) === 0 ? result.slice(1) : result;
205+
// Read UTF-16 code units from wasm memory and decode to string
206+
const utf16Bytes = new Uint8Array(wasmInstance.memory.buffer, UTF16_OFFSET, codeUnits * 2);
207+
return utf16Decoder.decode(utf16Bytes);
197208
}

test/utf8-wasm.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ describe("utf8-wasm", () => {
2020
assert.ok(exports !== null);
2121
assert.ok(typeof exports!.utf8Count === "function");
2222
assert.ok(typeof exports!.utf8Encode === "function");
23-
assert.ok(typeof exports!.utf8Decode === "function");
23+
assert.ok(typeof exports!.utf8DecodeToMemory === "function");
2424
assert.ok(exports!.memory instanceof WebAssembly.Memory);
2525
} else {
2626
assert.strictEqual(exports, null);

wasm/utf8.wat

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,14 @@
1313
(import "wasm:js-string" "concat"
1414
(func $str_concat (param externref externref) (result (ref extern))))
1515

16-
;; Linear memory for UTF-8 bytes (64KB initial, exported for JS access)
16+
;; Linear memory layout:
17+
;; - 0 to 32KB: UTF-8 input bytes
18+
;; - 32KB onwards: UTF-16 code units output (i16 array)
1719
(memory (export "memory") 1)
1820

21+
;; Offset where UTF-16 output starts (32KB = 32768)
22+
(global $utf16_offset i32 (i32.const 32768))
23+
1924
;; Count UTF-8 byte length of a JS string
2025
;; This is equivalent to Buffer.byteLength(str, 'utf8') or TextEncoder().encode(str).length
2126
(func (export "utf8Count") (param $str externref) (result i32)
@@ -134,22 +139,23 @@
134139

135140
(i32.sub (local.get $pos) (local.get $offset)))
136141

137-
;; Decode UTF-8 bytes from linear memory to JS string
138-
;; Reads from offset for length bytes
139-
(func (export "utf8Decode") (param $offset i32) (param $length i32) (result externref)
142+
;; Decode UTF-8 bytes to UTF-16 code units in memory
143+
;; Reads UTF-8 from offset 0 for $length bytes
144+
;; Writes UTF-16 code units to utf16_offset
145+
;; Returns number of UTF-16 code units written
146+
(func (export "utf8DecodeToMemory") (param $length i32) (result i32)
140147
(local $pos i32)
141148
(local $end i32)
142-
(local $result externref)
149+
(local $outPos i32)
143150
(local $byte1 i32)
144151
(local $byte2 i32)
145152
(local $byte3 i32)
146153
(local $byte4 i32)
147154
(local $codePoint i32)
148155

149-
(local.set $pos (local.get $offset))
150-
(local.set $end (i32.add (local.get $offset) (local.get $length)))
151-
;; Start with empty string (NUL char, will be trimmed by JS side if needed)
152-
(local.set $result (call $str_fromCharCode (i32.const 0)))
156+
(local.set $pos (i32.const 0))
157+
(local.set $end (local.get $length))
158+
(local.set $outPos (global.get $utf16_offset))
153159

154160
(block $break
155161
(loop $continue
@@ -160,10 +166,8 @@
160166
;; 1-byte: 0xxxxxxx
161167
(if (i32.eqz (i32.and (local.get $byte1) (i32.const 0x80)))
162168
(then
163-
(local.set $result
164-
(call $str_concat
165-
(local.get $result)
166-
(call $str_fromCharCode (local.get $byte1))))
169+
(i32.store16 (local.get $outPos) (local.get $byte1))
170+
(local.set $outPos (i32.add (local.get $outPos) (i32.const 2)))
167171
(local.set $pos (i32.add (local.get $pos) (i32.const 1)))
168172
(br $continue)))
169173

@@ -175,10 +179,8 @@
175179
(i32.or
176180
(i32.shl (i32.and (local.get $byte1) (i32.const 0x1F)) (i32.const 6))
177181
(i32.and (local.get $byte2) (i32.const 0x3F))))
178-
(local.set $result
179-
(call $str_concat
180-
(local.get $result)
181-
(call $str_fromCharCode (local.get $codePoint))))
182+
(i32.store16 (local.get $outPos) (local.get $codePoint))
183+
(local.set $outPos (i32.add (local.get $outPos) (i32.const 2)))
182184
(local.set $pos (i32.add (local.get $pos) (i32.const 2)))
183185
(br $continue)))
184186

@@ -193,10 +195,8 @@
193195
(i32.shl (i32.and (local.get $byte1) (i32.const 0x0F)) (i32.const 12))
194196
(i32.shl (i32.and (local.get $byte2) (i32.const 0x3F)) (i32.const 6)))
195197
(i32.and (local.get $byte3) (i32.const 0x3F))))
196-
(local.set $result
197-
(call $str_concat
198-
(local.get $result)
199-
(call $str_fromCharCode (local.get $codePoint))))
198+
(i32.store16 (local.get $outPos) (local.get $codePoint))
199+
(local.set $outPos (i32.add (local.get $outPos) (i32.const 2)))
200200
(local.set $pos (i32.add (local.get $pos) (i32.const 3)))
201201
(br $continue)))
202202

@@ -217,28 +217,24 @@
217217
;; Convert to surrogate pair
218218
(local.set $codePoint (i32.sub (local.get $codePoint) (i32.const 0x10000)))
219219
;; High surrogate
220-
(local.set $result
221-
(call $str_concat
222-
(local.get $result)
223-
(call $str_fromCharCode
224-
(i32.or
225-
(i32.shr_u (local.get $codePoint) (i32.const 10))
226-
(i32.const 0xD800)))))
220+
(i32.store16 (local.get $outPos)
221+
(i32.or
222+
(i32.shr_u (local.get $codePoint) (i32.const 10))
223+
(i32.const 0xD800)))
224+
(local.set $outPos (i32.add (local.get $outPos) (i32.const 2)))
227225
;; Low surrogate
228-
(local.set $result
229-
(call $str_concat
230-
(local.get $result)
231-
(call $str_fromCharCode
232-
(i32.or
233-
(i32.and (local.get $codePoint) (i32.const 0x3FF))
234-
(i32.const 0xDC00)))))
226+
(i32.store16 (local.get $outPos)
227+
(i32.or
228+
(i32.and (local.get $codePoint) (i32.const 0x3FF))
229+
(i32.const 0xDC00)))
230+
(local.set $outPos (i32.add (local.get $outPos) (i32.const 2)))
235231
(local.set $pos (i32.add (local.get $pos) (i32.const 4)))
236232
(br $continue)))
237233

238234
;; Invalid byte, skip
239235
(local.set $pos (i32.add (local.get $pos) (i32.const 1)))
240236
(br $continue)))
241237

242-
;; Result has leading NUL char - JS side will slice it off
243-
(local.get $result))
238+
;; Return number of UTF-16 code units written
239+
(i32.shr_u (i32.sub (local.get $outPos) (global.get $utf16_offset)) (i32.const 1)))
244240
)

0 commit comments

Comments
 (0)