Skip to content

Commit bda1735

Browse files
committed
add wasm implementation again
1 parent cd7b03a commit bda1735

File tree

12 files changed

+1040
-8
lines changed

12 files changed

+1040
-8
lines changed

benchmark/decode-string.ts

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,27 @@
11
/* eslint-disable no-console */
2-
import { utf8EncodeJs, utf8Count, utf8DecodeJs, utf8DecodeTD } from "../src/utils/utf8";
2+
import { utf8EncodeJs, utf8Count, utf8DecodeJs, utf8DecodeTD, WASM_AVAILABLE } from "../src/utils/utf8.ts";
3+
import { getWasmError, utf8DecodeWasm } from "../src/utils/utf8-wasm.ts";
34

45
// @ts-ignore
56
import Benchmark from "benchmark";
67

8+
// Report the state of the wasm backend before any benchmark runs
console.log("=".repeat(60));
console.log("WebAssembly Status:");
console.log(` WASM_AVAILABLE: ${WASM_AVAILABLE}`);
if (!WASM_AVAILABLE) {
  // Print why initialization failed, when a reason was recorded
  const wasmError = getWasmError();
  const reason = wasmError?.message;
  console.log(` Error: ${reason || "unknown"}`);

  // Only suggest the runtime flag when the message mentions the string builtins
  const mentionsBuiltins = ["js-string", "builtin"].some((needle) => reason?.includes(needle));
  if (mentionsBuiltins) {
    console.log("\n js-string-builtins is enabled by default in Node.js 24+ (V8 13.6+).");
    console.log(" For older versions, run with:");
    console.log(" node --experimental-wasm-imported-strings node_modules/.bin/ts-node benchmark/decode-string.ts");
  }
} else {
  console.log(" js-string-builtins: enabled");
}
console.log("=".repeat(60));
24+
725
for (const baseStr of ["A", "あ", "🌏"]) {
826
const dataSet = [10, 100, 500, 1_000].map((n) => {
927
return baseStr.repeat(n);
@@ -24,11 +42,20 @@ for (const baseStr of ["A", "あ", "🌏"]) {
2442
}
2543
});
2644

27-
suite.add("TextDecoder", () => {
45+
suite.add("utf8DecodeTD (TextDecoder)", () => {
2846
if (utf8DecodeTD(bytes, 0, byteLength) !== str) {
2947
throw new Error("wrong result!");
3048
}
3149
});
50+
51+
if (WASM_AVAILABLE) {
52+
suite.add("utf8DecodeWasm", () => {
53+
if (utf8DecodeWasm(bytes, 0, byteLength) !== str) {
54+
throw new Error("wrong result!");
55+
}
56+
});
57+
}
58+
3259
suite.on("cycle", (event: any) => {
3360
console.log(String(event.target));
3461
});

benchmark/encode-string.ts

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,27 @@
11
/* eslint-disable no-console */
2-
import { utf8EncodeJs, utf8Count, utf8EncodeTE } from "../src/utils/utf8";
2+
import { utf8EncodeJs, utf8Count, utf8EncodeTE, WASM_AVAILABLE } from "../src/utils/utf8.ts";
3+
import { getWasmError, utf8EncodeWasm } from "../src/utils/utf8-wasm.ts";
34

45
// @ts-ignore
56
import Benchmark from "benchmark";
67

8+
// Report the state of the wasm backend before any benchmark runs
console.log("=".repeat(60));
console.log("WebAssembly Status:");
console.log(` WASM_AVAILABLE: ${WASM_AVAILABLE}`);
if (!WASM_AVAILABLE) {
  // Print why initialization failed, when a reason was recorded
  const wasmError = getWasmError();
  const reason = wasmError?.message;
  console.log(` Error: ${reason || "unknown"}`);

  // Only suggest the runtime flag when the message mentions the string builtins
  const mentionsBuiltins = ["js-string", "builtin"].some((needle) => reason?.includes(needle));
  if (mentionsBuiltins) {
    console.log("\n js-string-builtins is enabled by default in Node.js 24+ (V8 13.6+).");
    console.log(" For older versions, run with:");
    console.log(" node --experimental-wasm-imported-strings node_modules/.bin/ts-node benchmark/encode-string.ts");
  }
} else {
  console.log(" js-string-builtins: enabled");
}
console.log("=".repeat(60));
24+
725
for (const baseStr of ["A", "あ", "🌏"]) {
826
const dataSet = [10, 30, 50, 100].map((n) => {
927
return baseStr.repeat(n);
@@ -21,9 +39,16 @@ for (const baseStr of ["A", "あ", "🌏"]) {
2139
utf8EncodeJs(str, buffer, 0);
2240
});
2341

24-
suite.add("utf8DecodeTE", () => {
42+
suite.add("utf8EncodeTE (TextEncoder)", () => {
2543
utf8EncodeTE(str, buffer, 0);
2644
});
45+
46+
if (WASM_AVAILABLE) {
47+
suite.add("utf8EncodeWasm", () => {
48+
utf8EncodeWasm(str, buffer, 0);
49+
});
50+
}
51+
2752
suite.on("cycle", (event: any) => {
2853
console.log(String(event.target));
2954
});

benchmark/key-decoder.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* eslint-disable no-console */
2-
import { utf8EncodeJs, utf8Count, utf8DecodeJs } from "../src/utils/utf8";
2+
import { utf8EncodeJs, utf8Count, utf8DecodeJs } from "../src/utils/utf8.ts";
33

44
// @ts-ignore
55
import Benchmark from "benchmark";

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"prepublishOnly": "npm run test:dist",
3030
"clean": "rimraf build dist dist.*",
3131
"test": "mocha 'test/**/*.test.ts'",
32+
"test:wasm": "MSGPACK_WASM=force node --experimental-wasm-imported-strings node_modules/.bin/mocha 'test/**/*.test.ts'",
3233
"test:dist": "npm run lint && npm run test && npm run test:deno",
3334
"test:cover": "npm run cover:clean && npx nyc --no-clean npm run 'test' && npm run cover:report",
3435
"test:node_with_strip_types": "node --experimental-strip-types test/deno_test.ts",

src/utils/utf8-wasm-binary.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Auto-generated by wasm/build.sh - do not edit manually
2+
// Source: wasm/utf8.wat
3+
4+
export const wasmBinary = "AGFzbQEAAAABHwVgAW8Bf2ACb38Bf2ABfwFkb2ACb28BZG9gAn9/AW8CawQOd2FzbTpqcy1zdHJpbmcGbGVuZ3RoAAAOd2FzbTpqcy1zdHJpbmcKY2hhckNvZGVBdAABDndhc206anMtc3RyaW5nDGZyb21DaGFyQ29kZQACDndhc206anMtc3RyaW5nBmNvbmNhdAADAwQDAAEEBQMBAAEHMAQGbWVtb3J5AgAJdXRmOENvdW50AAQKdXRmOEVuY29kZQAFCnV0ZjhEZWNvZGUABgqXBgN1AQR/IAAQACECAkADQCABIAJPDQEgACABEAEhBCAEQYABSQRAIANBAWohAwUgBEGAEEkEQCADQQJqIQMFIARBgLADTyAEQf+3A01xBEAgA0EEaiEDIAFBAWohAQUgA0EDaiEDCwsLIAFBAWohAQwACwALIAMLvQIBBX8gABAAIQMgASEEAkADQCACIANPDQEgACACEAEhBSAFQYABSQRAIAQgBToAACAEQQFqIQQFIAVBgBBJBEAgBCAFQQZ2QcABcjoAACAEQQFqIAVBP3FBgAFyOgAAIARBAmohBAUgBUGAsANPIAVB/7cDTXEEQCACQQFqIQIgACACEAEhBiAFQYCwA2tBCnQgBkGAuANrakGAgARqIQUgBCAFQRJ2QfABcjoAACAEQQFqIAVBDHZBP3FBgAFyOgAAIARBAmogBUEGdkE/cUGAAXI6AAAgBEEDaiAFQT9xQYABcjoAACAEQQRqIQQFIAQgBUEMdkHgAXI6AAAgBEEBaiAFQQZ2QT9xQYABcjoAACAEQQJqIAVBP3FBgAFyOgAAIARBA2ohBAsLCyACQQFqIQIMAAsACyAEIAFrC98CAgd/AW8gACECIAAgAWohA0EAEAIhCQJAA0AgAiADTw0BIAItAAAhBCAEQYABcUUEQCAJIAQQAhADIQkgAkEBaiECDAELIARB4AFxQcABRgRAIAJBAWotAAAhBSAEQR9xQQZ0IAVBP3FyIQggCSAIEAIQAyEJIAJBAmohAgwBCyAEQfABcUHgAUYEQCACQQFqLQAAIQUgAkECai0AACEGIARBD3FBDHQgBUE/cUEGdHIgBkE/cXIhCCAJIAgQAhADIQkgAkEDaiECDAELIARB+AFxQfABRgRAIAJBAWotAAAhBSACQQJqLQAAIQYgAkEDai0AACEHIARBB3FBEnQgBUE/cUEMdHIgBkE/cUEGdHIgB0E/cXIhCCAIQYCABGshCCAJIAhBCnZBgLADchACEAMhCSAJIAhB/wdxQYC4A3IQAhADIQkgAkEEaiECDAELIAJBAWohAgwACwALIAkL";

src/utils/utf8-wasm.ts

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
/**
2+
* WebAssembly-based UTF-8 string processing using js-string-builtins.
3+
*
4+
* Environment variables:
5+
* - MSGPACK_WASM=force: Force wasm mode, throw error if wasm fails to load
6+
* - MSGPACK_WASM=never: Disable wasm, always use pure JS
7+
*
8+
* Three-tier fallback:
9+
* 1. Native js-string-builtins (Chrome 130+, Firefox 134+)
10+
* 2. Wasm + polyfill (older browsers with WebAssembly)
11+
* 3. Pure JS (no WebAssembly support)
12+
*/
13+
14+
import { wasmBinary } from "./utf8-wasm-binary.ts";
15+
16+
// Check environment variable for wasm mode
17+
declare const process: { env?: Record<string, string | undefined> } | undefined;

/**
 * Resolve the wasm loading mode from the MSGPACK_WASM environment variable.
 *
 * @returns "force" or "never" when the variable holds that value
 *   (compared case-insensitively); "auto" for any other value, when the
 *   variable is unset or empty, or when the environment cannot be read.
 */
function getWasmMode(): "force" | "never" | "auto" {
  try {
    const requested = process?.env?.["MSGPACK_WASM"]?.toLowerCase();
    if (requested === "force" || requested === "never") {
      return requested;
    }
  } catch {
    // `process` may not be defined at all (e.g. in a browser); fall through
  }
  return "auto";
}
39+
40+
const WASM_MODE = getWasmMode();
41+
42+
/**
 * Typed view of the exports object of the UTF-8 wasm module.
 */
interface WasmExports {
  /** Linear memory through which UTF-8 bytes are exchanged with the module. */
  memory: WebAssembly.Memory;
  /** Returns the UTF-8 byte length of `str`. */
  utf8Count: (str: string) => number;
  /** Encodes `str` into `memory` starting at `offset`; returns the number of bytes written. */
  utf8Encode: (str: string, offset: number) => number;
  /** Decodes `length` bytes of `memory` starting at `offset` into a string. */
  utf8Decode: (offset: number, length: number) => string;
}
48+
49+
let wasmInstance: WasmExports | null = null;
50+
let wasmInitError: Error | null = null;
51+
52+
/**
 * Decode a base64 string into its raw bytes.
 *
 * Uses the global `atob` when it exists and falls back to `Buffer` otherwise.
 *
 * @param base64 - base64-encoded payload
 * @returns a new Uint8Array holding the decoded bytes
 */
function base64ToBytes(base64: string): Uint8Array {
  if (typeof atob !== "function") {
    // Node.js fallback
    return new Uint8Array(Buffer.from(base64, "base64"));
  }

  // atob yields a "binary string": one character per byte
  const binary = atob(base64);
  return Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
}
64+
65+
// Plain-JS stand-ins for the "wasm:js-string" imports, passed as the import
// object at instantiation (used when native builtins are unavailable)
const jsStringPolyfill = {
  // eslint-disable-next-line @typescript-eslint/naming-convention
  "wasm:js-string": {
    length(s: string) {
      return s.length;
    },
    charCodeAt(s: string, i: number) {
      return s.charCodeAt(i);
    },
    fromCharCode(code: number) {
      return String.fromCharCode(code);
    },
    concat(a: string, b: string) {
      return a + b;
    },
  },
};
75+
76+
/**
 * Compile and instantiate the embedded wasm module once.
 *
 * On success the exports are stored in `wasmInstance`; on failure the error is
 * stored in `wasmInitError`. A second call is a no-op once either is set.
 *
 * @throws Error when MSGPACK_WASM=force is in effect and loading fails
 */
function tryInitWasm(): void {
  const alreadyAttempted = wasmInstance !== null || wasmInitError !== null;
  if (alreadyAttempted) {
    return;
  }

  if (WASM_MODE === "never") {
    wasmInitError = new Error("MSGPACK_WASM=never: wasm disabled");
    return;
  }

  try {
    if (typeof WebAssembly === "undefined") {
      throw new Error("WebAssembly not supported");
    }

    const binary = base64ToBytes(wasmBinary);

    // Request native js-string builtins at compile time. Per the original
    // note, engines without builtins support ignore the option and the
    // polyfill imports below are used instead. The cast is needed because the
    // second constructor argument is passed through `any` here.
    const ModuleCtor = WebAssembly.Module as any;
    const compiled: WebAssembly.Module = new ModuleCtor(binary, { builtins: ["js-string"] });

    const instantiated = new WebAssembly.Instance(compiled, jsStringPolyfill);
    wasmInstance = instantiated.exports as unknown as WasmExports;
  } catch (e) {
    const failure = e instanceof Error ? e : new Error(String(e));
    wasmInitError = failure;

    if (WASM_MODE === "force") {
      throw new Error(`MSGPACK_WASM=force but wasm failed to load: ${failure.message}`);
    }
  }
}
110+
111+
// Initialize on module load
112+
tryInitWasm();
113+
114+
/**
115+
* Whether wasm is available and initialized.
116+
*/
117+
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
118+
export const WASM_AVAILABLE = (wasmInstance !== null);
119+
120+
/**
 * Report why wasm initialization did not succeed.
 *
 * @returns the error recorded during initialization, or `null` if none was recorded
 */
export function getWasmError(): Error | null {
  return wasmInitError;
}
126+
127+
/**
 * Expose the raw wasm exports for advanced usage.
 *
 * @returns the exports of the instantiated module, or `null` when wasm is not initialized
 */
export function getWasmExports(): WasmExports | null {
  return wasmInstance;
}
133+
134+
/**
 * Compute the UTF-8 byte length of a string with the wasm module.
 *
 * @param str - string to measure
 * @returns the value reported by the wasm `utf8Count` export
 * @throws Error if the wasm module is not initialized
 */
export function utf8CountWasm(str: string): number {
  const wasm = wasmInstance;
  if (wasm === null) {
    throw new Error("wasm not initialized");
  }
  return wasm.utf8Count(str);
}
143+
144+
/**
 * Encode a string as UTF-8 into the provided buffer.
 *
 * The wasm module encodes into the start of its own linear memory; the
 * resulting bytes are then copied into `output` at `outputOffset`.
 *
 * @param str - string to encode
 * @param output - destination buffer
 * @param outputOffset - index in `output` at which the first byte is written
 * @returns the number of bytes written
 * @throws Error if the wasm module is not initialized
 */
export function utf8EncodeWasm(str: string, output: Uint8Array, outputOffset: number): number {
  const wasm = wasmInstance;
  if (wasm === null) {
    throw new Error("wasm not initialized");
  }

  const pageSize = 65536;

  // The string is encoded at offset 0 of wasm memory, so only its own byte
  // length determines how much wasm memory is needed. `outputOffset` indexes
  // `output`, not wasm memory; counting it here would grow wasm memory (which
  // can never shrink again) in proportion to the caller's buffer position.
  const byteLength = wasm.utf8Count(str);
  const requiredPages = Math.ceil(byteLength / pageSize);
  const currentPages = wasm.memory.buffer.byteLength / pageSize;

  if (requiredPages > currentPages) {
    wasm.memory.grow(requiredPages - currentPages);
  }

  // Encode to wasm memory
  const bytesWritten = wasm.utf8Encode(str, 0);

  // Create the view only after a possible grow(): growing replaces memory.buffer
  const wasmBytes = new Uint8Array(wasm.memory.buffer, 0, bytesWritten);
  output.set(wasmBytes, outputOffset);

  return bytesWritten;
}
171+
172+
/**
 * Decode a range of UTF-8 bytes into a string with the wasm module.
 *
 * The bytes are copied to the start of wasm memory, decoded there, and the
 * leading NUL character of the raw result is dropped.
 *
 * @param bytes - buffer containing the UTF-8 data
 * @param inputOffset - index of the first byte to decode
 * @param byteLength - number of bytes to decode
 * @returns the decoded string
 * @throws Error if the wasm module is not initialized
 */
export function utf8DecodeWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  const wasm = wasmInstance;
  if (wasm === null) {
    throw new Error("wasm not initialized");
  }

  // Grow wasm memory (in 64 KiB pages) until the input fits
  const pageSize = 65536;
  const pagesNeeded = Math.ceil(byteLength / pageSize);
  const pagesAvailable = wasm.memory.buffer.byteLength / pageSize;
  if (pagesNeeded > pagesAvailable) {
    wasm.memory.grow(pagesNeeded - pagesAvailable);
  }

  // Stage the input at offset 0 of wasm memory
  const staging = new Uint8Array(wasm.memory.buffer, 0, byteLength);
  const source = bytes.subarray(inputOffset, inputOffset + byteLength);
  staging.set(source);

  const decoded = wasm.utf8Decode(0, byteLength);

  // Remove leading NUL character (artifact of wasm implementation)
  if (decoded.length > 0 && decoded.charCodeAt(0) === 0) {
    return decoded.slice(1);
  }
  return decoded;
}

src/utils/utf8.ts

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
export function utf8Count(str: string): number {
1+
import { WASM_AVAILABLE, utf8CountWasm, utf8EncodeWasm, utf8DecodeWasm } from "./utf8-wasm.ts";
2+
3+
export { WASM_AVAILABLE };
4+
5+
export function utf8CountJs(str: string): number {
26
const strLength = str.length;
37

48
let byteLength = 0;
@@ -38,6 +42,8 @@ export function utf8Count(str: string): number {
3842
return byteLength;
3943
}
4044

45+
export const utf8Count: (str: string) => number = WASM_AVAILABLE ? utf8CountWasm : utf8CountJs;
46+
4147
export function utf8EncodeJs(str: string, output: Uint8Array, outputOffset: number): void {
4248
const strLength = str.length;
4349
let offset = outputOffset;
@@ -98,14 +104,34 @@ export function utf8EncodeTE(str: string, output: Uint8Array, outputOffset: numb
98104
sharedTextEncoder.encodeInto(str, output.subarray(outputOffset));
99105
}
100106

101-
export function utf8Encode(str: string, output: Uint8Array, outputOffset: number): void {
107+
// Wasm threshold: use wasm for medium strings, TextEncoder for large strings
108+
// These thresholds should be determined by benchmarking.
109+
// Run `npx ts-node benchmark/encode-string.ts` for details.
110+
const WASM_ENCODE_MAX = 1000;
111+
112+
/**
 * Encode `str` into `output`, choosing the encoder by string length:
 * above WASM_ENCODE_MAX the TextEncoder path, above TEXT_ENCODER_THRESHOLD
 * the wasm path, otherwise the pure-JS path.
 */
function utf8EncodeWithWasm(str: string, output: Uint8Array, outputOffset: number): void {
  const units = str.length;

  if (units > WASM_ENCODE_MAX) {
    utf8EncodeTE(str, output, outputOffset);
    return;
  }

  if (units > TEXT_ENCODER_THRESHOLD) {
    utf8EncodeWasm(str, output, outputOffset);
    return;
  }

  utf8EncodeJs(str, output, outputOffset);
}
122+
123+
/**
 * Encode `str` into `output` without wasm: strings longer than
 * TEXT_ENCODER_THRESHOLD use the TextEncoder path, all others the pure-JS path.
 */
function utf8EncodeNoWasm(str: string, output: Uint8Array, outputOffset: number): void {
  const encode = str.length > TEXT_ENCODER_THRESHOLD ? utf8EncodeTE : utf8EncodeJs;
  encode(str, output, outputOffset);
}
108130

131+
export const utf8Encode: (str: string, output: Uint8Array, outputOffset: number) => void = WASM_AVAILABLE
132+
? utf8EncodeWithWasm
133+
: utf8EncodeNoWasm;
134+
109135
const CHUNK_SIZE = 0x1_000;
110136

111137
export function utf8DecodeJs(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
@@ -168,10 +194,27 @@ export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength:
168194
return sharedTextDecoder.decode(stringBytes);
169195
}
170196

171-
export function utf8Decode(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
197+
// Wasm decode threshold: use wasm for medium strings, TextDecoder for large strings
198+
const WASM_DECODE_MAX = 1000;
199+
200+
/**
 * Decode `byteLength` bytes of `bytes` starting at `inputOffset`, choosing the
 * decoder by input size: above WASM_DECODE_MAX the TextDecoder path, above
 * TEXT_DECODER_THRESHOLD the wasm path, otherwise the pure-JS path.
 */
function utf8DecodeWithWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  if (byteLength > WASM_DECODE_MAX) {
    return utf8DecodeTD(bytes, inputOffset, byteLength);
  }

  if (byteLength > TEXT_DECODER_THRESHOLD) {
    return utf8DecodeWasm(bytes, inputOffset, byteLength);
  }

  return utf8DecodeJs(bytes, inputOffset, byteLength);
}
209+
210+
/**
 * Decode `byteLength` bytes of `bytes` starting at `inputOffset` without wasm:
 * inputs longer than TEXT_DECODER_THRESHOLD use the TextDecoder path, all
 * others the pure-JS path.
 */
function utf8DecodeNoWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
  const decode = byteLength > TEXT_DECODER_THRESHOLD ? utf8DecodeTD : utf8DecodeJs;
  return decode(bytes, inputOffset, byteLength);
}
217+
218+
export const utf8Decode: (bytes: Uint8Array, inputOffset: number, byteLength: number) => string = WASM_AVAILABLE
219+
? utf8DecodeWithWasm
220+
: utf8DecodeNoWasm;

0 commit comments

Comments
 (0)