|
13 | 13 | (import "wasm:js-string" "concat" |
14 | 14 | (func $str_concat (param externref externref) (result (ref extern)))) |
15 | 15 |
|
16 | | - ;; Linear memory for UTF-8 bytes (64KB initial, exported for JS access) |
| 16 | + ;; Linear memory layout: |
| 17 | + ;; - 0 to 32KB: UTF-8 input bytes |
| 18 | + ;; - 32KB onwards: UTF-16 code units output (16-bit units written with i32.store16; 16384 units fit in the initial 64KB page) |
17 | 19 | (memory (export "memory") 1) |
18 | 20 |
|
| 21 | + ;; Offset where UTF-16 output starts (32KB = 32768) |
| 22 | + (global $utf16_offset i32 (i32.const 32768)) |
| 23 | + |
19 | 24 | ;; Count UTF-8 byte length of a JS string |
20 | 25 | ;; This is equivalent to Buffer.byteLength(str, 'utf8') or TextEncoder().encode(str).length |
21 | 26 | (func (export "utf8Count") (param $str externref) (result i32) |
|
134 | 139 |
|
135 | 140 | (i32.sub (local.get $pos) (local.get $offset))) |
136 | 141 |
|
137 | | - ;; Decode UTF-8 bytes from linear memory to JS string |
138 | | - ;; Reads from offset for length bytes |
139 | | - (func (export "utf8Decode") (param $offset i32) (param $length i32) (result externref) |
| 142 | + ;; Decode UTF-8 bytes to UTF-16 code units in memory |
| 143 | + ;; Reads UTF-8 from offset 0 for $length bytes |
| 144 | + ;; Writes UTF-16 code units to utf16_offset |
| 145 | + ;; Returns number of UTF-16 code units written |
| 146 | + (func (export "utf8DecodeToMemory") (param $length i32) (result i32) |
140 | 147 | (local $pos i32) |
141 | 148 | (local $end i32) |
142 | | - (local $result externref) |
| 149 | + (local $outPos i32) |
143 | 150 | (local $byte1 i32) |
144 | 151 | (local $byte2 i32) |
145 | 152 | (local $byte3 i32) |
146 | 153 | (local $byte4 i32) |
147 | 154 | (local $codePoint i32) |
148 | 155 |
|
149 | | - (local.set $pos (local.get $offset)) |
150 | | - (local.set $end (i32.add (local.get $offset) (local.get $length))) |
151 | | - ;; Start with empty string (NUL char, will be trimmed by JS side if needed) |
152 | | - (local.set $result (call $str_fromCharCode (i32.const 0))) |
| 156 | + (local.set $pos (i32.const 0)) |
| 157 | + (local.set $end (local.get $length)) |
| 158 | + (local.set $outPos (global.get $utf16_offset)) |
153 | 159 |
|
154 | 160 | (block $break |
155 | 161 | (loop $continue |
|
160 | 166 | ;; 1-byte: 0xxxxxxx |
161 | 167 | (if (i32.eqz (i32.and (local.get $byte1) (i32.const 0x80))) |
162 | 168 | (then |
163 | | - (local.set $result |
164 | | - (call $str_concat |
165 | | - (local.get $result) |
166 | | - (call $str_fromCharCode (local.get $byte1)))) |
| 169 | + (i32.store16 (local.get $outPos) (local.get $byte1)) |
| 170 | + (local.set $outPos (i32.add (local.get $outPos) (i32.const 2))) |
167 | 171 | (local.set $pos (i32.add (local.get $pos) (i32.const 1))) |
168 | 172 | (br $continue))) |
169 | 173 |
|
|
175 | 179 | (i32.or |
176 | 180 | (i32.shl (i32.and (local.get $byte1) (i32.const 0x1F)) (i32.const 6)) |
177 | 181 | (i32.and (local.get $byte2) (i32.const 0x3F)))) |
178 | | - (local.set $result |
179 | | - (call $str_concat |
180 | | - (local.get $result) |
181 | | - (call $str_fromCharCode (local.get $codePoint)))) |
| 182 | + (i32.store16 (local.get $outPos) (local.get $codePoint)) |
| 183 | + (local.set $outPos (i32.add (local.get $outPos) (i32.const 2))) |
182 | 184 | (local.set $pos (i32.add (local.get $pos) (i32.const 2))) |
183 | 185 | (br $continue))) |
184 | 186 |
|
|
193 | 195 | (i32.shl (i32.and (local.get $byte1) (i32.const 0x0F)) (i32.const 12)) |
194 | 196 | (i32.shl (i32.and (local.get $byte2) (i32.const 0x3F)) (i32.const 6))) |
195 | 197 | (i32.and (local.get $byte3) (i32.const 0x3F)))) |
196 | | - (local.set $result |
197 | | - (call $str_concat |
198 | | - (local.get $result) |
199 | | - (call $str_fromCharCode (local.get $codePoint)))) |
| 198 | + (i32.store16 (local.get $outPos) (local.get $codePoint)) |
| 199 | + (local.set $outPos (i32.add (local.get $outPos) (i32.const 2))) |
200 | 200 | (local.set $pos (i32.add (local.get $pos) (i32.const 3))) |
201 | 201 | (br $continue))) |
202 | 202 |
|
|
217 | 217 | ;; Convert to surrogate pair |
218 | 218 | (local.set $codePoint (i32.sub (local.get $codePoint) (i32.const 0x10000))) |
219 | 219 | ;; High surrogate |
220 | | - (local.set $result |
221 | | - (call $str_concat |
222 | | - (local.get $result) |
223 | | - (call $str_fromCharCode |
224 | | - (i32.or |
225 | | - (i32.shr_u (local.get $codePoint) (i32.const 10)) |
226 | | - (i32.const 0xD800))))) |
| 220 | + (i32.store16 (local.get $outPos) |
| 221 | + (i32.or |
| 222 | + (i32.shr_u (local.get $codePoint) (i32.const 10)) |
| 223 | + (i32.const 0xD800))) |
| 224 | + (local.set $outPos (i32.add (local.get $outPos) (i32.const 2))) |
227 | 225 | ;; Low surrogate |
228 | | - (local.set $result |
229 | | - (call $str_concat |
230 | | - (local.get $result) |
231 | | - (call $str_fromCharCode |
232 | | - (i32.or |
233 | | - (i32.and (local.get $codePoint) (i32.const 0x3FF)) |
234 | | - (i32.const 0xDC00))))) |
| 226 | + (i32.store16 (local.get $outPos) |
| 227 | + (i32.or |
| 228 | + (i32.and (local.get $codePoint) (i32.const 0x3FF)) |
| 229 | + (i32.const 0xDC00))) |
| 230 | + (local.set $outPos (i32.add (local.get $outPos) (i32.const 2))) |
235 | 231 | (local.set $pos (i32.add (local.get $pos) (i32.const 4))) |
236 | 232 | (br $continue))) |
237 | 233 |
|
238 | 234 | ;; Invalid byte, skip |
239 | 235 | (local.set $pos (i32.add (local.get $pos) (i32.const 1))) |
240 | 236 | (br $continue))) |
241 | 237 |
|
242 | | - ;; Result has leading NUL char - JS side will slice it off |
243 | | - (local.get $result)) |
| 238 | + ;; Return number of UTF-16 code units written |
| 239 | + (i32.shr_u (i32.sub (local.get $outPos) (global.get $utf16_offset)) (i32.const 1))) |
244 | 240 | ) |
0 commit comments